<a href="https://colab.research.google.com/github/alvinrach/learn-ai-bbc/blob/main/Kaggle_NLP_LSTM_BBC_News_Article_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
import numpy as np
import tensorflow as tf

!pip install contractions
import re
import contractions
import nltk
nltk.download('stopwords')

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Source CSVs hosted in the project's GitHub repository: the training
# articles, the test articles, and the sample solution file.
TRAIN_URL = 'https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Train.csv'
TEST_URL = 'https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Test.csv'
SAMPLE_URL = 'https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Sample%20Solution.csv'

d = pd.read_csv(TRAIN_URL)
test = pd.read_csv(TEST_URL)
sample = pd.read_csv(SAMPLE_URL)

In [28]:
# Show column names, dtypes and non-null counts of the training frame.
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [29]:
# ArticleId is only an identifier, so it is dropped before modelling.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally is rejected by pandas >= 2.0.
d = d.drop(columns='ArticleId')

In [30]:
# Display the training frame (Text and Category remain after the drop).
d

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business
...,...,...
1485,double eviction from big brother model caprice...,entertainment
1486,dj double act revamp chart show dj duo jk and ...,entertainment
1487,weak dollar hits reuters revenues at media gro...,business
1488,apple ipod family expands market apple has exp...,tech


In [31]:
# Cleaning the texts
def txtprocess(txt):
    """Normalize one article to lower-case, letters-only, single-spaced text.

    The input is coerced to ``str``, lower-cased and passed through
    ``contractions.fix``. Every character outside a-z/A-Z is then replaced
    by a space, and whitespace is collapsed and trimmed.
    """
    lowered = str(txt).lower()
    expanded = contractions.fix(lowered)

    # Anything that is not a letter (digits, punctuation, symbols) becomes a space.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', expanded)
    # Squash runs of spaces, then trim/normalise any remaining whitespace.
    single_spaced = re.sub(' +', ' ', letters_only)

    return ' '.join(single_spaced.split())

# Clean every training article with txtprocess, replacing the Text column.
d['Text'] = d['Text'].map(txtprocess)

In [32]:
# NLTK's English stopword list, held as a set for fast membership checks.
# No words are exempted: 'no', 'not' and 'but' are treated like any other
# entry of the list.
english_stopwords = nltk.corpus.stopwords.words('english')
stop_words = set(english_stopwords)

def remove_stopwords(txt, stopwords=None):
    """Return ``txt`` with stopwords removed and tokens joined by single spaces.

    Parameters
    ----------
    txt : str
        Whitespace-separated text (already lower-cased/cleaned by the caller).
    stopwords : collection of str, optional
        Words to drop. When omitted, the module-level ``stop_words`` set is
        used, which keeps existing one-argument calls working unchanged.

    Returns
    -------
    str
        The remaining words in their original order, separated by one space.
    """
    if stopwords is None:
        # Fall back to the notebook-wide set so callers need not pass it.
        stopwords = stop_words
    return ' '.join(w for w in txt.split() if w not in stopwords)

# Strip stopwords from every cleaned training article.
d['Text'] = d['Text'].apply(remove_stopwords)

In [33]:
# One-hot encode the Category column (one indicator column per class).
# dtype is pinned to uint8 so the labels stay 0/1 integers; pandas >= 2.0
# would otherwise produce boolean columns.
category = pd.get_dummies(d.Category, dtype='uint8')
d_new = pd.concat([d, category], axis=1)
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
d_new = d_new.drop(columns='Category')
d_new

Unnamed: 0,Text,business,entertainment,politics,sport,tech
0,worldcom ex boss launches defence lawyers defe...,1,0,0,0,0
1,german business confidence slides german busin...,1,0,0,0,0
2,bbc poll indicates economic gloom citizens maj...,1,0,0,0,0
3,lifestyle governs mobile choice faster better ...,0,0,0,0,1
4,enron bosses payout eighteen former enron dire...,1,0,0,0,0
...,...,...,...,...,...,...
1485,double eviction big brother model caprice holb...,0,1,0,0,0
1486,dj double act revamp chart show dj duo jk joel...,0,1,0,0,0
1487,weak dollar hits reuters revenues media group ...,1,0,0,0,0
1488,apple ipod family expands market apple expande...,0,0,0,0,1


In [34]:
# Split the frame into model inputs (cleaned text) and one-hot targets,
# both as NumPy arrays.
article = d_new['Text'].to_numpy()
label = d_new.loc[:, category.columns].to_numpy()

In [35]:
# Build the vocabulary from the cleaned articles. num_words=None puts no cap
# on the vocabulary; '-' is the token used for out-of-vocabulary words.
# NOTE(review): the tokenizer is fitted on ALL articles before the split
# below, so the held-out articles contribute to the vocabulary.
tokenizer = Tokenizer(num_words=None, oov_token='-')
tokenizer.fit_on_texts(article)

# Turn each article into a list of integer word ids.
sekuens = tokenizer.texts_to_sequences(article)

# No maxlen is passed, so every sequence is padded to one common length.
padded = pad_sequences(sekuens)

# Hold out 20% of the articles; random_state fixes the shuffle for repeatability.
padded_train, padded_test, y_train, y_test = train_test_split(padded, label, test_size=0.2, random_state=42)

In [36]:
# Inspect the padded training sequences (integer word ids, zero-padded).
padded_train

array([[    0,     0,     0, ...,  1882,  1894,   556],
       [    0,     0,     0, ...,  3655, 14703, 14704],
       [    0,     0,     0, ...,    81,   564,    81],
       ...,
       [    0,     0,     0, ...,    36,     5,  1852],
       [    0,     0,     0, ...,   554,  3494,   167],
       [    0,     0,     0, ...,   579,  2885, 10705]], dtype=int32)

In [37]:
# Length of a single padded sequence (row 1 of the padded array).
len(padded[1])

1643

In [38]:
# Number of padded sequences, i.e. one per article.
len(padded)

1490

In [39]:
# Peek at the first 30 entries of the tokenizer's word -> index mapping.
first_entries = list(tokenizer.word_index.items())[:30]
print({word: idx for word, idx in first_entries})

{'-': 1, 'said': 2, 'mr': 3, 'would': 4, 'year': 5, 'also': 6, 'new': 7, 'people': 8, 'us': 9, 'one': 10, 'could': 11, 'first': 12, 'last': 13, 'two': 14, 'time': 15, 'world': 16, 'uk': 17, 'government': 18, 'film': 19, 'years': 20, 'best': 21, 'bn': 22, 'make': 23, 'told': 24, 'three': 25, 'made': 26, 'get': 27, 'game': 28, 'many': 29, 'back': 30}


In [40]:
# Number of entries in the tokenizer's word index (includes the OOV token).
len(tokenizer.word_index)

23397

In [41]:
# Sequence length in the training split.
len(padded_train[1])

1643

In [42]:
# Sequence length in the held-out split; should equal the training length.
len(padded_test[1])

1643

In [43]:
# Embedding -> LSTM -> softmax classifier over the five one-hot categories.
# +1 on the vocabulary size because word ids start at 1 and 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=500))
model.add(tf.keras.layers.LSTM(16, dropout=0.9))
model.add(tf.keras.layers.Dense(5, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [44]:
# Stop once val_loss has gone 15 epochs without improving.
set_callback = tf.keras.callbacks.EarlyStopping(patience=15, monitor='val_loss')

# Train on the 80% split and validate on the held-out 20% after each epoch.
history = model.fit(
    x=padded_train,
    y=y_train,
    batch_size=128,
    epochs=60,
    validation_data=(padded_test, y_test),
    callbacks=[set_callback],
    verbose=2,
)

Epoch 1/60
10/10 - 6s - loss: 1.6030 - accuracy: 0.2676 - val_loss: 1.5921 - val_accuracy: 0.3221
Epoch 2/60
10/10 - 4s - loss: 1.5717 - accuracy: 0.3515 - val_loss: 1.5659 - val_accuracy: 0.3658
Epoch 3/60
10/10 - 4s - loss: 1.5223 - accuracy: 0.4161 - val_loss: 1.5115 - val_accuracy: 0.3826
Epoch 4/60
10/10 - 4s - loss: 1.4111 - accuracy: 0.4732 - val_loss: 1.3608 - val_accuracy: 0.4698
Epoch 5/60
10/10 - 4s - loss: 1.2070 - accuracy: 0.6007 - val_loss: 1.1870 - val_accuracy: 0.5705
Epoch 6/60
10/10 - 4s - loss: 1.0489 - accuracy: 0.6770 - val_loss: 1.0658 - val_accuracy: 0.5906
Epoch 7/60
10/10 - 4s - loss: 0.9175 - accuracy: 0.7248 - val_loss: 0.9909 - val_accuracy: 0.5839
Epoch 8/60
10/10 - 4s - loss: 0.7682 - accuracy: 0.7970 - val_loss: 0.8814 - val_accuracy: 0.7114
Epoch 9/60
10/10 - 4s - loss: 0.6257 - accuracy: 0.9086 - val_loss: 0.7287 - val_accuracy: 0.7953
Epoch 10/60
10/10 - 4s - loss: 0.5345 - accuracy: 0.9228 - val_loss: 0.6760 - val_accuracy: 0.7651
Epoch 11/60
10/10 -

In [45]:
# Loss and accuracy of the trained model on the held-out split.
model.evaluate(padded_test, y_test)



[0.454992413520813, 0.899328887462616]

In [50]:
# Fresh, untrained copy of the same Embedding -> LSTM -> softmax architecture,
# to be fitted on all labelled articles.
# +1 on the vocabulary size because word ids start at 1 and 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=500))
model.add(tf.keras.layers.LSTM(16, dropout=0.9))
model.add(tf.keras.layers.Dense(5, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Recombine the two splits so the final model is trained on every labelled article.
X_all = np.concatenate([padded_train, padded_test], axis=0)
y_all = np.concatenate([y_train, y_test], axis=0)

class myCallback(tf.keras.callbacks.Callback):
    """Stop training once the epoch loss reaches a target training loss.

    Parameters
    ----------
    target_loss : float, optional
        Loss value at or below which training stops. When omitted, the final
        training loss of the run currently stored in the global ``history``
        is captured at construction time, so the threshold cannot change if
        ``history`` is rebound later.
    """

    def __init__(self, target_loss=None):
        super().__init__()
        if target_loss is None:
            # Last recorded training loss of the previous (split) run.
            target_loss = history.history['loss'][-1]
        self.target_loss = target_loss

    def on_epoch_end(self, epoch, logs=None):
        # `logs` may be None or lack 'loss'; skip the check instead of
        # comparing None with a float.
        loss = (logs or {}).get('loss')
        if loss is not None and loss <= self.target_loss:
            print("Stopped, reach same losses as splitted trial")
            self.model.stop_training = True


# Instantiate before the next fit so the threshold comes from the split run.
set_callback = myCallback()

# Train on all labelled articles; there is no validation set here, so
# stopping is left to the loss-threshold callback.
history = model.fit(
    x=X_all,
    y=y_all,
    batch_size=128,
    epochs=60,
    callbacks=[set_callback],
    verbose=2,
)

Epoch 1/60
12/12 - 5s - loss: 1.6077 - accuracy: 0.2101
Epoch 2/60
12/12 - 4s - loss: 1.5776 - accuracy: 0.3537
Epoch 3/60
12/12 - 4s - loss: 1.5325 - accuracy: 0.4302
Epoch 4/60
12/12 - 4s - loss: 1.4219 - accuracy: 0.4309
Epoch 5/60
12/12 - 4s - loss: 1.2057 - accuracy: 0.6409
Epoch 6/60
12/12 - 4s - loss: 1.0611 - accuracy: 0.7866
Epoch 7/60
12/12 - 4s - loss: 0.9155 - accuracy: 0.9248
Epoch 8/60
12/12 - 4s - loss: 0.7775 - accuracy: 0.9685
Epoch 9/60
12/12 - 4s - loss: 0.6646 - accuracy: 0.9832
Epoch 10/60
12/12 - 4s - loss: 0.5505 - accuracy: 0.9859
Epoch 11/60
12/12 - 4s - loss: 0.4379 - accuracy: 0.9906
Epoch 12/60
12/12 - 4s - loss: 0.3481 - accuracy: 0.9933
Epoch 13/60
12/12 - 4s - loss: 0.2756 - accuracy: 0.9926
Epoch 14/60
12/12 - 4s - loss: 0.2222 - accuracy: 0.9953
Epoch 15/60
12/12 - 4s - loss: 0.1922 - accuracy: 0.9926
Epoch 16/60
12/12 - 4s - loss: 0.1546 - accuracy: 0.9993
Epoch 17/60
12/12 - 4s - loss: 0.1337 - accuracy: 0.9973
Epoch 18/60
12/12 - 4s - loss: 0.1139 - 

In [51]:
# Display the sample solution frame to see the expected submission columns.
sample

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,business
3,459,entertainment
4,1020,politics
...,...,...
730,1923,sport
731,373,tech
732,1704,business
733,206,entertainment


In [52]:
# Apply the same cleaning and stopword removal used for the training articles.
test['Text'] = test['Text'].apply(txtprocess)
test['Text'] = test['Text'].apply(remove_stopwords)
testtext = test['Text'].values

# Encode with the tokenizer fitted on the training articles.
sekuenstesttext = tokenizer.texts_to_sequences(testtext)

# Pad (or truncate) to the sequence length the model was trained on, so
# inference inputs have the same shape as the training inputs instead of
# whatever the longest test article happens to be.
paddedtesttext = pad_sequences(sekuenstesttext, maxlen=padded.shape[1])

In [53]:
# Index of the highest-probability class for each test article.
answer = np.argmax(model.predict(paddedtesttext), axis=-1)
answer = pd.DataFrame(answer, columns=['Category'])

# Map class indices back to category names, using the one-hot column order.
mapping = dict(enumerate(category.columns))
answer.Category = answer.Category.map(mapping)

# `axis` must be passed by keyword: pandas >= 2.0 rejects it positionally.
answer = pd.concat((test.ArticleId, answer), axis=1)
answer

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,sport
3,459,business
4,1020,sport
...,...,...
730,1923,business
731,373,entertainment
732,1704,politics
733,206,business


In [54]:
# Write the predictions to CSV; index=False leaves out the row index column.
answer.to_csv('bbc-ai-answer.csv', index=False)