In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

!pip install contractions
import re
import contractions
import nltk
nltk.download('stopwords')

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Training set, unlabeled test set and sample submission, fetched in that order.
d, test, sample = (
    pd.read_csv(url)
    for url in (
        'https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Train.csv',
        'https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Test.csv',
        'https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Sample%20Solution.csv',
    )
)

In [5]:
# Summarise the training frame: column names, non-null counts and dtypes.
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.1+ KB


In [6]:
# ArticleId is not used for training, so drop it. errors='ignore' keeps this
# cell safe to re-run after the column has already been removed.
d = d.drop(columns='ArticleId', errors='ignore')

In [7]:
# Preview the training frame (Text and Category columns).
d

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business
...,...,...
1485,double eviction from big brother model caprice...,entertainment
1486,dj double act revamp chart show dj duo jk and ...,entertainment
1487,weak dollar hits reuters revenues at media gro...,business
1488,apple ipod family expands market apple has exp...,tech


In [None]:
# Cleaning the texts
def txtprocess(txt):
    """Normalise one article into lowercase, letters-only, single-spaced text.

    The input is coerced to ``str``, lowercased, passed through
    ``contractions.fix``, stripped of every non-letter character and finally
    re-joined so that words are separated by exactly one space with no
    leading or trailing whitespace.
    """
    lowered = str(txt).lower()
    expanded = contractions.fix(lowered)

    # Anything that is not an ASCII letter becomes a space.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', expanded)

    # split()/join collapses runs of spaces and trims both ends in one step.
    return ' '.join(letters_only.split())

# Clean every training article with txtprocess
d['Text'] = d['Text'].map(txtprocess)

In [None]:
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))
# Disabled experiment: uncommenting these lines would keep 'no', 'not' and
# 'but' in the text instead of filtering them out.
# stop_words.remove('no')
# stop_words.remove('not')
# stop_words.remove('but')

def remove_stopwords(txt, stopwords=None):
    """Drop stop words from a whitespace-separated string.

    Args:
        txt: Text whose tokens are separated by whitespace.
        stopwords: Optional collection of words to remove. When omitted,
            the notebook-level ``stop_words`` set is used.

    Returns:
        The remaining tokens, in their original order, joined by single
        spaces. An empty or all-stop-word input yields an empty string.
    """
    if stopwords is None:
        # Fall back to the set built earlier in the notebook.
        stopwords = stop_words
    return ' '.join(w for w in txt.split() if w not in stopwords)

# Strip stop words from every cleaned training article.
d['Text'] = d['Text'].apply(remove_stopwords)

In [12]:
# One integer indicator column per news category.
category = pd.get_dummies(d['Category'], dtype=int)
# Swap the string label for its indicator columns, keeping Text first.
d_new = pd.concat([d.drop(columns='Category'), category], axis=1)
d_new

Unnamed: 0,Text,business,entertainment,politics,sport,tech
0,worldcom ex boss launches defence lawyers defe...,1,0,0,0,0
1,german business confidence slides german busin...,1,0,0,0,0
2,bbc poll indicates economic gloom citizens maj...,1,0,0,0,0
3,lifestyle governs mobile choice faster better ...,0,0,0,0,1
4,enron bosses payout eighteen former enron dire...,1,0,0,0,0
...,...,...,...,...,...,...
1485,double eviction big brother model caprice holb...,0,1,0,0,0
1486,dj double act revamp chart show dj duo jk joel...,0,1,0,0,0
1487,weak dollar hits reuters revenues media group ...,1,0,0,0,0
1488,apple ipod family expands market apple expande...,0,0,0,0,1


In [13]:
# Model inputs (cleaned text) and one-hot targets as plain NumPy arrays.
article = d_new['Text'].to_numpy()
label = d_new.loc[:, category.columns].to_numpy()

In [14]:
# Learn the vocabulary from every cleaned article; '-' stands in for unseen words.
tokenizer = Tokenizer(oov_token='-')
tokenizer.fit_on_texts(article)

# Integer-encode each article, then pad all of them to one common length.
sekuens = tokenizer.texts_to_sequences(article)
padded = pad_sequences(sekuens)

# Hold out 20% of the rows for validation; the seed is fixed for repeatability.
padded_train, padded_test, y_train, y_test = train_test_split(
    padded,
    label,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Inspect the padded training matrix; the zeros at the start of each row are padding.
padded_train

array([[    0,     0,     0, ...,  1883,  1895,   556],
       [    0,     0,     0, ...,  3654, 14699, 14700],
       [    0,     0,     0, ...,    81,   564,    81],
       ...,
       [    0,     0,     0, ...,    36,     5,  1853],
       [    0,     0,     0, ...,   554,  3493,   167],
       [    0,     0,     0, ...,   579,  2884, 10704]], dtype=int32)

In [18]:
# Show the first 30 entries of the tokenizer's word -> index mapping.
print({word: index for word, index in list(tokenizer.word_index.items())[:30]})

{'-': 1, 'said': 2, 'mr': 3, 'would': 4, 'year': 5, 'also': 6, 'new': 7, 'people': 8, 'us': 9, 'one': 10, 'could': 11, 'first': 12, 'last': 13, 'two': 14, 'time': 15, 'world': 16, 'uk': 17, 'government': 18, 'film': 19, 'years': 20, 'best': 21, 'bn': 22, 'make': 23, 'told': 24, 'three': 25, 'made': 26, 'get': 27, 'game': 28, 'many': 29, 'back': 30}


In [19]:
# Number of entries in the word index (this count includes the '-' OOV token).
len(tokenizer.word_index)

23392

In [20]:
# Embedding -> LSTM -> softmax over the five categories.
model = tf.keras.Sequential()
# +1 because word indices start at 1 and index 0 is used for padding.
model.add(tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=500))
model.add(tf.keras.layers.LSTM(16, dropout=0.9))
model.add(tf.keras.layers.Dense(5, activation='softmax'))
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# Stop once validation loss has failed to improve for 15 consecutive epochs.
set_callback = tf.keras.callbacks.EarlyStopping(patience=15, monitor='val_loss')

history = model.fit(
    x=padded_train,
    y=y_train,
    validation_data=(padded_test, y_test),
    batch_size=128,
    epochs=60,
    callbacks=[set_callback],
    verbose=2,
)

Epoch 1/60
10/10 - 8s - 835ms/step - accuracy: 0.2450 - loss: 1.6050 - val_accuracy: 0.2685 - val_loss: 1.6020
Epoch 2/60
10/10 - 5s - 531ms/step - accuracy: 0.2953 - loss: 1.5890 - val_accuracy: 0.2383 - val_loss: 1.5887
Epoch 3/60
10/10 - 1s - 111ms/step - accuracy: 0.3826 - loss: 1.5579 - val_accuracy: 0.2517 - val_loss: 1.5678
Epoch 4/60
10/10 - 1s - 107ms/step - accuracy: 0.4144 - loss: 1.5108 - val_accuracy: 0.2718 - val_loss: 1.5230
Epoch 5/60
10/10 - 1s - 125ms/step - accuracy: 0.4480 - loss: 1.4119 - val_accuracy: 0.3423 - val_loss: 1.3977
Epoch 6/60
10/10 - 1s - 107ms/step - accuracy: 0.5159 - loss: 1.2262 - val_accuracy: 0.4329 - val_loss: 1.2116
Epoch 7/60
10/10 - 1s - 112ms/step - accuracy: 0.6015 - loss: 1.0957 - val_accuracy: 0.5000 - val_loss: 1.1281
Epoch 8/60
10/10 - 1s - 121ms/step - accuracy: 0.6862 - loss: 0.9780 - val_accuracy: 0.6376 - val_loss: 1.0367
Epoch 9/60
10/10 - 1s - 133ms/step - accuracy: 0.8070 - loss: 0.8565 - val_accuracy: 0.7181 - val_loss: 0.9457
E

In [25]:
# Loss and accuracy of the trained model on the held-out 20% split.
model.evaluate(padded_test, y_test)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.9108 - loss: 0.3066


[0.31240400671958923, 0.9127516746520996]

In [30]:
# Fresh, untrained copy of the same architecture for the final fit.
model = tf.keras.Sequential()
# +1 because word indices start at 1 and index 0 is used for padding.
model.add(tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=500))
model.add(tf.keras.layers.LSTM(16, dropout=0.9))
model.add(tf.keras.layers.Dense(5, activation='softmax'))
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Recombine the train and validation rows so the final model sees every labelled article.
X_all = np.concatenate([padded_train, padded_test], axis=0)
y_all = np.concatenate([y_train, y_test], axis=0)

class myCallback(tf.keras.callbacks.Callback):
    """Stop training once the epoch's training loss reaches a target value.

    Args:
        target_loss: Loss at or below which training stops. When omitted,
            the final training loss recorded in the notebook-level
            ``history`` object is used.
    """

    def __init__(self, target_loss=None):
        super().__init__()
        # Capture the target once, at construction, rather than re-reading
        # the global `history` on every epoch.
        if target_loss is None:
            target_loss = history.history['loss'][-1]
        self.target_loss = target_loss

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when this epoch's loss is at or below the target."""
        # `logs` may be None or lack a 'loss' entry; comparing None with a
        # float would raise TypeError, so skip the check in that case.
        current_loss = (logs or {}).get('loss')
        if current_loss is not None and current_loss <= self.target_loss:
            print('Stopped, reach same losses as splitted trial')
            self.model.stop_training = True
set_callback = myCallback()

# Train on every labelled article; there is no validation split here, so the
# custom callback is the only early exit.
history = model.fit(
    x=X_all,
    y=y_all,
    batch_size=128,
    epochs=60,
    callbacks=[set_callback],
    verbose=2,
)

Epoch 1/60
12/12 - 3s - 271ms/step - accuracy: 0.2383 - loss: 1.6052
Epoch 2/60
12/12 - 2s - 205ms/step - accuracy: 0.3523 - loss: 1.5790
Epoch 3/60
12/12 - 1s - 100ms/step - accuracy: 0.5060 - loss: 1.5273
Epoch 4/60
12/12 - 1s - 98ms/step - accuracy: 0.6282 - loss: 1.4091
Epoch 5/60
12/12 - 1s - 105ms/step - accuracy: 0.5980 - loss: 1.2434
Epoch 6/60
12/12 - 1s - 105ms/step - accuracy: 0.7329 - loss: 1.1068
Epoch 7/60
12/12 - 1s - 105ms/step - accuracy: 0.7711 - loss: 0.9632
Epoch 8/60
12/12 - 1s - 98ms/step - accuracy: 0.8611 - loss: 0.7728
Epoch 9/60
12/12 - 1s - 106ms/step - accuracy: 0.9195 - loss: 0.6352
Epoch 10/60
12/12 - 1s - 123ms/step - accuracy: 0.9470 - loss: 0.5093
Epoch 11/60
12/12 - 2s - 196ms/step - accuracy: 0.9698 - loss: 0.4137
Epoch 12/60
12/12 - 1s - 101ms/step - accuracy: 0.9779 - loss: 0.3349
Epoch 13/60
12/12 - 1s - 107ms/step - accuracy: 0.9792 - loss: 0.2666
Epoch 14/60
12/12 - 1s - 98ms/step - accuracy: 0.9779 - loss: 0.2294
Epoch 15/60
12/12 - 1s - 114ms/s

In [31]:
# Look at the sample-solution frame (ArticleId, Category columns).
sample

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,business
3,459,entertainment
4,1020,politics
...,...,...
730,1923,sport
731,373,tech
732,1704,business
733,206,entertainment


In [35]:
# Clean the test articles with the same two steps used on the training text.
test['Text'] = test['Text'].apply(txtprocess)
test['Text'] = test['Text'].apply(remove_stopwords)

testtext = test['Text'].values
# Encode with the tokenizer fitted on the training articles; unseen words map to the OOV token.
sekuenstesttext = tokenizer.texts_to_sequences(testtext)
# Pad/truncate to the width of the training matrix. Without maxlen the test
# set would be padded to its own longest article, so the model would receive
# inputs with a different length and amount of leading padding than it was
# trained on.
paddedtesttext = pad_sequences(sekuenstesttext, maxlen=padded.shape[1])

In [37]:
# Index of the highest-probability class for every test article.
predicted_idx = model.predict(paddedtesttext).argmax(axis=-1)

# The column order of the one-hot frame defines which index means which category.
mapping = dict(enumerate(category.columns))
answer = pd.DataFrame({'Category': pd.Series(predicted_idx).map(mapping)})

# Put the article ids alongside the predicted category names.
answer = pd.concat([test['ArticleId'], answer], axis=1)
answer

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step


Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,sport
3,459,business
4,1020,sport
...,...,...
730,1923,business
731,373,entertainment
732,1704,business
733,206,business


In [None]:
# Write the predictions to CSV; index=False leaves the row index out of the file.
answer.to_csv('bbc-ai-answer.csv', index=False)