In [2]:
import pandas as pd
import contractions
import re
import nltk
nltk.download('stopwords')

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
d = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Test.csv')
sample = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Sample%20Solution.csv')

In [4]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.1+ KB


In [5]:
d = d.drop('ArticleId',axis=1)

In [6]:
d

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business
...,...,...
1485,double eviction from big brother model caprice...,entertainment
1486,dj double act revamp chart show dj duo jk and ...,entertainment
1487,weak dollar hits reuters revenues at media gro...,business
1488,apple ipod family expands market apple has exp...,tech


In [7]:
def txtprocess(txt):
    txt = str(txt).lower()
    txt = contractions.fix(txt)

    txt = re.sub(r'[^a-zA-Z]', ' ', txt)
    txt = re.sub(' +', ' ', txt)

    txt = ' '.join(txt.split())

    return txt

d['Text'] = d['Text'].apply(txtprocess)

In [8]:
stop_words = set(nltk.corpus.stopwords.words('english'))

# kayak you'll gitu masih ada ' nya , apa bagusnya sebelum txtprocess, tapi kecil semua sih
def remove_stopwords(txt):
    no_stopword_txt = [w for w in txt.split() if not w in stop_words]
    return ' '.join(no_stopword_txt)

d['Text'] = d['Text'].apply(lambda x: remove_stopwords(x))

In [None]:
# Alternative but just not

In [None]:
# def remove_stopwords(txt):
#     no_stopword_txt = [w for w in txt.split() if not w in stop_words]
#     return ' '.join(no_stopword_txt)

# def txtprocess(txt):
#     txt = str(txt).lower()
#     txt = contractions.fix(txt)
#     txt = remove_stopwords(txt)

#     txt = re.sub(r'[^a-zA-Z]', ' ', txt)
#     txt = re.sub(' +', ' ', txt)

#     txt = ' '.join(txt.split())

#     return txt

# d['Text'] = d['Text'].apply(txtprocess)

In [9]:
category = pd.get_dummies(d.Category, dtype=int)
d_new = pd.concat([d, category], axis=1)
d_new = d_new.drop('Category', axis=1)
d_new

Unnamed: 0,Text,business,entertainment,politics,sport,tech
0,worldcom ex boss launches defence lawyers defe...,1,0,0,0,0
1,german business confidence slides german busin...,1,0,0,0,0
2,bbc poll indicates economic gloom citizens maj...,1,0,0,0,0
3,lifestyle governs mobile choice faster better ...,0,0,0,0,1
4,enron bosses payout eighteen former enron dire...,1,0,0,0,0
...,...,...,...,...,...,...
1485,double eviction big brother model caprice holb...,0,1,0,0,0
1486,dj double act revamp chart show dj duo jk joel...,0,1,0,0,0
1487,weak dollar hits reuters revenues media group ...,1,0,0,0,0
1488,apple ipod family expands market apple expande...,0,0,0,0,1


In [10]:
article = d_new['Text'].values
label = d_new[category.columns].values

In [13]:
from tensorflow.keras.layers import TextVectorization
vectorizer = TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation"
)

vectorizer.adapt(article)

padded = vectorizer(article)
padded = padded.numpy()
padded_train, padded_test, y_train, y_test = train_test_split(padded, label, test_size=0.2, random_state=42)

In [14]:
padded_train,y_train

(array([[9165,  670, 5060, ...,    0,    0,    0],
        [ 416, 2253, 3888, ...,    0,    0,    0],
        [ 658,  744, 1263, ...,    0,    0,    0],
        ...,
        [ 341, 2902,  266, ...,    0,    0,    0],
        [1586,   35, 1352, ...,    0,    0,    0],
        [1421,  980, 1808, ...,    0,    0,    0]]),
 array([[0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0],
        ...,
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0]]))

In [None]:
len(vectorizer.get_vocabulary())

23395

In [15]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(vectorizer.get_vocabulary())+1, output_dim=500, mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(5, activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
set_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=15)

history = model.fit(padded_train, y_train, epochs=60, batch_size=128,
                   validation_data=(padded_test, y_test), callbacks=[set_callback], verbose=2)

Epoch 1/60
10/10 - 6s - 634ms/step - accuracy: 0.3742 - loss: 1.5878 - val_accuracy: 0.5638 - val_loss: 1.5406
Epoch 2/60
10/10 - 1s - 146ms/step - accuracy: 0.8364 - loss: 1.3462 - val_accuracy: 0.6208 - val_loss: 1.1057
Epoch 3/60
10/10 - 1s - 62ms/step - accuracy: 0.8574 - loss: 0.6738 - val_accuracy: 0.8826 - val_loss: 0.5956
Epoch 4/60
10/10 - 1s - 59ms/step - accuracy: 0.9748 - loss: 0.3118 - val_accuracy: 0.9430 - val_loss: 0.3237
Epoch 5/60
10/10 - 1s - 60ms/step - accuracy: 0.9883 - loss: 0.1369 - val_accuracy: 0.9463 - val_loss: 0.2629
Epoch 6/60
10/10 - 1s - 91ms/step - accuracy: 0.9916 - loss: 0.0733 - val_accuracy: 0.9564 - val_loss: 0.2383
Epoch 7/60
10/10 - 1s - 123ms/step - accuracy: 0.9975 - loss: 0.0371 - val_accuracy: 0.9698 - val_loss: 0.1615
Epoch 8/60
10/10 - 1s - 97ms/step - accuracy: 0.9983 - loss: 0.0246 - val_accuracy: 0.9732 - val_loss: 0.1405
Epoch 9/60
10/10 - 1s - 108ms/step - accuracy: 0.9983 - loss: 0.0174 - val_accuracy: 0.9799 - val_loss: 0.1246
Epoch 

In [17]:
model.evaluate(padded_test, y_test)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9583 - loss: 0.1667


[0.15384170413017273, 0.963087260723114]

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(vectorizer.get_vocabulary())+1, output_dim=500, mask_zero=True),
    tf.keras.layers.LSTM(16, dropout=0.9),
    tf.keras.layers.Dense(5,activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

X_all = np.concatenate((padded_train, padded_test))
y_all = np.concatenate((y_train, y_test))

class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('loss')<=history.history['loss'][-1]):
            print('Stopped, reach same losses as splitted trial')
            self.model.stop_training = True
set_callback = myCallback()

history = model.fit(X_all, y_all, epochs=60, batch_size=128,
                    callbacks=[set_callback], verbose=2)

Epoch 1/60
12/12 - 119s - 10s/step - accuracy: 0.2007 - loss: 1.6118
Epoch 2/60
12/12 - 2s - 196ms/step - accuracy: 0.2248 - loss: 1.6111
Epoch 3/60
12/12 - 1s - 100ms/step - accuracy: 0.2242 - loss: 1.6068
Epoch 4/60
12/12 - 1s - 100ms/step - accuracy: 0.2195 - loss: 1.6143
Epoch 5/60
12/12 - 1s - 99ms/step - accuracy: 0.2356 - loss: 1.6031
Epoch 6/60
12/12 - 1s - 100ms/step - accuracy: 0.2302 - loss: 1.6076
Epoch 7/60
12/12 - 1s - 100ms/step - accuracy: 0.2262 - loss: 1.6064
Epoch 8/60
12/12 - 1s - 100ms/step - accuracy: 0.2208 - loss: 1.6048
Epoch 9/60
12/12 - 1s - 107ms/step - accuracy: 0.2396 - loss: 1.6091
Epoch 10/60
12/12 - 1s - 112ms/step - accuracy: 0.2215 - loss: 1.6085
Epoch 11/60
12/12 - 2s - 202ms/step - accuracy: 0.2188 - loss: 1.6053
Epoch 12/60
12/12 - 1s - 105ms/step - accuracy: 0.2242 - loss: 1.6054
Epoch 13/60
12/12 - 1s - 105ms/step - accuracy: 0.2289 - loss: 1.6030
Epoch 14/60
12/12 - 1s - 106ms/step - accuracy: 0.2235 - loss: 1.6077
Epoch 15/60
12/12 - 1s - 104ms

In [None]:
sample

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,business
3,459,entertainment
4,1020,politics
...,...,...
730,1923,sport
731,373,tech
732,1704,business
733,206,entertainment


In [26]:
test['Text'] = test['Text'].apply(txtprocess)
test['Text'] = test['Text'].apply(lambda x: remove_stopwords(x))

testtext = test['Text'].values

paddedtesttext = vectorizer(testtext)
paddedtesttext = paddedtesttext.numpy()

In [27]:
answer=np.argmax(model.predict(paddedtesttext), axis=-1)
answer=pd.DataFrame(answer, columns=['Category'])

mapping=dict(enumerate(category.columns))
answer.Category=answer.Category.map(mapping)

answer=pd.concat((test.ArticleId, answer),axis=1)
answer

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step


Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,sport
3,459,business
4,1020,sport
...,...,...
730,1923,business
731,373,entertainment
732,1704,business
733,206,business
