In [2]:
import pandas as pd
import contractions
import re
import nltk
nltk.download('stopwords')

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
d = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Test.csv')
sample = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Sample%20Solution.csv')

In [4]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.1+ KB


In [5]:
d = d.drop('ArticleId',axis=1)

In [6]:
d

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business
...,...,...
1485,double eviction from big brother model caprice...,entertainment
1486,dj double act revamp chart show dj duo jk and ...,entertainment
1487,weak dollar hits reuters revenues at media gro...,business
1488,apple ipod family expands market apple has exp...,tech


In [7]:
def txtprocess(txt):
    txt = str(txt).lower()
    txt = contractions.fix(txt)

    txt = re.sub(r'[^a-zA-Z]', ' ', txt)
    txt = re.sub(' +', ' ', txt)

    txt = ' '.join(txt.split())

    return txt

d['Text'] = d['Text'].apply(txtprocess)

In [8]:
stop_words = set(nltk.corpus.stopwords.words('english'))

# kayak you'll gitu masih ada ' nya , apa bagusnya sebelum txtprocess, tapi kecil semua sih
def remove_stopwords(txt):
    no_stopword_txt = [w for w in txt.split() if not w in stop_words]
    return ' '.join(no_stopword_txt)

d['Text'] = d['Text'].apply(lambda x: remove_stopwords(x))

In [9]:
# Alternative but just not

In [None]:
# def remove_stopwords(txt):
#     no_stopword_txt = [w for w in txt.split() if not w in stop_words]
#     return ' '.join(no_stopword_txt)

# def txtprocess(txt):
#     txt = str(txt).lower()
#     txt = contractions.fix(txt)
#     txt = remove_stopwords(txt)

#     txt = re.sub(r'[^a-zA-Z]', ' ', txt)
#     txt = re.sub(' +', ' ', txt)

#     txt = ' '.join(txt.split())

#     return txt

# d['Text'] = d['Text'].apply(txtprocess)

In [10]:
category = pd.get_dummies(d.Category, dtype=int)
d_new = pd.concat([d, category], axis=1)
d_new = d_new.drop('Category', axis=1)
d_new

Unnamed: 0,Text,business,entertainment,politics,sport,tech
0,worldcom ex boss launches defence lawyers defe...,1,0,0,0,0
1,german business confidence slides german busin...,1,0,0,0,0
2,bbc poll indicates economic gloom citizens maj...,1,0,0,0,0
3,lifestyle governs mobile choice faster better ...,0,0,0,0,1
4,enron bosses payout eighteen former enron dire...,1,0,0,0,0
...,...,...,...,...,...,...
1485,double eviction big brother model caprice holb...,0,1,0,0,0
1486,dj double act revamp chart show dj duo jk joel...,0,1,0,0,0
1487,weak dollar hits reuters revenues media group ...,1,0,0,0,0
1488,apple ipod family expands market apple expande...,0,0,0,0,1


In [11]:
article = d_new['Text'].values
label = d_new[category.columns].values

In [13]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize directly (no fit_on_texts needed - it's already pretrained!)
tokens = tokenizer(article.tolist(), padding=True, truncation=True, return_tensors='pt')

# Split the tokenized data
padded_train, padded_test, y_train, y_test = train_test_split(
    tokens['input_ids'], label, test_size=0.2, random_state=42
)

In [None]:
padded_train, y_train

(tensor([[  101, 11865,  6562,  ...,     0,     0,     0],
         [  101,  4121,  5481,  ...,     0,     0,     0],
         [  101,  7206,  3404,  ...,     0,     0,     0],
         ...,
         [  101, 23413,  2229,  ...,     0,     0,     0],
         [  101,  3153,  2189,  ...,     0,     0,     0],
         [  101,  3306,  3940,  ...,     0,     0,     0]]),
 array([[0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0],
        ...,
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0]]))

In [None]:
tokenizer.vocab_size

30522

In [14]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=tokenizer.vocab_size+1, output_dim=500, mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(5, activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
set_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=15)

history = model.fit(padded_train, y_train, epochs=60, batch_size=128,
                   validation_data=(padded_test, y_test), callbacks=[set_callback], verbose=2)

Epoch 1/60
10/10 - 9s - 869ms/step - accuracy: 0.2718 - loss: 1.5916 - val_accuracy: 0.3221 - val_loss: 1.5562
Epoch 2/60
10/10 - 0s - 46ms/step - accuracy: 0.6611 - loss: 1.3697 - val_accuracy: 0.5906 - val_loss: 1.1325
Epoch 3/60
10/10 - 0s - 47ms/step - accuracy: 0.9111 - loss: 0.7094 - val_accuracy: 0.8389 - val_loss: 0.6645
Epoch 4/60
10/10 - 1s - 62ms/step - accuracy: 0.9874 - loss: 0.2542 - val_accuracy: 0.9262 - val_loss: 0.3035
Epoch 5/60
10/10 - 0s - 46ms/step - accuracy: 0.9975 - loss: 0.0524 - val_accuracy: 0.9329 - val_loss: 0.2563
Epoch 6/60
10/10 - 1s - 50ms/step - accuracy: 0.9983 - loss: 0.0177 - val_accuracy: 0.9396 - val_loss: 0.2441
Epoch 7/60
10/10 - 1s - 53ms/step - accuracy: 0.9992 - loss: 0.0094 - val_accuracy: 0.9362 - val_loss: 0.2699
Epoch 8/60
10/10 - 1s - 64ms/step - accuracy: 0.9992 - loss: 0.0088 - val_accuracy: 0.9295 - val_loss: 0.3358
Epoch 9/60
10/10 - 1s - 60ms/step - accuracy: 1.0000 - loss: 0.0043 - val_accuracy: 0.9161 - val_loss: 0.3769
Epoch 10/

In [16]:
model.evaluate(padded_test, y_test)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9234 - loss: 0.3850


[0.39577338099479675, 0.9295302033424377]

In [17]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=tokenizer.vocab_size+1, output_dim=500, mask_zero=True),
    tf.keras.layers.LSTM(16, dropout=0.9),
    tf.keras.layers.Dense(5,activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

X_all = np.concatenate((padded_train, padded_test))
y_all = np.concatenate((y_train, y_test))

class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('loss')<=history.history['loss'][-1]):
            print('Stopped, reach same losses as splitted trial')
            self.model.stop_training = True
set_callback = myCallback()

history = model.fit(X_all, y_all, epochs=60, batch_size=128,
                    callbacks=[set_callback], verbose=2)

Epoch 1/60
12/12 - 4s - 361ms/step - accuracy: 0.2315 - loss: 1.6064
Epoch 2/60
12/12 - 2s - 135ms/step - accuracy: 0.3107 - loss: 1.5851
Epoch 3/60
12/12 - 0s - 37ms/step - accuracy: 0.4054 - loss: 1.5488
Epoch 4/60
12/12 - 0s - 38ms/step - accuracy: 0.5094 - loss: 1.4383
Epoch 5/60
12/12 - 0s - 37ms/step - accuracy: 0.5174 - loss: 1.1650
Epoch 6/60
12/12 - 0s - 38ms/step - accuracy: 0.5591 - loss: 0.9750
Epoch 7/60
12/12 - 1s - 52ms/step - accuracy: 0.7584 - loss: 0.8106
Epoch 8/60
12/12 - 0s - 38ms/step - accuracy: 0.8517 - loss: 0.6544
Epoch 9/60
12/12 - 1s - 51ms/step - accuracy: 0.9081 - loss: 0.5152
Epoch 10/60
12/12 - 1s - 51ms/step - accuracy: 0.9430 - loss: 0.3807
Epoch 11/60
12/12 - 1s - 52ms/step - accuracy: 0.9658 - loss: 0.2818
Epoch 12/60
12/12 - 0s - 38ms/step - accuracy: 0.9732 - loss: 0.2333
Epoch 13/60
12/12 - 0s - 37ms/step - accuracy: 0.9839 - loss: 0.1710
Epoch 14/60
12/12 - 0s - 38ms/step - accuracy: 0.9839 - loss: 0.1545
Epoch 15/60
12/12 - 0s - 38ms/step - accu

In [18]:
sample

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,business
3,459,entertainment
4,1020,politics
...,...,...
730,1923,sport
731,373,tech
732,1704,business
733,206,entertainment


In [29]:
test['Text'] = test['Text'].apply(txtprocess)
test['Text'] = test['Text'].apply(lambda x: remove_stopwords(x))

testtext = test['Text'].values

paddedtesttext = tokenizer(testtext.tolist(), padding=True, truncation=True, return_tensors='pt')['input_ids']

In [31]:
answer=np.argmax(model.predict(paddedtesttext), axis=-1)
answer=pd.DataFrame(answer, columns=['Category'])

mapping=dict(enumerate(category.columns))
answer.Category=answer.Category.map(mapping)

answer=pd.concat((test.ArticleId, answer),axis=1)
answer

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,sport
3,459,business
4,1020,sport
...,...,...
730,1923,business
731,373,entertainment
732,1704,tech
733,206,business
