In [1]:
# import necessary library

import numpy as np
import pandas as pd
# import textwrap
# import string
import nltk
# import spacy
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import os
# from collections import Counter
# from heapq import nlargest
import zipfile
from sklearn.model_selection import train_test_split
import re
from sklearn.metrics import accuracy_score, make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from nltk.tokenize import word_tokenize
import glob

In [2]:
# unzip archive

fantasy_zip = zipfile.ZipFile('/content/bbc.zip')
fantasy_zip.extractall('/content/')

In [3]:
# download necessesary nltk 

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [4]:
# download file from unzip archive

lst_dir = os.listdir('bbc')
texts = []
labels = []
for topic in lst_dir:
  for file in glob.glob(os.path.join(os.getcwd(), 'bbc', topic, '*.txt')):
    with open (file, encoding='utf-8', mode='r', errors='ignore') as f:
      texts.append(' '.join(f.read().splitlines()))
      labels.append(topic)

In [5]:
# create pandas dataframe
df = pd.DataFrame({'text': texts, 'label': labels})
df.head()

Unnamed: 0,text,label
0,More power to the people says HP The digital ...,tech
1,Gangsters dominate gaming chart Video games o...,tech
2,What's next for next-gen consoles? The next g...,tech
3,Smart search lets art fans browse If you don'...,tech
4,Hollywood campaign hits websites Movie studio...,tech


In [6]:
# dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2225 non-null   object
 1   label   2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [7]:
# find duplicate news 
df_duplicate = df.duplicated(subset=['text'])
df_duplicate.sum()

98

In [8]:
# show duplicate news
df[df.duplicated(subset=['text'])]

Unnamed: 0,text,label
69,2D Metal Slug offers retro fun Like some dril...,tech
74,Security warning over 'FBI virus' The US Fede...,tech
82,Progress on new internet domains By early 200...,tech
89,Doors open at biggest gadget fair Thousands o...,tech
112,Ask Jeeves joins web log market Ask Jeeves ha...,tech
...,...,...
2053,Troubled Marsh under SEC scrutiny The US stoc...,business
2068,Jobs growth still slow in the US The US creat...,business
2106,SEC to rethink post-Enron rules The US stock ...,business
2134,S Korea spending boost to economy South Korea...,business


In [9]:
# delete duplicate news and show info
data = df.drop_duplicates()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2127 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2127 non-null   object
 1   label   2127 non-null   object
dtypes: object(2)
memory usage: 49.9+ KB


In [10]:
# function for clean text

def text_transform(texts, stopwords):

  lemmatizer = WordNetLemmatizer() # create lemmatizer
  # stemmer = PorterStemmer()
  word_lst = word_tokenize(re.sub('[^a-zA-Z]', ' ', texts.lower())) # split on word and clean
  lst = ['v', 'a', 'r', 's', 'n']
  out = []
  for word in word_lst:
    for pos in lst:
      word = lemmatizer.lemmatize(word, pos=pos) # lemmatize all word
    if len(word) >= 2:
      out.append(word)
  out = [word for word in out if word not in stopwords] # check stopwords
  # out = [stemmer.stem(word) for word in word_lst if word not in stopwords ] # [word for word in word_lst if word not in stopwords ]
  
  return ' '.join(out)

In [11]:
# clean text and code label
data['clean_text'] = data.text.apply(text_transform, stopwords=stop_words)
data['label_id'] = data.label.factorize()[0]
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,text,label,clean_text,label_id
0,More power to the people says HP The digital ...,tech,power people say hp digital revolution focus l...,0
1,Gangsters dominate gaming chart Video games o...,tech,gangster dominate game chart video game consol...,0
2,What's next for next-gen consoles? The next g...,tech,next next gen console next generation video ga...,0
3,Smart search lets art fans browse If you don'...,tech,smart search let art fan browse know art know ...,0
4,Hollywood campaign hits websites Movie studio...,tech,hollywood campaign hit website movie studio ef...,0
...,...,...,...,...
2219,US Airways staff agree to pay cut A union rep...,business,airway staff agree pay cut union represent fli...,4
2220,US economy shows solid GDP growth The US econ...,business,economy show solid gdp growth economy grow exp...,4
2222,EMI shares hit by profit warning Shares in mu...,business,emi share hit profit warn share music giant em...,4
2223,Q&A: Malcolm Glazer and Man Utd The battle fo...,business,malcolm glazer man utd battle control manchest...,4


In [12]:
# create X and y
X = data.clean_text
y = data.label_id


In [13]:
# split data on train, test and validation samples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

print ("total train examples %s" % len(y_train))
print ("total test examples %s" % len(y_test))

total train examples 1701
total test examples 426


In [14]:
# convert X_train to TFIDF-form
vectorize = TfidfVectorizer()
X_train = vectorize.fit_transform(X_train).toarray()

In [15]:
# convert X_test to TFIDF form
X_test = vectorize.transform(X_test).toarray()

In [16]:
# function for train model and calculate metrics
def run_model(model_name, xtrain_tfidf, y_train, xtest_tfidf, y_test, 
              est_c=None, est_pnlty=None):
    mdl=''
    if model_name == 'Logistic Regression':
        mdl = LogisticRegression()
    elif model_name == 'Random Forest':
        mdl = RandomForestClassifier(n_estimators=300)
    elif model_name == 'Multinomial Naive Bayes':
        mdl = MultinomialNB()
    elif model_name == 'Linear SVC':
        mdl = LinearSVC()
    elif model_name == 'SGD Classifier':
        mdl = SGDClassifier()
    elif model_name == 'GaussianNB':
        mdl = GaussianNB()
    elif model_name == 'KNeighbors Classifier':
        mdl = KNeighborsClassifier()
    
    oneVsRest = OneVsRestClassifier(mdl)
    oneVsRest.fit(xtrain_tfidf, y_train)
    y_pred = oneVsRest.predict(xtest_tfidf)
    
    # Performance metrics
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    # Get precision, recall, f1 scores
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')

    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall    : {recall}')
    print(f'F1-score   : {f1score}')

    

In [None]:
# test different models

In [17]:
run_model(model_name = 'Logistic Regression', xtrain_tfidf=X_train, y_train=y_train, xtest_tfidf=X_test, y_test=y_test)

Test Accuracy Score of Basic Logistic Regression: % 98.36
Precision : 0.9835680751173709
Recall    : 0.9835680751173709
F1-score   : 0.9835680751173709


In [18]:
run_model(model_name = 'Random Forest', xtrain_tfidf=X_train, y_train=y_train, xtest_tfidf=X_test, y_test=y_test)

Test Accuracy Score of Basic Random Forest: % 98.36
Precision : 0.9835680751173709
Recall    : 0.9835680751173709
F1-score   : 0.9835680751173709


In [19]:
run_model(model_name = 'Multinomial Naive Bayes', xtrain_tfidf=X_train, y_train=y_train, xtest_tfidf=X_test, y_test=y_test)

Test Accuracy Score of Basic Multinomial Naive Bayes: % 97.42
Precision : 0.9741784037558685
Recall    : 0.9741784037558685
F1-score   : 0.9741784037558685


In [20]:
run_model(model_name = 'Linear SVC', xtrain_tfidf=X_train, y_train=y_train, xtest_tfidf=X_test, y_test=y_test)

Test Accuracy Score of Basic Linear SVC: % 98.59
Precision : 0.9859154929577465
Recall    : 0.9859154929577465
F1-score   : 0.9859154929577465


In [21]:
run_model(model_name = 'SGD Classifier', xtrain_tfidf=X_train, y_train=y_train, xtest_tfidf=X_test, y_test=y_test)

Test Accuracy Score of Basic SGD Classifier: % 98.83
Precision : 0.9882629107981221
Recall    : 0.9882629107981221
F1-score   : 0.9882629107981221


In [22]:
run_model(model_name = 'GaussianNB', xtrain_tfidf=X_train, y_train=y_train, xtest_tfidf=X_test, y_test=y_test)

Test Accuracy Score of Basic GaussianNB: % 80.52
Precision : 0.8051643192488263
Recall    : 0.8051643192488263
F1-score   : 0.8051643192488261


In [23]:
run_model(model_name = 'KNeighbors Classifier', xtrain_tfidf=X_train, y_train=y_train, xtest_tfidf=X_test, y_test=y_test)

Test Accuracy Score of Basic KNeighbors Classifier: % 95.54
Precision : 0.9553990610328639
Recall    : 0.9553990610328639
F1-score   : 0.9553990610328639


In [24]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 30.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 59.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 71.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [25]:
# import libraris for transformer
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
device ='cuda' if torch.cuda.is_available() else 'cpu'

In [26]:
# split data on train, test and validation samples
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)

In [27]:
# create id2label and label2id for transformer
id2label = {n: k for n, k in enumerate(data.label.unique())}
label2id = {k: n for n, k in enumerate(data.label.unique())}

In [28]:
# create transformer tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=y_train.nunique(), label2id=label2id, id2label=id2label).to(device)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

In [29]:
# create train, test and validation tokenizer samples
# max len text in model 512
tokenized_train = tokenizer(X_train.to_list(), padding = True, truncation = True, return_tensors="pt")
tokenized_test = tokenizer(X_test.to_list() , padding = True, truncation = True,  return_tensors="pt")
tokenized_val = tokenizer(X_val.to_list() , padding = True, truncation = True,  return_tensors="pt")

In [30]:
# class for create dataset
from torch.utils.data import Dataset
class MyDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
        
    def __len__(self):
        return len(self.labels)

In [31]:
# create dataset
train_dataset= MyDataset(tokenized_train, y_train.to_list())
test_dataset = MyDataset(tokenized_test, y_test.to_list())
val_dataset=  MyDataset(tokenized_val, y_val.to_list())

In [32]:
from transformers import TrainingArguments, Trainer

In [33]:
# set arguments for Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
)

In [34]:
# create Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset             # evaluation dataset
)

In [35]:
# run model training
trainer.train()

***** Running training *****
  Num examples = 1360
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 510
  if __name__ == '__main__':


Step,Training Loss
500,0.1572


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
  if __name__ == '__main__':


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=510, training_loss=0.15413977579012805, metrics={'train_runtime': 194.8721, 'train_samples_per_second': 20.937, 'train_steps_per_second': 2.617, 'total_flos': 540495901900800.0, 'train_loss': 0.15413977579012805, 'epoch': 3.0})

In [36]:
# save model
trainer.save_model('bbc_model')

Saving model checkpoint to bbc_model
Configuration saved in bbc_model/config.json
Model weights saved in bbc_model/pytorch_model.bin


In [37]:
# predict model
y_predict = trainer.predict(val_dataset)

***** Running Prediction *****
  Num examples = 426
  Batch size = 4
  if __name__ == '__main__':


In [None]:
# result processing
test_results = np.argmax(y_predict.predictions, axis=-1)

In [None]:
# calculate metrics 
precision, recall, f1score, support = score(y_val, test_results, average='micro')
accuracy = round(accuracy_score(y_val, test_results,) * 100, 2)
print(f'Test Accuracy Score of Basic DistilBERT: % {accuracy}')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F1-score   : {f1score}')

Test Accuracy Score of Basic DistilBERT: % 97.65
Precision : 0.9765258215962441
Recall    : 0.9765258215962441
F1-score   : 0.9765258215962441


In [None]:
# the result of the transformer operation is comparable with the results of the classical classification algorithms