# BBC News Document Classification

Useful Resources
- https://www.kdnuggets.com/2019/04/text-preprocessing-nlp-machine-learning.html
- https://www.analyticsvidhya.com/blog/2019/08/how-to-remove-stopwords-text-normalization-nltk-spacy-gensim-python/
- https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089
- http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/
- https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

## Imports

In [34]:
import os
import numpy as np
import pandas as pd
import regex as re
import gensim
import multiprocessing
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    plot_confusion_matrix,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
pd.set_option('display.max_colwidth', None)
cores = multiprocessing.cpu_count()

## Fetch sub-directory names

In [2]:
def get_subfolder_names(path, excluded_files):
    return [file for file in os.listdir(path) if file not in excluded_files]

In [3]:
sub_folders = get_subfolder_names('D:/Resume-projects/nlp-data/document-classification/bbc','README.TXT')
sub_folders

['business', 'entertainment', 'politics', 'sport', 'tech']

## Fetch sub-directory wise text file names

In [4]:
def get_folderwise_txtfile_names(path, folder_names):
    txtfile_dict = {}
    for folder in folder_names:
        txtfile_dict[folder] = [file for file in os.listdir(path + folder)]
    return txtfile_dict

In [5]:
txtfiles_dict = get_folderwise_txtfile_names('D:/Resume-projects/nlp-data/document-classification/bbc/',sub_folders)
txtfiles_dict['tech'][:10]

['001.txt',
 '002.txt',
 '003.txt',
 '004.txt',
 '005.txt',
 '006.txt',
 '007.txt',
 '008.txt',
 '009.txt',
 '010.txt']

In [6]:
print('Are all extracted subfolders and present in the dictionary? ' + str(sub_folders == list(txtfiles_dict.keys())))

Are all extracted subfolders and present in the dictionary? True


## Creating dataframe with two columns (article as text and its corresponding category)

In [7]:
df = pd.DataFrame(columns=['article', 'category'])

In [8]:
def create_df(path, folder_txtfile_dict, df):
    for folder, txt_filenames in folder_txtfile_dict.items():
        for filename in txt_filenames:
            file = open(path + folder + '/' + filename)
            new_row = {'article': file.read().replace("\n", " ").strip(), 'category': folder}
            df = df.append(new_row, ignore_index=True)
    return df

In [9]:
df = create_df('D:/Resume-projects/nlp-data/document-classification/bbc/', txtfiles_dict, df)

## Data Manipulation

In [10]:
def data_manipulation(df, text_col):
    df[text_col] = df[text_col].str.replace('$', '$\$$')  # Prevent latex styling due to dollar sign
    df = df.sample(frac=1, random_state=7)  # Resampling data
    return df

In [11]:
df = data_manipulation(df, 'article')

  df[text_col] = df[text_col].str.replace('$', '$\$$')  # Prevent latex styling due to dollar sign


In [12]:
def text_preprocessor(text):
    filtered_sentences = []
    lemma_word = []
    stop_words = set(stopwords.words('english') + ['\'s'])
    wordnet_lemmatizer = WordNetLemmatizer()
    text = text.lower()
    text = re.sub(pattern='[^\w\s]', repl = '', string = text)
    # Remove stop words
    word_tokens = word_tokenize(text)
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentences.append(w)
    # Lemmatization
    for w in filtered_sentences:
        word1 = wordnet_lemmatizer.lemmatize(w, pos="n")
        word2 = wordnet_lemmatizer.lemmatize(word1, pos="v")
        word3 = wordnet_lemmatizer.lemmatize(word2, pos=("a"))
        lemma_word.append(word3)
    filtered_text = " ".join(lemma_word)
    return filtered_text

In [13]:
df['article'] = df['article'].apply(text_preprocessor)  # converts sentence to lower case and removes stop words, punctuations and lemmatize

In [14]:
print('There are '+ str(df.shape[0]) + ' articles from BBC in this dataset with categories belonging to', \
      ', '.join([str(elem) for elem in sub_folders[:-1]]) + ' and ' + sub_folders[-1])

There are 2225 articles from BBC in this dataset with categories belonging to business, entertainment, politics, sport and tech


In [15]:
df.head(5)

Unnamed: 0,article,category
1036,howard deny split id card michael howard deny shadow cabinet split decision back controversial labour plan introduce id card tory leader say front bench team reach collective view hold good discussion admit easy issue decide support plan police say would help fight terror crime illegal immigration lib dems pledge oppose bill debate next monday tory source say senior party figure argue vociferously id card scheme among report serious reservation strategy senior shadow cabinet member david davis oliver letwin tim yeo mr howard deny mr yeo transport environment spokesman say plan stink also say confident shadow home secretary mr davis would set position clearly stand debate matter next week mr howard say police say id card could help foil terror bomb plot people could lose life add police say take seriously acknowledge good libertarian argument card say shadow cabinet weigh conflict interest reach decision dont pretend easy decision end day decision take also deny afraid look soft issue compare labour conservative announce support government plan monday even source within party tell bbc mr howard always favour id card try introduce home secretary tory insist would hold minister account precise purpose scheme say would also press labour whether objective could meet whether home office would able deliver pledge ass cost effectiveness id card whether people privacy would properly protect important remember bill take decade come full effect spokesman say lib dem home affair spokesman mark oaten brand id scheme waste money deeply flaw say sign michael howard overrule colleague concern id card chairman bar council guy mansfield qc warn real risk people margin society would drive hand extremist go happen young asian men bomb go somewhere go stop havent id card go detain tory exminister douglas hogg say oppose plan id card brand regressive step would intrude life ordinary citizen without counterbalance benefit predict ultimately carry card would become compulsory would lead large number britain ethnic minority stop police,politics
372,quiksilver move rossignol share ski rossignol world large skimaker jump much 15 speculation buy u surfwear firm quiksilver owner rossignol boixvives family say consider offer quiksilver analyst believe sport good company may take close look rossignol prompt auction push sale price high nike k2 previously mention possible suitor rossignol share touch 1770 euro fall back trade 78 high 1660 euro european sport good company see foreign revenue squeeze slump value u dollar make takeover attractive analyst say company quiksilver would able cut cost sell rossignol ski shop add boixvives family think spend past couple year sound possible suitor rossignol also make golf equipment snowboard sport clothe,business
1409,moyes uturn beattie dismissal everton manager david moyes discipline striker jam beattie headbutt chelsea defender william gallas scot initially defend beattie whose dismissal put everton back foot game ultimately lose 10 say gallas overreact rethink look video evidence say believe set record straight concede dismissal right correct moyes add comment saturday come immediately final whistle point opportunity see one quick rerun incident club website also report beattie seem unrepentant saturday match insist gallas would stay lot long headbutt apologise moyes continue although incident totally character jam never even suspend career action unacceptable detrimental effect teammate jam issue formal apology teammate everton supporter immediately game right thing do subject normal club discipline competitive player fair player know upset happen however must say still believe chelsea player question go easily speak immediately game moyes say dont think sendingoff centrehalf time would ashamed go easily million year would john terry go way never hear anybody butt somebody behind run happen big strong centrehalves think push initially still dont think sendingoff angry beattie initially say gallas would stay lot long headbutt tell wasnt intentional headbutt chase ball corner william gallas look shoulder block stop run say youre go stay way ill go straight head barely touch wasnt intentional headbutt,sport
1518,anelka eye man city departure striker nicolas anelka reportedly want leave manchester city search champion league football anelka 25 talk contract extension beyond 2007 city believe fear career go stale stay club news world report anelka tell french magazine either decide win title easy life think always choose football great club make offer add win title need player capability stagnate eighth 15th place impossible progress go score goal win risk go create feel feed anelka earn reputation difficult character handle spell arsenal real madrid paris st germain feel come back haunt talk sign extension contract say well sport aspect also come account play eighth place good miss champion league real madrid 2000 need play play im thing happen past nothing football dont blame anyone bite fault,sport
279,amex share spinoff news share american express surge 8 tuesday say spin le profitable financial advisory subsidiary u credit card travel service giant say offload american express financial advisor aefa would boost profitability aefa 12000 adviser sell financial advice fund insurance 25 million customer year deliver poor profit even loss excellent move american express focus core business sell laggard division problem quite time say marquis investment research analyst phil kain analyst estimate standalone aefa could market value 10bn â53bn unit acquire american express 20 year ago investor diversify service minneapolis time firm amass onestop financial empire however business sell investment never integrate rest group,business


In [16]:
# symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
# for i in symbols:
#     data = np.char.replace(data, i, ' ')

## Test Train Split

In [17]:
X = df['article']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)

In [18]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1668,) (557,) (1668,) (557,)


## Feature Engineering

### Count Vectorizer

In [21]:
count_vectorizer = CountVectorizer(stop_words="english")
cv = count_vectorizer.fit_transform(X_train)

In [22]:
count_vec_op = pd.DataFrame(data=cv.toarray(), index=X_train.index ,columns = count_vectorizer.get_feature_names_out())

In [23]:
count_vec_df = pd.concat([pd.DataFrame(y_train), count_vec_op], axis=1, ignore_index=False)
count_vec_df['no_of_words_cv'] = count_vec_df.sum(axis=1)
count_vec_df['no_of_words_txt'] = df['article'].apply(lambda x: len(x.split(' ')))
count_vec_df.head()

  count_vec_df['no_of_words_cv'] = count_vec_df.sum(axis=1)


Unnamed: 0,category,00,000,0001,001,002,003,004secs,007,01,...,â960m,â96bn,â97m,â980m,â98m,â99,â99m,â9m,no_of_words_cv,no_of_words_txt
1237,politics,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,187,212
631,entertainment,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,342,396
1641,sport,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,228,262
1015,politics,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,205,222
772,entertainment,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,313,351


In [24]:
count_vec_df = count_vec_df.drop(['no_of_words_cv','no_of_words_txt'], axis=1)

In [25]:
count_vec_df.head()

Unnamed: 0,category,00,000,0001,001,002,003,004secs,007,01,...,â958m,â95m,â960m,â96bn,â97m,â980m,â98m,â99,â99m,â9m
1237,politics,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
631,entertainment,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1641,sport,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1015,politics,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
772,entertainment,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF

In [26]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidfv = tfidf_vectorizer.fit_transform(X_train)

In [27]:
tfidf_vec_op = pd.DataFrame(data=tfidfv.toarray(), index=X_train.index ,columns = tfidf_vectorizer.get_feature_names_out())

In [29]:
tfidf_vec_df = pd.concat([pd.DataFrame(y_train), tfidf_vec_op], axis=1, ignore_index=False)
tfidf_vec_df.head()

Unnamed: 0,category,00,000,0001,001,002,003,004secs,007,01,...,â958m,â95m,â960m,â96bn,â97m,â980m,â98m,â99,â99m,â9m
1237,politics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
631,entertainment,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1641,sport,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1015,politics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
772,entertainment,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Word2Vec

In [35]:
# Create CBOW model
w2v_cbow = gensim.models.Word2Vec(X_train, min_count = 1, window = 5)
# # Print results
# print("Cosine similarity between 'alice' " + 
#                "and 'wonderland' - CBOW : ",
#     model1.similarity('alice', 'wonderland'))
      
# print("Cosine similarity between 'alice' " +
#                  "and 'machines' - CBOW : ",
#       model1.similarity('alice', 'machines'))

In [42]:
w2v_cbow

<gensim.models.word2vec.Word2Vec at 0x1b1387270d0>

In [None]:
# Create Skip Gram model
w2v_skp_grm = gensim.models.Word2Vec(X_train, min_count = 1, size = 100, window = 5, sg = 1)
# # Print results
# print("Cosine similarity between 'alice' " +
#           "and 'wonderland' - Skip Gram : ",
#     model2.similarity('alice', 'wonderland'))
      
# print("Cosine similarity between 'alice' " +
#             "and 'machines' - Skip Gram : ",
#       model2.similarity('alice', 'machines'))

## Models

In [None]:
# def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
#     """
#     Returns mean and std of cross validation
#     """
#     scores = cross_validate(model, X_train, y_train, **kwargs)

#     mean_scores = pd.DataFrame(scores).mean()
#     std_scores = pd.DataFrame(scores).std()
#     out_col = []

#     for i in range(len(mean_scores)):
#         out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

#     return pd.Series(data=out_col, index=mean_scores.index)

### Dummy Classifier

In [None]:
# results = {}
# dummy = DummyClassifier()
# results["dummy"] = mean_std_cross_val_scores(
#     dummy, X_train, y_train, return_train_score=True
# )
# pd.DataFrame(results)

### GloVe

### fastText

### BERT 

### RoBERTa

### AlBERT

### DistilBERT

### OpenAI Transformer

### ELMo

### ULM-Fit

### T5

### PEGASUS

### XLNet

### Reformer

### MT-DNN

### GPT

### GPT2

### GPT3