In [106]:
import numpy as np
import pandas as pd
import string
import nltk
import math
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.metrics import classification_report


[nltk_data] Downloading package stopwords to /home/ayush/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ayush/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [107]:
# import zipfile

# with zipfile.ZipFile('BBC News Train.csv.zip', 'r') as zip_ref:
#     zip_ref.extractall('./')

In [108]:
df = pd.read_csv('BBC News Train.csv')
# pd.set_option('display.max_colwidth', 1000)
df

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


# 2.1

In [109]:
def remove_punctuation_per_row(s):
    return s.translate(str.maketrans('', '', string.punctuation))

In [110]:
def remove_stopwords_per_row(s):
    stop_words = set(stopwords.words('english'))
    tokens = s.split()
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

In [111]:
lemmatizer = WordNetLemmatizer()
def lemmatize_row(row):
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in row]
    return ''.join(lemmatized_tokens)

In [112]:
def remove_integers_per_row(row):
    return ''.join([i for i in row if not i.isdigit()])

In [113]:
stemmer = PorterStemmer()

def stem_row(row):
    stemmed_tokens = [stemmer.stem(token) for token in row]
    return ''.join(stemmed_tokens)

In [114]:
category = list(df['Category'].unique())


In [115]:
#showing step wise preprocessing, then keeping the final preprocessed column only
def preprocess_df_1(df):
    df['punct'] = df[['Text']].applymap(remove_punctuation_per_row)
    df['lower'] = df[['punct']].applymap(lambda x: x.lower() if isinstance(x, str) else x)
    df['stopwords'] = df[['lower']].applymap(remove_stopwords_per_row)
    df['tokenized'] = df[['stopwords']].apply(lambda x: x.split()  if isinstance(x, str) else x)
#     df['stemmed'] = df['tokenized'].applymap(stem_row)
    df['lemmatized'] = df[['tokenized']].apply(lemmatize_row, axis=1)
    df['no_integers'] = df[['lemmatized']].applymap(remove_integers_per_row)
    
    display(df)
    df = df.drop(columns =['punct','stopwords', 'lower', 'tokenized', 'lemmatized', 'ArticleId'])
    df = df.rename(columns = {'no_integers': 'preprocessed'})
    display(df)
    
    '''vocab built on entiree dataset, the tf value and related probabilities shall be calculated over train split 
only but smoothed oversmoothed on entiree dataset, to be used dueing testing'''
    vocab = set()        
    for row in range(len(df)):
        terms = set(df['preprocessed'][row].split())
        for term in terms:
            if term not in vocab:
                vocab.add(term)
    return df, vocab

In [116]:
#stemming not in 1st method of preprocessing
def preprocess_df_2(df):
    df['punct'] = df[['Text']].applymap(remove_punctuation_per_row)
    df['lower'] = df[['punct']].applymap(lambda x: x.lower() if isinstance(x, str) else x)
    df['stopwords'] = df[['lower']].applymap(remove_stopwords_per_row)
    df['tokenized'] = df[['stopwords']].apply(lambda x: x.split()  if isinstance(x, str) else x)
    df['stemmed'] = df['tokenized'].apply(stem_row)
    df['lemmatized'] = df[['tokenized']].apply(lemmatize_row, axis=1)
    df['no_integers'] = df[['lemmatized']].applymap(remove_integers_per_row)
    
    display(df)
    df = df.drop(columns =['punct','stopwords', 'lower', 'tokenized', 'lemmatized', 'ArticleId'])
    df = df.rename(columns = {'no_integers': 'preprocessed'})
    display(df)
    
    '''built on entiree dataset, the tf value and related probabilities shall be calculated over train split 
only but smoothed oversmoothed on entiree dataset, to be used dueing testing'''
    vocab = set()        
    for row in range(len(df)):
        terms = set(df['preprocessed'][row].split())
        for term in terms:
            if term not in vocab:
                vocab.add(term)
    return df, vocab

# 2.2

In [117]:
##python code to split a df into train and test df based on given percentage
def split_train_test(df, test_size):
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df, test_size=test_size, random_state=42)
    
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)

    return train, test

# 2.3

In [129]:
def train(df, vocab):
    #calculate tf
    tf = {key:{term:1 for term in vocab} for key in category}  #initialised with 1 for smoothing
    for row in range(len(df)):
        s = df['preprocessed'][row]
        terms = s.split()
        for term in terms :
            tf[df['Category'][row]][term]+=1
    
    #calculate cf
    cf_classwise = {term:{key:0 for key in category} for term in vocab}
    for row in range(len(df)):
        terms = set(df['preprocessed'][row].split())
        for term in terms:
            cf_classwise[term][df['Category'][row]] = 1
    
    #calculate icf
    icf = {token:0 for token in vocab}
    for token in vocab:
        cf_per_token = sum(cf_classwise[token].values())
        icf[token] = math.log10(1+(5/(1 + cf_per_token)))

    #calculate idf
    idf = {token:0 for token in vocab}
    N = len(df)
    for token in vocab:
        df_count = sum([1 for text in df['preprocessed'] if token in text])
        idf[token] = math.log10(N/df_count) if df_count > 0 else 0
            
    
    #calculate tficf
    tficf = {key:{term:0 for term in vocab} for key in category}
    for classes in category:
        for term in vocab:
            tficf[classes][term] = tf[classes][term]*icf[term]
            
    #calculate tfidf
    tfidf = {key:{term:0 for term in vocab} for key in category}
    for classes in category:
        for term in vocab:
            tfidf[classes][term] = tf[classes][term]*idf[term]
    
    
    #priors
    priors  = {key : 0 for key in category}
    for row in range(len(df)):
        priors[df['Category'][row]] += 1
    priors = {key: value/len(df) for key, value in priors.items()}
    
    #conditional probablities
    prob_tficf = {key:{term:0 for term in vocab} for key in category}
    for classes in category:
        classwise_sum_tficf = sum(tficf[classes].values())
        for term in vocab:
            prob_tficf[classes][term] = tficf[classes][term]/classwise_sum_tficf
            
    prob_tfidf = {key:{term:0 for term in vocab} for key in category}
    for classes in category:
        classwise_sum_tfidf = sum(tfidf[classes].values())
        for term in vocab:
            prob_tfidf[classes][term] = tfidf[classes][term]/classwise_sum_tfidf
        
    return priors, prob_tficf, prob_tfidf

In [130]:
##python code to calculate inverse document frequency for a given dataframe , given inputs are dataframe and vocabulary

# 2.4

In [151]:
def test(priors, prob_tficf, prob_tfidf, df):
    y_pred1 = []                          #1 is with tficf
    y_pred2 = []                          #2 is with tfidf
    y_true = list(df['Category'])
    
#     prob = dict(priors)
    for key in priors.keys():
        priors[key] = math.log10(1+priors[key])
    
    for row in range(len(df)):
        terms = list(df['preprocessed'][row].split())
        prob1 = dict(priors)
        prob2 = dict(priors)

        for key in category:
            sum_logs1 = 0
            sum_logs2 = 0
            for term in terms:
                if prob_tficf[key][term] != 0:
                    sum_logs1 += math.log10(prob_tficf[key][term])
                if prob_tfidf[key][term] != 0:
                    sum_logs2 += math.log10(prob_tfidf[key][term]) 
            prob1[key] = sum_logs1
            prob2[key] = sum_logs2
        y_pred1.append(max(prob1, key=prob1.get))
        y_pred2.append(max(prob2, key=prob2.get))
    
    print(classification_report(y_true, y_pred1, target_names=category))
    print(classification_report(y_true, y_pred2, target_names=category))
    
    
    correct_predictions1 = 0
    correct_predictions2 = 0

    for i in range(len(y_true)):
        if y_true[i] == y_pred1[i]:
            correct_predictions1 += 1
        if y_true[i] == y_pred2[i]:
            correct_predictions2 += 1
        
    accuracy1 = correct_predictions1 / len(y_true)
    accuracy2 = correct_predictions2 / len(y_true)

    print('Accuracy tficf = ', accuracy1)
    print('Accuracy tfidf = ', accuracy2)
    
    return accuracy1, accuracy2

# 2.5

In [165]:
def improved():
    test_fractions = [0.2, 0.3, 0.4, 0.5]
    accuracy_list = {}
    
    #2 ways of preprocessing
    df1, vocab1 = preprocess_df_1(df)
    df2, vocab2 = preprocess_df_2(df)
    
#for df1   
    for data, vocab in [(df1, vocab1), (df2, vocab2)]:
        for test_split in test_fractions:
            train_df, test_df = split_train_test(data, test_split)
            
            priors, prob_tficf, prob_tfidf  = train(train_df, vocab)
            result1, result2 = test(priors, prob_tficf, prob_tfidf, test_df)
            if data is df1:
                method=1
            else:
                method=2
            accuracy_list[f'preprocesss = {method},split = {test_split}, vectorizer = tficf'] = result1
            accuracy_list[f'preprocesss = {method},split = {test_split}, vectorizer = tfidf'] = result2

            
    return accuracy_list

In [166]:
accuracy_list = improved()

Unnamed: 0,ArticleId,Text,Category,punct,lower,stopwords,tokenized,lemmatized,no_integers,stemmed
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,bbc poll indicates economic gloom citizens in ...,bbc poll indicates economic gloom citizens maj...,bbc poll indicates economic gloom citizens maj...,bbc poll indicates economic gloom citizens maj...,bbc poll indicates economic gloom citizens maj...,bbc poll indicates economic gloom citizens maj...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...,lifestyle governs mobile choice faster bett...,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in 168m payout eighteen former en...,enron bosses in 168m payout eighteen former en...,enron bosses 168m payout eighteen former enron...,enron bosses 168m payout eighteen former enron...,enron bosses 168m payout eighteen former enron...,enron bosses m payout eighteen former enron di...,enron bosses 168m payout eighteen former enron...
...,...,...,...,...,...,...,...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment,double eviction from big brother model caprice...,double eviction from big brother model caprice...,double eviction big brother model caprice holb...,double eviction big brother model caprice holb...,double eviction big brother model caprice holb...,double eviction big brother model caprice holb...,double eviction big brother model caprice holb...
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment,dj double act revamp chart show dj duo jk and ...,dj double act revamp chart show dj duo jk and ...,dj double act revamp chart show dj duo jk joel...,dj double act revamp chart show dj duo jk joel...,dj double act revamp chart show dj duo jk joel...,dj double act revamp chart show dj duo jk joel...,dj double act revamp chart show dj duo jk joel...
1487,1590,weak dollar hits reuters revenues at media gro...,business,weak dollar hits reuters revenues at media gro...,weak dollar hits reuters revenues at media gro...,weak dollar hits reuters revenues media group ...,weak dollar hits reuters revenues media group ...,weak dollar hits reuters revenues media group ...,weak dollar hits reuters revenues media group ...,weak dollar hits reuters revenues media group ...
1488,1587,apple ipod family expands market apple has exp...,tech,apple ipod family expands market apple has exp...,apple ipod family expands market apple has exp...,apple ipod family expands market apple expande...,apple ipod family expands market apple expande...,apple ipod family expands market apple expande...,apple ipod family expands market apple expande...,apple ipod family expands market apple expande...


Unnamed: 0,Text,Category,preprocessed,stemmed
0,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...
1,german business confidence slides german busin...,business,german business confidence slides german busin...,german business confidence slides german busin...
2,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens maj...,bbc poll indicates economic gloom citizens maj...
3,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...
4,enron bosses in $168m payout eighteen former e...,business,enron bosses m payout eighteen former enron di...,enron bosses 168m payout eighteen former enron...
...,...,...,...,...
1485,double eviction from big brother model caprice...,entertainment,double eviction big brother model caprice holb...,double eviction big brother model caprice holb...
1486,dj double act revamp chart show dj duo jk and ...,entertainment,dj double act revamp chart show dj duo jk joel...,dj double act revamp chart show dj duo jk joel...
1487,weak dollar hits reuters revenues at media gro...,business,weak dollar hits reuters revenues media group ...,weak dollar hits reuters revenues media group ...
1488,apple ipod family expands market apple has exp...,tech,apple ipod family expands market apple expande...,apple ipod family expands market apple expande...


Unnamed: 0,ArticleId,Text,Category,punct,lower,stopwords,tokenized,lemmatized,no_integers,stemmed
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,bbc poll indicates economic gloom citizens in ...,bbc poll indicates economic gloom citizens maj...,bbc poll indicates economic gloom citizens maj...,bbc poll indicates economic gloom citizens maj...,bbc poll indicates economic gloom citizens maj...,bbc poll indicates economic gloom citizens maj...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...,lifestyle governs mobile choice faster bett...,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in 168m payout eighteen former en...,enron bosses in 168m payout eighteen former en...,enron bosses 168m payout eighteen former enron...,enron bosses 168m payout eighteen former enron...,enron bosses 168m payout eighteen former enron...,enron bosses m payout eighteen former enron di...,enron bosses 168m payout eighteen former enron...
...,...,...,...,...,...,...,...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment,double eviction from big brother model caprice...,double eviction from big brother model caprice...,double eviction big brother model caprice holb...,double eviction big brother model caprice holb...,double eviction big brother model caprice holb...,double eviction big brother model caprice holb...,double eviction big brother model caprice holb...
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment,dj double act revamp chart show dj duo jk and ...,dj double act revamp chart show dj duo jk and ...,dj double act revamp chart show dj duo jk joel...,dj double act revamp chart show dj duo jk joel...,dj double act revamp chart show dj duo jk joel...,dj double act revamp chart show dj duo jk joel...,dj double act revamp chart show dj duo jk joel...
1487,1590,weak dollar hits reuters revenues at media gro...,business,weak dollar hits reuters revenues at media gro...,weak dollar hits reuters revenues at media gro...,weak dollar hits reuters revenues media group ...,weak dollar hits reuters revenues media group ...,weak dollar hits reuters revenues media group ...,weak dollar hits reuters revenues media group ...,weak dollar hits reuters revenues media group ...
1488,1587,apple ipod family expands market apple has exp...,tech,apple ipod family expands market apple has exp...,apple ipod family expands market apple has exp...,apple ipod family expands market apple expande...,apple ipod family expands market apple expande...,apple ipod family expands market apple expande...,apple ipod family expands market apple expande...,apple ipod family expands market apple expande...


Unnamed: 0,Text,Category,preprocessed,stemmed
0,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...,worldcom exboss launches defence lawyers defen...
1,german business confidence slides german busin...,business,german business confidence slides german busin...,german business confidence slides german busin...
2,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens maj...,bbc poll indicates economic gloom citizens maj...
3,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...
4,enron bosses in $168m payout eighteen former e...,business,enron bosses m payout eighteen former enron di...,enron bosses 168m payout eighteen former enron...
...,...,...,...,...
1485,double eviction from big brother model caprice...,entertainment,double eviction big brother model caprice holb...,double eviction big brother model caprice holb...
1486,dj double act revamp chart show dj duo jk and ...,entertainment,dj double act revamp chart show dj duo jk joel...,dj double act revamp chart show dj duo jk joel...
1487,weak dollar hits reuters revenues at media gro...,business,weak dollar hits reuters revenues media group ...,weak dollar hits reuters revenues media group ...
1488,apple ipod family expands market apple has exp...,tech,apple ipod family expands market apple expande...,apple ipod family expands market apple expande...


               precision    recall  f1-score   support

     business       0.99      0.97      0.98        75
         tech       1.00      0.96      0.98        46
     politics       0.92      0.98      0.95        56
        sport       1.00      1.00      1.00        63
entertainment       0.96      0.95      0.96        58

     accuracy                           0.97       298
    macro avg       0.97      0.97      0.97       298
 weighted avg       0.97      0.97      0.97       298

               precision    recall  f1-score   support

     business       0.99      0.97      0.98        75
         tech       1.00      0.96      0.98        46
     politics       0.92      0.98      0.95        56
        sport       1.00      1.00      1.00        63
entertainment       0.96      0.95      0.96        58

     accuracy                           0.97       298
    macro avg       0.97      0.97      0.97       298
 weighted avg       0.97      0.97      0.97       298

Accu

In [167]:
accuracy_list

{'preprocesss = 1,split = 0.2, vectorizer = tficf': 0.9731543624161074,
 'preprocesss = 1,split = 0.2, vectorizer = tfidf': 0.9731543624161074,
 'preprocesss = 1,split = 0.3, vectorizer = tficf': 0.9776286353467561,
 'preprocesss = 1,split = 0.3, vectorizer = tfidf': 0.9798657718120806,
 'preprocesss = 1,split = 0.4, vectorizer = tficf': 0.9748322147651006,
 'preprocesss = 1,split = 0.4, vectorizer = tfidf': 0.9748322147651006,
 'preprocesss = 1,split = 0.5, vectorizer = tficf': 0.9771812080536912,
 'preprocesss = 1,split = 0.5, vectorizer = tfidf': 0.9758389261744966,
 'preprocesss = 2,split = 0.2, vectorizer = tficf': 0.9731543624161074,
 'preprocesss = 2,split = 0.2, vectorizer = tfidf': 0.9731543624161074,
 'preprocesss = 2,split = 0.3, vectorizer = tficf': 0.9776286353467561,
 'preprocesss = 2,split = 0.3, vectorizer = tfidf': 0.9798657718120806,
 'preprocesss = 2,split = 0.4, vectorizer = tficf': 0.9748322147651006,
 'preprocesss = 2,split = 0.4, vectorizer = tfidf': 0.9748322147

In [168]:
max(accuracy_list.values())

0.9798657718120806