In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import Counter
import math

In [2]:
# read the data with column names category, text
df = pd.read_csv(r'Data\BBC News Train.csv')

In [3]:
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


### 1. Preprocessing

In [4]:
# drop ArticleId column
df.drop('ArticleId', axis=1, inplace=True)
df

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business
...,...,...
1485,double eviction from big brother model caprice...,entertainment
1486,dj double act revamp chart show dj duo jk and ...,entertainment
1487,weak dollar hits reuters revenues at media gro...,business
1488,apple ipod family expands market apple has exp...,tech


In [5]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
stopwords = set(stopwords)
 
def remove_punctuation(text):
    """
    This function removes punctuation from a string
    """
    import string
    new_string = ""
    for char in text:
        if char not in string.punctuation:
            new_string += char
    return new_string

def remove_stopwords(text):
    """
    This function removes stopwords from a string
    """
    new_string = ""
    for word in text.split():
        if word not in stopwords:
            new_string += word + " "
    return new_string

def lower_case(text):
    """
    This function converts all characters to lower case
    """
    return text.lower()

def tokenize(text):
    """
    This function splits a string into a list of words
    """
    return text.split()

def lemmatize(text):
    """
    This function lemmatizes a list of words
    """
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    new_string = []
    for word in text:
        new_string.append(lemmatizer.lemmatize(word))
    return new_string

def stem(text):
    """
    This function stems a list of words
    """
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    new_string = []
    for word in text:
        new_string.append(ps.stem(word))
    return new_string

In [6]:
df['Text'] = df['Text'].apply(remove_punctuation) # remove punctuation
df['Text'] = df['Text'].apply(lower_case) # convert to lower case
df['Text'] = df['Text'].apply(remove_stopwords) # remove stopwords

In [None]:
df['Text'] = df['Text'].apply(tokenize) # tokenize
df['Text'] = df['Text'].apply(lemmatize) # lemmatize

In [None]:
df

Unnamed: 0,Text,Category
0,"[worldcom, exboss, launch, defence, lawyer, de...",business
1,"[german, business, confidence, slide, german, ...",business
2,"[bbc, poll, indicates, economic, gloom, citize...",business
3,"[lifestyle, governs, mobile, choice, faster, b...",tech
4,"[enron, boss, 168m, payout, eighteen, former, ...",business
...,...,...
1485,"[double, eviction, big, brother, model, capric...",entertainment
1486,"[dj, double, act, revamp, chart, show, dj, duo...",entertainment
1487,"[weak, dollar, hit, reuters, revenue, medium, ...",business
1488,"[apple, ipod, family, expands, market, apple, ...",tech


In [None]:
class TF_ICF:
    def __init__(self, classes, ls_text, ls_classes):
        """
        Class to calculate TF, CF, ICF and TF_ICF for a given dataset
        """
        self.classes = classes
        self.ls_text = ls_text
        self.ls_classes = ls_classes
        self.tf = {}
        self.cf = {}
        self.icf = {}
        self.tf_icf = {}
        self.build()
        
    def build(self):
        """
        Driver function to build TF, CF, ICF and TF_ICF
        """
        for class_ in self.classes:
            self.tf[class_] = {}
            self.build_tf(class_)
        self.build_cf()
        self.build_icf()
        self.build_tf_icf()
    
    def build_tf(self, class_):
        """
        Method to build TF for a given class

        Parameters
        ----------
        class_ : str
            class for which TF is to be calculated

        Performs
        --------
        1. Filters the text for a given class
        2. For each text, calculates the frequency of each word
        3. Stores the frequency in self.tf

        Returns
        -------
        None
        """
        filtered_text = [self.ls_text[i] for i in range(len(self.ls_text)) if self.ls_classes[i] == class_]
        for text in filtered_text:
            for word in text:
                if word not in self.tf[class_]:
                    self.tf[class_][word] = 0
                self.tf[class_][word] += 1

    def build_cf(self):
        """
        Method to build CF for all words
        
        Parameters
        ----------
        None

        Performs
        --------
        1. For each word, calculates the number of classes in which it occurs
        2. Stores the frequency in self.cf

        Returns
        -------
        None
        """
        for i in range(len(self.ls_text)):
            current_class = self.ls_classes[i]
            for word in self.ls_text[i]:
                if word not in self.cf:
                    self.cf[word] = set()
                self.cf[word].add(current_class)
        # replce set with length of set
        for word in self.cf:
            self.cf[word] = len(self.cf[word])
    
    def build_icf(self):
        """
        Method to build ICF for all words

        Parameters
        ----------
        None

        Performs
        --------
        1. For each word, calculates the inverse class frequency
        2. Stores the frequency in self.icf

        Returns
        -------
        None
        """
        for word in self.cf:
            self.icf[word] = np.log10(len(self.classes)/self.cf[word])

    def build_tf_icf(self):
        """
        Method to build TF_ICF for all words

        Parameters
        ----------
        None

        Performs
        --------
        1. For each word, calculates the TF_ICF
        2. Stores the frequency in self.tf_icf

        Returns
        -------
        None
        """
        for class_ in self.classes:
            self.tf_icf[class_] = {}
            for word in self.tf[class_]:
                self.tf_icf[class_][word] = self.tf[class_][word] * self.icf[word]

In [None]:
def featurize(ls_text_train, ls_classes_train, ls_text_test, ls_classes_test, tf_icf, unique_classes, feature_set):
    """
    Function to featurize the dataset using TF_ICF

    Parameters
    ----------
    ls_text_train : list
        list of training text

    ls_classes_train : list
        list of training classes

    ls_text_test : list
        list of test text

    ls_classes_test : list
        list of test classes

    tf_icf : TF_ICF
        TF_ICF object

    unique_classes : list
        list of unique classes

    feature_set : list
        list of features to be used

    Returns
    -------
    train_X : list
        list of training features for each text

    train_Y : list
        list of training classes for each text

    test_X : list
        list of test features for each text

    test_Y : list
        list of test classes for each text
    """
    train_X = []
    train_Y = []
    test_X = []
    test_Y = []
    for mega_class in unique_classes:
        # filter the dataset for the mega class
        mega_class_ls_text_train = [ls_text_train[i] for i in range(len(ls_text_train)) if ls_classes_train[i] == mega_class]
        mega_class_ls_classes_train = [ls_classes_train[i] for i in range(len(ls_classes_train)) if ls_classes_train[i] == mega_class]
        mega_class_ls_text_test = [ls_text_test[i] for i in range(len(ls_text_test)) if ls_classes_test[i] == mega_class]
        mega_class_ls_classes_test = [ls_classes_test[i] for i in range(len(ls_classes_test)) if ls_classes_test[i] == mega_class]
        # featurize the dataset
        for i in range(len(mega_class_ls_text_train)):
            doc = []
            for word in feature_set:
                if word in mega_class_ls_text_train[i]:
                    doc.append(mega_class_ls_text_train[i].count(word))
                else:
                    doc.append(0)
            train_X.append(doc)
            train_Y.append(mega_class_ls_classes_train[i])
        for i in range(len(mega_class_ls_text_test)):
            doc = []
            for word in feature_set:
                if word in mega_class_ls_text_test[i]:
                    doc.append(mega_class_ls_text_test[i].count(word))
                else:
                    doc.append(0)
            test_X.append(doc)
            test_Y.append(mega_class_ls_classes_test[i])
            
    return train_X, train_Y, test_X, test_Y

### 2. Dataset Split

In [None]:
def train_test_split(df, split=0.7):
    # Shuffling the data
    df = df.sample(frac=1).reset_index(drop=True)
    # Splitting the data into train and test
    train = df[:int(split*len(df))]
    test = df[int(split*len(df)):]
    # reset the index
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    return train, test


In [None]:
# df_train, df_test = train_test_split(df, split=0.7)

# tf_icf = TF_ICF(df_train['Category'].unique(), df_train['Text'], df_train['Category'])

# # Computes all the unique words in the dataset
# feature_set = []
# unique_classes = df_train['Category'].unique()
# for mega_class in unique_classes:
#     sub_tf_icf = tf_icf.tf_icf[mega_class]
#     for word, tficf in sub_tf_icf.items():
#         if word not in feature_set:
#             feature_set.append(word)
# feature_set = list(set(feature_set))

# train_X, train_Y, test_X, test_Y = featurize(df_train['Text'], df_train['Category'], df_test['Text'], df_test['Category'], tf_icf, unique_classes, feature_set)

### 3. Naive Bayes

In [None]:
class NaiveBayes:
    def __init__(self):
        """
        A Naive Bayes classifier with Laplace smoothing.
        """
        self.num_features = None
        self.class_labels = None
        self.prior_prob = None
        self.conditional_prob = None
        self.cumulative_class_freq_stats = None
        self.alpha = 1
    
    def train(self, train_x, train_y):
        """
        Function to train the Naive Bayes classifier

        Parameters
        ----------
        train_x : list
            list of training features for each text

        train_y : list
            list of training classes for each text
        """
        total_class_samples = len(train_x)
        self.num_features = len(train_x[0])
        self.class_labels = list(set(train_y)) 
        self.prior_prob = {} 
        self.conditional_prob = {label:{} for label in self.class_labels}
        self.cumulative_class_freq_stats = {label:{feat:0 for feat in range(self.num_features)} for label in self.class_labels} 
        class_wise_count = dict(Counter(train_y)) 
        for label in self.class_labels: 
            self.prior_prob[label] = float(class_wise_count[label]) / float(total_class_samples)
        for i in range(total_class_samples): 
            sample_label = train_y[i]
            for j in range(self.num_features):
                self.cumulative_class_freq_stats[sample_label][j] += train_x[i][j]
        
        for label in self.class_labels:
            for feature in range(self.num_features):
                self.conditional_prob[label][feature] = float(self.cumulative_class_freq_stats[label][feature] + self.alpha ) / float(sum(self.cumulative_class_freq_stats[label].values()) + (self.num_features*self.alpha)) #conditional probab of a feature(word) wrt a class is the ratio of number of occurences of that word in that particular class divided by the sum of frequencies of all features(words) wrt that class.
        
        return self.prior_prob, self.conditional_prob

    def predict(self, test_x):
        predictions = []
        for sample in test_x: 
            posterior_probs = {} 
            for label in self.class_labels: 
                probab = math.log10(self.prior_prob[label])  
                for feature in range(len(sample)): 
                    if(sample[feature] != 0): 
                        probab += (math.log10(self.conditional_prob[label][feature]) * sample[feature]) 
                posterior_probs[label] = probab
            pred_label = max(posterior_probs, key=posterior_probs.get) 
            predictions.append(pred_label)
        return predictions

def calculate_metrics(true_y, pred_y):
    mapping = {0: 'Business', 1: 'Entertainment', 2: 'Politics', 3: 'Sport', 4: 'Tech'}
    inverse_mapping = {'Business': 0, 'Entertainment': 1, 'Politics': 2, 'Sport': 3, 'Tech': 4}
    # lower keys of mapping and inverse_mapping
    mapping = {k: v.lower() for k, v in mapping.items()}
    inverse_mapping = {k.lower(): v for k, v in inverse_mapping.items()}
    
    # Convert string labels to numeric labels
    true_y = [inverse_mapping[y] for y in true_y]
    pred_y = [inverse_mapping[y] for y in pred_y]
    
    # Calculate confusion matrix
    num_classes = len(mapping)
    confusion_matrix = [[0 for _ in range(num_classes)] for _ in range(num_classes)]
    for true_label, pred_label in zip(true_y, pred_y):
        confusion_matrix[true_label][pred_label] += 1
    
    # Calculate accuracy
    correct = sum([confusion_matrix[i][i] for i in range(num_classes)])
    total = sum([sum(confusion_matrix[i]) for i in range(num_classes)])
    accuracy = correct / total
    
    # Calculate precision, recall, and f1-score for each class
    metrics = {}
    for i in range(num_classes):
        tp = confusion_matrix[i][i]
        fp = sum([confusion_matrix[j][i] for j in range(num_classes) if j != i])
        fn = sum([confusion_matrix[i][j] for j in range(num_classes) if j != i])
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        metrics[mapping[i]] = {'precision': precision, 'recall': recall, 'f1-score': f1_score}
    
    return accuracy, metrics
        

In [None]:
nb = NaiveBayes()
nb.train(train_X, train_Y)
pred_y = nb.predict(test_X)
accuracy, metrics = calculate_metrics(test_Y, pred_y)

with open(f"Saves/results_main.txt", "a") as f:
    f.write("Accuracy: " + str(accuracy) + "\n")
    for label in metrics:
        f.write(label + "\n")
        for metric in metrics[label]:
            f.write(metric + " " + str(metrics[label][metric]) + "\n")
        f.write("\n")

print("Alpha: ", nb.alpha)
print("Accuracy: ", accuracy)
for label in metrics:
    print(label)
    for metric in metrics[label]:
        print(metric, metrics[label][metric])
    print()

nb = NaiveBayes()
nb.alpha = 0.1
nb.train(train_X, train_Y)
pred_y = nb.predict(test_X)
accuracy, metrics = calculate_metrics(test_Y, pred_y)

with open(f"Saves/results_main_with_alpha_1e-1.txt", "a") as f:
    f.write("Accuracy: " + str(accuracy) + "\n")
    for label in metrics:
        f.write(label + "\n")
        for metric in metrics[label]:
            f.write(metric + " " + str(metrics[label][metric]) + "\n")
        f.write("\n")

print("Alpha: ", nb.alpha)
print("Accuracy: ", accuracy)
for label in metrics:
    print(label)
    for metric in metrics[label]:
        print(metric, metrics[label][metric])
    print()

NameError: name 'train_X' is not defined

### 5. Improving the Classifier

##### Different preprocessing techniques and parameters to improve the performance of the classifier

A - Stemming instead of lemmatization

B - Not removing stopwords

C - 60-40 split instead of 70-30

D - 80-20 split instead of 70-30

E - 90-10 split instead of 70-30

F - 50-50 split instead of 70-30

A)

In [None]:
df = pd.read_csv(r'Data\BBC News Train.csv')
df.drop('ArticleId', axis=1, inplace=True)
df['Text'] = df['Text'].apply(remove_punctuation)
df['Text'] = df['Text'].apply(lower_case)
df['Text'] = df['Text'].apply(remove_stopwords)
df['Text'] = df['Text'].apply(tokenize)
df['Text'] = df['Text'].apply(stem)

df_train, df_test = train_test_split(df, split=0.7)

unique_classes = df_train['Category'].unique()

tf_icf = TF_ICF(unique_classes, df_train['Text'], df_train['Category'])

feature_set = []
unique_classes = df_train['Category'].unique()
for mega_class in unique_classes:
    sub_tf_icf = tf_icf.tf_icf[mega_class]
    for word, tficf in sub_tf_icf.items():
        if word not in feature_set:
            feature_set.append(word)
feature_set = list(set(feature_set))

train_X, train_Y, test_X, test_Y = featurize(df_train['Text'], df_train['Category'], df_test['Text'], df_test['Category'], tf_icf, unique_classes, feature_set)

nb = NaiveBayes()
nb.train(train_X, train_Y)
pred_y = nb.predict(test_X)
accuracy, metrics = calculate_metrics(test_Y, pred_y)

with open(f"Saves/results_stemming.txt", "a") as f:
    f.write("Accuracy: " + str(accuracy) + "\n")
    for label in metrics:
        f.write(label + "\n")
        for metric in metrics[label]:
            f.write(metric + " " + str(metrics[label][metric]) + "\n")
        f.write("\n")
        
print("Accuracy: ", accuracy)
for label in metrics:
    print(label)
    for metric in metrics[label]:
        print(metric, metrics[label][metric])
    print()

Accuracy:  0.9664429530201343
business
precision 0.9680851063829787
recall 0.9578947368421052
f1-score 0.962962962962963

entertainment
precision 0.9883720930232558
recall 0.9659090909090909
f1-score 0.9770114942528736

politics
precision 0.9418604651162791
recall 0.9529411764705882
f1-score 0.9473684210526314

sport
precision 1.0
recall 0.9801980198019802
f1-score 0.99

tech
precision 0.926829268292683
recall 0.9743589743589743
f1-score 0.9500000000000001



B)

In [None]:
df = pd.read_csv(r'Data\BBC News Train.csv')
df.drop('ArticleId', axis=1, inplace=True)
df['Text'] = df['Text'].apply(remove_punctuation)
df['Text'] = df['Text'].apply(lower_case)
df['Text'] = df['Text'].apply(tokenize)
df['Text'] = df['Text'].apply(lemmatize)

df_train, df_test = train_test_split(df, split=0.7)

feature_set = []
unique_classes = df_train['Category'].unique()
for mega_class in unique_classes:
    sub_tf_icf = tf_icf.tf_icf[mega_class]
    for word, tficf in sub_tf_icf.items():
        if word not in feature_set:
            feature_set.append(word)
feature_set = list(set(feature_set))

train_X, train_Y, test_X, test_Y = featurize(df_train['Text'], df_train['Category'], df_test['Text'], df_test['Category'], tf_icf, unique_classes, feature_set)

nb = NaiveBayes()
nb.train(train_X, train_Y)
pred_y = nb.predict(test_X)
accuracy, metrics = calculate_metrics(test_Y, pred_y)

with open(f"Saves/results_no_stopwords_removed.txt", "a") as f:
    f.write("Accuracy: " + str(accuracy) + "\n")
    for label in metrics:
        f.write(label + "\n")
        for metric in metrics[label]:
            f.write(metric + " " + str(metrics[label][metric]) + "\n")
        f.write("\n")
        
print("Accuracy: ", accuracy)
for label in metrics:
    print(label)
    for metric in metrics[label]:
        print(metric, metrics[label][metric])
    print()

C, D, E)

In [None]:
df = pd.read_csv(r'Data\BBC News Train.csv')
df.drop('ArticleId', axis=1, inplace=True)
df['Text'] = df['Text'].apply(remove_punctuation)
df['Text'] = df['Text'].apply(lower_case)
df['Text'] = df['Text'].apply(remove_stopwords)
df['Text'] = df['Text'].apply(tokenize)
df['Text'] = df['Text'].apply(lemmatize)

splits = [0.7, 0.8, 0.9, 0.5]

for split in splits:
    print("Split: ", split)
    df_train, df_test = train_test_split(df, split=split)

    feature_set = []
    unique_classes = df_train['Category'].unique()
    for mega_class in unique_classes:
        sub_tf_icf = tf_icf.tf_icf[mega_class]
        for word, tficf in sub_tf_icf.items():
            if word not in feature_set:
                feature_set.append(word)
    feature_set = list(set(feature_set))

    train_X, train_Y, test_X, test_Y = featurize(df_train['Text'], df_train['Category'], df_test['Text'], df_test['Category'], tf_icf, unique_classes, feature_set)

    nb = NaiveBayes()
    nb.train(train_X, train_Y)
    pred_y = nb.predict(test_X)
    accuracy, metrics = calculate_metrics(test_Y, pred_y)
    # Write to file
    with open(f"Saves/results_{split}.txt", "a") as f:
        f.write("Accuracy: " + str(accuracy) + "\n")
        for label in metrics:
            f.write(label + "\n")
            for metric in metrics[label]:
                f.write(metric + " " + str(metrics[label][metric]) + "\n")
            f.write("\n")
    print("Accuracy: ", accuracy)
    for label in metrics:
        print(label)
        for metric in metrics[label]:
            print(metric, metrics[label][metric])
        print()

##### Different types of features such as n-grams or TF-IDF weights.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

df = pd.read_csv(r'Data\BBC News Train.csv')
df.drop('ArticleId', axis=1, inplace=True)
df['Text'] = df['Text'].apply(remove_punctuation)
df['Text'] = df['Text'].apply(lower_case)
df['Text'] = df['Text'].apply(remove_stopwords)
df['Text'] = df['Text'].apply(tokenize)
df['Text'] = df['Text'].apply(lemmatize)
df['Text'] = df['Text'].apply(lambda x: ' '.join(x))

df_train, df_test = train_test_split(df, split=0.7)

# n_gram_range=(1, 1) for unigram
print("n_gram_range=(1, 1) for unigram")
vectorizer.fit(df_train['Text'])

train_X = vectorizer.transform(df_train['Text']).todense().tolist()
train_Y = df_train['Category']

test_X = vectorizer.transform(df_test['Text']).todense().tolist()
test_Y = df_test['Category']

nb = NaiveBayes()
nb.train(train_X, train_Y)
pred_y = nb.predict(test_X)
accuracy, metrics = calculate_metrics(test_Y, pred_y)

with open(f"Saves/results_unigram.txt", "a") as f:
    f.write("Accuracy: " + str(accuracy) + "\n")
    for label in metrics:
        f.write(label + "\n")
        for metric in metrics[label]:
            f.write(metric + " " + str(metrics[label][metric]) + "\n")
        f.write("\n")

print("Accuracy: ", accuracy)
for label in metrics:
    print(label)
    for metric in metrics[label]:
        print(metric, metrics[label][metric])
    print()

# n_gram_range=(1, 2) for unigrams and bigrams
print("n_gram_range=(1, 2) for unigrams and bigrams")
vectorizer.fit(df_train['Text'])

train_X = vectorizer.transform(df_train['Text']).todense().tolist()
train_Y = df_train['Category']

test_X = vectorizer.transform(df_test['Text']).todense().tolist()
test_Y = df_test['Category']

nb = NaiveBayes()
nb.train(train_X, train_Y)
pred_y = nb.predict(test_X)
accuracy, metrics = calculate_metrics(test_Y, pred_y)

with open(f"Saves/results_unigram_and_bigram.txt", "a") as f:
    f.write("Accuracy: " + str(accuracy) + "\n")
    for label in metrics:
        f.write(label + "\n")
        for metric in metrics[label]:
            f.write(metric + " " + str(metrics[label][metric]) + "\n")
        f.write("\n")

print("Accuracy: ", accuracy)
for label in metrics:
    print(label)
    for metric in metrics[label]:
        print(metric, metrics[label][metric])
    print()

# n_gram_range=(2, 2) for bigrams
print("n_gram_range=(2, 2) for bigrams")
vectorizer.fit(df_train['Text'])

train_X = vectorizer.transform(df_train['Text']).todense().tolist()
train_Y = df_train['Category']

test_X = vectorizer.transform(df_test['Text']).todense().tolist()
test_Y = df_test['Category']

nb = NaiveBayes()
nb.train(train_X, train_Y)
pred_y = nb.predict(test_X)
accuracy, metrics = calculate_metrics(test_Y, pred_y)

with open(f"Saves/results_bigram.txt", "a") as f:
    f.write("Accuracy: " + str(accuracy) + "\n")
    for label in metrics:
        f.write(label + "\n")
        for metric in metrics[label]:
            f.write(metric + " " + str(metrics[label][metric]) + "\n")
        f.write("\n")

print("Accuracy: ", accuracy)
for label in metrics:
    print(label)
    for metric in metrics[label]:
        print(metric, metrics[label][metric])
    print()

n_gram_range=(1, 1) for unigram
Accuracy:  0.9798657718120806
business
precision 0.9787234042553191
recall 0.9583333333333334
f1-score 0.968421052631579

entertainment
precision 1.0
recall 0.967391304347826
f1-score 0.9834254143646408

politics
precision 0.9310344827586207
recall 0.9878048780487805
f1-score 0.9585798816568047

sport
precision 1.0
recall 0.9903846153846154
f1-score 0.9951690821256038

tech
precision 0.9864864864864865
recall 1.0
f1-score 0.9931972789115647

n_gram_range=(1, 2) for unigrams and bigrams
Accuracy:  0.9798657718120806
business
precision 0.9787234042553191
recall 0.9583333333333334
f1-score 0.968421052631579

entertainment
precision 1.0
recall 0.967391304347826
f1-score 0.9834254143646408

politics
precision 0.9310344827586207
recall 0.9878048780487805
f1-score 0.9585798816568047

sport
precision 1.0
recall 0.9903846153846154
f1-score 0.9951690821256038

tech
precision 0.9864864864864865
recall 1.0
f1-score 0.9931972789115647

n_gram_range=(2, 2) for bigrams

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

df = pd.read_csv(r'Data\BBC News Train.csv')
df.drop('ArticleId', axis=1, inplace=True)
df['Text'] = df['Text'].apply(remove_punctuation)
df['Text'] = df['Text'].apply(lower_case)
df['Text'] = df['Text'].apply(remove_stopwords)
df['Text'] = df['Text'].apply(tokenize)
df['Text'] = df['Text'].apply(lemmatize)
df['Text'] = df['Text'].apply(lambda x: ' '.join(x))

df_train, df_test = train_test_split(df, split=0.7)

# tfidf
vectorizer.fit(df_train['Text'])

train_X = vectorizer.transform(df_train['Text']).todense().tolist()
train_Y = df_train['Category']

test_X = vectorizer.transform(df_test['Text']).todense().tolist()
test_Y = df_test['Category']

nb = NaiveBayes()
nb.train(train_X, train_Y)
pred_y = nb.predict(test_X)
accuracy, metrics = calculate_metrics(test_Y, pred_y)
with open(f"Saves/results_tfidf.txt", "a") as f:
    f.write("Accuracy: " + str(accuracy) + "\n")
    for label in metrics:
        f.write(label + "\n")
        for metric in metrics[label]:
            f.write(metric + " " + str(metrics[label][metric]) + "\n")
        f.write("\n")

print("Accuracy: ", accuracy)
for label in metrics:
    print(label)
    for metric in metrics[label]:
        print(metric, metrics[label][metric])
    print()

Accuracy:  0.9619686800894854
business
precision 0.94
recall 0.9894736842105263
f1-score 0.964102564102564

entertainment
precision 1.0
recall 0.8876404494382022
f1-score 0.9404761904761905

politics
precision 0.9310344827586207
recall 0.9642857142857143
f1-score 0.9473684210526316

sport
precision 0.9615384615384616
recall 1.0
f1-score 0.9803921568627451

tech
precision 0.987012987012987
recall 0.9620253164556962
f1-score 0.9743589743589742

