# Imports

In [90]:
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords , wordnet
import string
from collections import Counter
import numpy as np
import pandas as pd
import math as m
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Data Cleansing

In [2]:
# Getting folder names for classes in dataset

Y = []                            

for folders in os.listdir('./20_newsgroups/'):
    Y.append(folders)
    
y_classes = [i for i in range(len(Y))]
y_classes

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [3]:
# dictionary_y

dict_Y = {}
for i in range(len(Y)):
    dict_Y[Y[i]] = i
    
dict_Y

{'talk.politics.guns': 0,
 'rec.sport.baseball': 1,
 'talk.politics.misc': 2,
 'comp.graphics': 3,
 'sci.electronics': 4,
 'soc.religion.christian': 5,
 'comp.sys.ibm.pc.hardware': 6,
 'rec.motorcycles': 7,
 'comp.sys.mac.hardware': 8,
 'alt.atheism': 9,
 'sci.crypt': 10,
 'misc.forsale': 11,
 'sci.med': 12,
 'sci.space': 13,
 'rec.sport.hockey': 14,
 'talk.religion.misc': 15,
 'comp.os.ms-windows.misc': 16,
 'comp.windows.x': 17,
 'rec.autos': 18,
 'talk.politics.mideast': 19}

In [4]:
# stopwords

punctuations = list(string.punctuation)
stop = stopwords.words('english')
stop += punctuations

In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
def get_pos_tag(word):
    tag = pos_tag([word])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADJ
    else:
        return wordnet.NOUN
    

# Features

### X_train & Y_train


In [7]:
'''
    Warning : This cell will take time to run. So please wait.
'''

documents = []

for dir_path , dir_name, file_name in os.walk('./20_newsgroups/'):
    if len(file_name) == 0:
        Y_labels = dir_name
    else:
        temp = dir_path[16:]
        #wordlist = []
        for files in file_name:
            lines = ''
            with open(dir_path+'/'+files, 'r' ,encoding = "ISO-8859-1") as f:
                for line in f:
                    for word in word_tokenize(line):
                        if word.lower() not in stop and word.isalpha():
                            word = lemmatizer.lemmatize(word, pos = get_pos_tag(word))
                            lines += word
                            lines += ' '
                            #wordlist.append(word)
            documents.append((lines, dict_Y[temp]))                     

In [8]:
np.random.shuffle(documents)

In [9]:
Y_train = [ y for doc, y in documents]
Y_train = np.array(Y_train)

In [10]:
docs = [ doc for doc, y in documents]


### X_test & Y_test



- NOTE :  I have used '20_newsgroups' as training dataset and 'mini_newsgroups' as testing dataset (As instructed by TA)

In [11]:
'''
    Warning : This cell will take time to run. So please wait.
'''

X_test  = []

for dir_path , dir_name, file_name in os.walk('./mini_newsgroups/'):
    if len(file_name) == 0:
        Y_test_labels = dir_name
    else:
        temp = dir_path[18:]
        #wordlist = []
        for files in file_name:
            lines_test = ''
            with open(dir_path+'/'+files, 'r' ,encoding = "ISO-8859-1") as f:
                for line in f:
                    for word in word_tokenize(line):
                        if word.lower() not in stop and word.isalpha():
                            word = lemmatizer.lemmatize(word, pos = get_pos_tag(word))
                            lines_test +=  word
                            lines_test += ' '
                            #wordlist.append(word)
            X_test.append((lines_test, dict_Y[temp]))  
                            

In [12]:
np.random.shuffle(X_test)

In [13]:
Y_test = [ y for doc, y in X_test]

In [14]:
docs_test = [ doc for doc ,y in X_test]

# Count Vectorization

In [97]:
# No. of features 

k = 3000                  

In [35]:
count_vec = TfidfVectorizer(max_features = k , max_df=0.8)
X_train = count_vec.fit_transform(docs)
X_train = X_train.toarray()

In [18]:
X_test = count_vec.transform(docs_test)
X_test = X_test.toarray()

In [19]:
features = count_vec.get_feature_names()

### Data Visuliazation

In [37]:

df = pd.DataFrame(X_train)
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074351,0.0,...,0.0,0.0,0.0,0.0,0.184983,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
count,19997.0,19997.0,19997.0,19997.0,19997.0,19997.0,19997.0,19997.0,19997.0,19997.0,...,19997.0,19997.0,19997.0,19997.0,19997.0,19997.0,19997.0,19997.0,19997.0,19997.0
mean,0.001157,0.001025,0.000747,0.001492,0.004868,0.002379,0.00131,0.001447,0.001073,0.000883,...,0.000807,0.004967,0.001392,0.004603,0.002874,0.002584,0.000957,0.000926,0.00105,0.000432
std,0.017938,0.015184,0.016201,0.01256,0.020664,0.030185,0.017668,0.012104,0.01231,0.015382,...,0.017983,0.021691,0.01341,0.019341,0.020482,0.020568,0.015526,0.010732,0.015403,0.006368
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.872059,0.547026,0.84492,0.391783,0.345596,0.712626,0.737732,0.24425,0.491812,0.756248,...,0.718446,0.749224,0.347986,0.274223,0.422118,0.625549,0.600091,0.273232,0.616453,0.306615


# MultinomialNB Implementation
    - Inbuilt Multinomial Naive Bayes implementation from SKlearn

In [38]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()            # Multinomial Naive Bayes classifier


In [39]:
clf.fit(X_train, Y_train)    # fitting the training data
ypred = clf.predict(X_test)  # predicting the test data

In [40]:
ypred                          # predicted result

array([13,  7, 18, ...,  5, 17,  9])

### Accuracy Score, Classification Report and Confusion Matrix

In [95]:
print('Accuracy Score :',accuracy_score(Y_test, ypred))

Accuracy Score : 0.841


In [96]:
print('Classification Report :-\n\n', classification_report(Y_test, ypred))
print('Confusion Matrix :-\n\n', confusion_matrix(Y_test, ypred))

Classification Report :-

               precision    recall  f1-score   support

           0       0.71      0.88      0.79       100
           1       0.97      0.94      0.95       100
           2       0.75      0.62      0.68       100
           3       0.72      0.83      0.77       100
           4       0.84      0.81      0.83       100
           5       0.88      0.98      0.93       100
           6       0.75      0.76      0.76       100
           7       0.92      0.90      0.91       100
           8       0.84      0.78      0.81       100
           9       0.73      0.86      0.79       100
          10       0.97      0.92      0.94       100
          11       0.84      0.88      0.86       100
          12       0.97      0.91      0.94       100
          13       0.92      0.93      0.93       100
          14       0.94      0.96      0.95       100
          15       0.73      0.43      0.54       100
          16       0.72      0.79      0.75       100


# MultinomialNB Implementation ( Self )
- Self Multinomial Naive Bayes Implenetation

In [74]:
'''
    - MultinomialNaiveBayes Class
'''

class MultinomialNaiveBayes:
    # __init__ function 
    def __init__(self):
        pass
    
    # fit function :- takes two arguments : Xtrain dataset and ytrain dataset and fits the data and returns the 
    # dictinary 'count' that contain probability of each word in each document that are part of vocublary.
    def fit(self , xtrain , ytrain):
        self.count = {}
        classes = set(ytrain)                    # unique classes in Ytrain 
        for c in classes:           
            self.count[c] = {}                       # adding every unique class in count dictionary as key
            self.count['total_data'] = len(ytrain)       # adding total_data key and length of ytrain as its value
            current_class_row = (ytrain == c)            
            xtrain_current = xtrain[current_class_row]   # only those rows of xtrain that has class c
            ytrain_current = ytrain[current_class_row]   # only those points of ytrain that has value c
            self.count[c]['total_count'] = (current_class_row).sum()  # total_count = total no. of class c in Ytrain

            for j in range(len(features)):   # instead of len(features) do xtrain.shape[1]
                 self.count[c][j] = xtrain_current[:,j].sum()             
            
            self.count[c]['total_feature_count'] = np.sum(xtrain_current)
    
    # _probability fn :
    #                 - takes 2 argument : x = testing data point (row of testing dataset)
    #                                    : cur_class = current class ( From Y_train )
    #                 - Returns output i.e. sum of log probability of all the words that testing data point contains.
    def _probability(self, x, cur_class):
        output = np.log(self.count[cur_class]['total_count']) -  np.log(self.count['total_data'])
        num_features = len(self.count[cur_class].keys())-2
        for j in range(num_features):
            if(x[j] == 0):
                cur_xj_prob = 0
            else:
                count_cur_class_with_value_xj = self.count[cur_class][j] +1
                count_cur_class = self.count[cur_class]['total_feature_count'] + len(self.count[cur_class].keys()) -2
                cur_xj_prob = np.log(count_cur_class_with_value_xj )- np.log(count_cur_class)
            output += cur_xj_prob

        return output  
    
    # _predictSinglePoint fn : takes only one argument i.e.  x = testing data point (row of testing dataset)
    #                       Returns the best class (best_c) based on the probability it gets from _probability(x, cur_class)
    def _predictSinglePoint(self, x):
        classes = self.count.keys()
        best_p = -10**11
        best_c = -1
        for cur_class in classes:
            if cur_class == 'total_data':
                continue
            p = self._probability(x, cur_class)

            if p > best_p:
                best_p = p
                best_c = cur_class
        return best_c
    
    # predict fn : takes 1 argument i.e. whole Xtest dataset
    #              returns list of prediction (y_pred)
    def predict(self, x_test):
        self.y_pred = []
        for x in x_test:
            x_class = self._predictSinglePoint(x)
            self.y_pred.append(x_class)
        return self.y_pred


    
        

In [81]:
clf_self = MultinomialNaiveBayes()

In [82]:
clf_self.fit(X_train, Y_train)

In [83]:
ypred_cls = clf_self.predict(X_test)

### Accuracy Score , Classification Report and Confusion Matrix

In [94]:
print('Accuracy Score :',accuracy_score(Y_test, ypred_cls))

Accuracy Score : 0.8415


In [93]:
print('Classification Report :-\n\n', classification_report(Y_test, ypred_cls))
print('Confusion Matrix :-\n\n', confusion_matrix(Y_test, ypred_cls))

Classification Report :-

               precision    recall  f1-score   support

           0       0.71      0.87      0.78       100
           1       0.95      0.94      0.94       100
           2       0.72      0.62      0.67       100
           3       0.66      0.81      0.73       100
           4       0.80      0.83      0.81       100
           5       0.93      0.98      0.96       100
           6       0.83      0.75      0.79       100
           7       0.88      0.90      0.89       100
           8       0.85      0.80      0.82       100
           9       0.72      0.87      0.79       100
          10       0.96      0.91      0.93       100
          11       0.88      0.88      0.88       100
          12       0.96      0.91      0.93       100
          13       0.94      0.92      0.93       100
          14       0.98      0.94      0.96       100
          15       0.68      0.43      0.53       100
          16       0.78      0.83      0.81       100


## Comparison my implementation with Sklearn

It seems like we are getting same accuracy score as of sklearn's implementation of MultinomialNB i.e. 84.1%