In [125]:
from sklearn import datasets
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
import math 
import pandas as pd
import random
import string
import nltk
import matplotlib.pyplot as plt

In [None]:
##Loading the dataset 

In [91]:
news=datasets.fetch_20newsgroups()   
x=news.data
y=news.target

In [5]:
  #doing spliting for train and test data    

In [103]:
newsgroup_train_data,newsgroup_test_data,newsgroup_train_target,newsgroup_test_target=model_selection.train_test_split(x,y,test_size=0.25,random_state=0)


In [None]:
## Getting the both training and testing documents converted into tuple form in which first element consists of data and
## second element consists it's class to which it belongs.After that we are shuflling the training documents.

In [104]:
traindocuments=[]
testdocuments=[]
traindocuments=[(newsgroup_train_data[i],newsgroup_train_target[i]) for i in range(0,len(newsgroup_train_target))]
testdocuments=[(newsgroup_test_data[i],newsgroup_test_target[i]) for i in range(0,len(newsgroup_test_target))]
random.shuffle(documents)

In [None]:
## Splitting the training and testing documents into word.

In [105]:
train_documents=[(word_tokenize(document),category) for document,category in traindocuments]
test_documents=[(word_tokenize(document),category) for document,category in testdocuments]

In [106]:
lemmatizer=WordNetLemmatizer()

In [107]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [108]:
stops=set(stopwords.words('english'))  ## Creating List of words that we don't want in training as well as testing documents.
punctuations=list(string.punctuation)  ## Also we also need to remove various punctuations from our data.
stops.update(punctuations)

In [109]:
def clean_text(words):    ## Cleaning the documents.
    output_words=[]
    
    for w in words:
        if w.lower() not in stops:
            pos=pos_tag([w])
            clean_word=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [None]:
## Here we have our cleaned training as well as testing documents.

In [110]:
train_documents=[(clean_text(document),category) for document,category in train_documents]
test_documents=[(clean_text(document),category) for document,category in test_documents]

In [None]:
## Here we are getting our features on which we will work.

In [111]:
vectorizer = CountVectorizer(max_features=3000)
t_train=[" ".join(document) for document,category in train_documents]
t_test=[" ".join(document) for document,category in test_documents]
t_train=vectorizer.fit_transform(t_train)
features=vectorizer.get_feature_names()

In [None]:
## Here we are calculating the probability of data point for a particular class

In [112]:
def probability(dictionary,x,current_class,features,distinct_words):
    output=(dictionary[current_class]["total_count"])/(dictionary["total_data"])
    output=math.log(output)
    num_features=len(dictionary[current_class].keys())-2
    
    for j in range(2,num_features+2):
        if(x[j-2]==0):
            continue
        word_jcnt=dictionary[current_class][features[j-2]]+1
        total_word_cnt=dictionary[current_class]["total_word_count"]+distinct_words
        prob_wordj=(word_jcnt)/(total_word_cnt)
        prob_wordj=math.log(prob_wordj)
        output=(output)+(prob_wordj)
    return output

In [None]:
## This function return the best_class i.e the class which has highest probability to contain a data point.

In [113]:
def predictSinglePoint(dictionary,x,features,distinct_words):
    classes=dictionary.keys()
    first_run=True
    best_p=-10000000
    best_class=-1
    
    for current_class in classes:
        if(current_class=="total_data"):
            continue
        p_current_class=probability(dictionary,x,current_class,features,distinct_words)
        
        if(p_current_class > best_p or first_run):
            best_p=p_current_class
            best_class=current_class
            first_run=False
            
    return best_class

In [None]:
## We are storing the predictions and returing it.

In [114]:
def predict(dictionary,x_test,features,distinct_words):
    y_pred=[]
    for i in range(0,x_test.shape[0],1):
        x_class=predictSinglePoint(dictionary,x_test[i,:],features,distinct_words)
        y_pred.append(x_class)
    return y_pred

In [None]:
## Here we are training the algorithm

In [115]:
def fit(x_train,y_train,features):
    result={} 
    class_values=set(y_train)
    
    for current_class in class_values:
        result["total_data"]=len(y_train) 
        result[current_class]={}
        current_class_rows=(y_train==current_class)
        x_train_current=x_train[current_class_rows]
        y_train_current=y_train[current_class_rows]
        num_features=x_train.shape[1]
        total_word_cnt=0
        result[current_class]["total_count"]=len(y_train_current)
        result[current_class]["total_word_count"]=total_word_cnt
        
        for j in range(2,num_features+2):
            result[current_class][features[j-2]]=x_train_current[:,j-2].sum()
            total_word_cnt+=x_train_current[:,j-2].sum()
          ##  if(x_train_current[:,j-2].sum()==0):
          ##      continue
           ## total_word_cnt+=1 ## Adding +1 for getting the count of the words for lapalace correction
                                                                                          
            
        result[current_class]["total_word_count"]=total_word_cnt 
                                                                   
    return result

In [None]:
## Here we are getting our dataset ready of the form that is required for the Multinomial Naive Bayes

In [116]:
check={}
index={}
distinct_words=0
visited={}
x_train=[[0 for i in range(len(features))] for j in range(len(newsgroup_train_target))]
x_test=[[0 for i in range(len(features))] for j in range(len(newsgroup_test_target))]
ind=0
for word in features:
    check[word]=1
    index[word]=ind
    ind+=1
    
row=0
for document,category in train_documents:
    col=0
    for word in document:
        if word in check:
            x_train[row][index[word]]+=1
            if word not in visited:
                distinct_words+=1
                visited[word]=1;
    row+=1 

row=0
for document,category in test_documents:
    col=0
    for word in document:
        if word in check:
            x_test[row][index[word]]+=1
    row+=1
x_train=np.array(x_train)
x_test=np.array(x_test)
y_train=newsgroup_train_target
y_test=newsgroup_test_target
y_train=np.array(y_train)
y_test=np.array(y_test)

In [None]:
## Finally! we have our predictions.

In [117]:
dictionary=fit(x_train,y_train,features)
y_pred=predict(dictionary,x_test,features,distinct_words)

In [None]:
## Now for the subsequent code we are predicting the dataset with the Sklearn's Multinomial Naive Bayes and
## then we are comparing with our own implemented Naive Bayes through classification reports

In [118]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [119]:
clf=MultinomialNB()

In [120]:
clf.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [121]:
y_pred_classifier=clf.predict(x_test)

In [None]:
## Printing the classification_report of Our Own Implemented Naive Bayes

In [122]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.82      0.80       112
           1       0.52      0.80      0.63       147
           2       0.92      0.16      0.28       140
           3       0.57      0.72      0.64       148
           4       0.66      0.81      0.72       149
           5       0.83      0.74      0.78       159
           6       0.67      0.85      0.75       131
           7       0.81      0.79      0.80       158
           8       0.81      0.91      0.86       162
           9       0.90      0.93      0.91       148
          10       0.96      0.88      0.92       150
          11       0.98      0.89      0.93       155
          12       0.81      0.71      0.76       147
          13       0.91      0.89      0.90       131
          14       0.92      0.84      0.88       154
          15       0.88      0.83      0.85       155
          16       0.87      0.92      0.90       144
          17       0.92    

In [None]:
## Printing the classification report of Sklearn's Naive Bayes

In [123]:
print(classification_report(y_test,y_pred_classifier))

              precision    recall  f1-score   support

           0       0.80      0.81      0.81       112
           1       0.55      0.82      0.66       147
           2       0.87      0.24      0.37       140
           3       0.58      0.76      0.66       148
           4       0.70      0.81      0.75       149
           5       0.79      0.77      0.78       159
           6       0.73      0.83      0.78       131
           7       0.83      0.79      0.81       158
           8       0.82      0.90      0.86       162
           9       0.90      0.93      0.91       148
          10       0.97      0.94      0.95       150
          11       0.99      0.88      0.93       155
          12       0.79      0.73      0.76       147
          13       0.93      0.89      0.91       131
          14       0.92      0.86      0.89       154
          15       0.89      0.85      0.86       155
          16       0.83      0.89      0.86       144
          17       0.92    