In [120]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
import math 
import pandas as pd
import random
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
fetch_20newsgroups     ##Analysing the dataset

<function sklearn.datasets.twenty_newsgroups.fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True)>

In [None]:
##Loading the dataset and removing 'headers','footers' and 'quotes' from it

In [3]:
categories = ['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
newsgroups_train = fetch_20newsgroups(subset='train',remove=('headers', 'footers', 'quotes'),categories=categories)
newsgroups_test =  fetch_20newsgroups(subset='test',remove=('headers', 'footers', 'quotes'),categories=categories)

In [4]:
newsgroup_train_data=(newsgroups_train.data)      ## Extracting the training input data 
newsgroup_train_target=(newsgroups_train.target)  ## Extracting the tranaing target

In [5]:
newsgroup_test_data=(newsgroups_test.data)        ## Extracting the testing input data
newsgroup_test_target=(newsgroups_test.target)    ## Extracting the testing target

In [None]:
## Getting the both training and testing documents converted into tuple form in which first element consists of data and
## second element consists it's class to which it belongs.After that we are shuflling the training documents.

In [6]:
documents=[]
testdocuments=[]
documents=[(newsgroup_train_data[i],newsgroup_train_target[i]) for i in range(0,len(newsgroup_train_target))]
testdocuments=[(newsgroup_test_data[i],newsgroup_test_target[i]) for i in range(0,len(newsgroup_test_target))]
random.shuffle(documents)

In [None]:
## Splitting the training and testing documents into word.

In [7]:
train_documents=[(word_tokenize(document),category) for document,category in documents]
test_documents=[(word_tokenize(document),category) for document,category in testdocuments]

In [8]:
lemmatizer=WordNetLemmatizer()

In [9]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
stops=set(stopwords.words('english'))  ## Creating List of words that we don't want in training as well as testing documents.
punctuations=list(string.punctuation)  ## Also we also need to remove various punctuations from our data.
stops.update(punctuations)

In [12]:
def clean_text(words):    ## Cleaning the documents.
    output_words=[]
    
    for w in words:
        if w.lower() not in stops:
            pos=pos_tag([w])
            clean_word=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [None]:
## Here we have our cleaned training as well as testing documents.

In [13]:
train_documents=[(clean_text(document),category) for document,category in train_documents]
test_documents=[(clean_text(document),category) for document,category in test_documents]

In [None]:
## Here we are getting our features on which we will work.

In [173]:
vectorizer = CountVectorizer(max_features=3000)
t_train=[" ".join(document) for document,category in train_documents]
t_test=[" ".join(document) for document,category in test_documents]
features=vectorizer.get_feature_names()

In [None]:
## Here we are calculating the probability of data point for a particular class

In [174]:
def probability(dictionary,x,current_class,features):
    output=(dictionary[current_class]["total_count"])/(dictionary["total_data"])
    output=np.log(output)
    num_features=len(dictionary[current_class].keys())-2
    
    for j in range(2,num_features+2):
        if(x[j-2]==0):
            continue
        word_jcnt=dictionary[current_class][features[j-2]]+1
        total_word_cnt=dictionary[current_class]["total_word_count"]
        prob_wordj=(word_jcnt)/(total_word_cnt)
        prob_wordj=np.log(prob_wordj)
        output=(output)+(prob_wordj)
    return output

In [None]:
## This function return the best_class i.e the class which has highest probability to contain a data point.

In [175]:
def predictSinglePoint(dictionary,x,features):
    classes=dictionary.keys()
    first_run=True
    best_p=-10000000
    best_class=-1
    
    for current_class in classes:
        if(current_class=="total_data"):
            continue
        p_current_class=probability(dictionary,x,current_class,features)
        
        if(p_current_class > best_p or first_run):
            best_p=p_current_class
            best_class=current_class
            first_run=False
            
    return best_class

In [None]:
## We are storing the predictions and returing it.

In [176]:
def predict(dictionary,x_test,features):
    y_pred=[]
    for i in range(0,x_test.shape[0],1):
        x_class=predictSinglePoint(dictionary,x_test[i,:],features)
        y_pred.append(x_class)
    return y_pred

In [None]:
## Here we are training the algorithm

In [177]:
def fit(x_train,y_train,features):
    result={} 
    class_values=set(y_train)
    
    for current_class in class_values:
        result[current_class]={}
        result["total_data"]=len(y_train) 
        current_class_rows=(y_train==current_class)
        x_train_current=x_train[current_class_rows]
        y_train_current=y_train[current_class_rows]
        num_features=x_train.shape[1]
        total_word_cnt=0
        result[current_class]["total_count"]=len(y_train_current)
        result[current_class]["total_word_count"]=total_word_cnt
        
        for j in range(2,num_features+2):
            result[current_class][features[j-2]]=x_train_current[:,j-2].sum()
            total_word_cnt+=x_train_current[:,j-2].sum()
            if(x_train_current[:,j-2].sum()==0):
                continue
            total_word_cnt+=1 ## Adding +1 for getting the count of the words for lapalace correction
                                                                                          
            
        result[current_class]["total_word_count"]=total_word_cnt 
                                                                   
    return result

In [None]:
## Here we are getting our dataset ready of the form that is required for the Multinomial Naive Bayes

In [178]:
check={}
index={}
x_train=[[0 for i in range(len(features))] for j in range(len(newsgroup_train_target))]
x_test=[[0 for i in range(len(features))] for j in range(len(newsgroup_test_target))]
ind=0
for word in features:
    check[word]=1
    index[word]=ind
    ind+=1
    
row=0
for document,category in train_documents:
    col=0
    for word in document:
        if word in check:
            x_train[row][index[word]]+=1
    row+=1 

row=0
for document,category in test_documents:
    col=0
    for word in document:
        if word in check:
            x_test[row][index[word]]+=1
    row+=1
x_train=np.array(x_train)
x_test=np.array(x_test)
y_train=newsgroup_train_target
y_test=newsgroup_test_target
y_train=np.array(y_train)
y_test=np.array(y_test)

In [None]:
## Finally! we have our predictions.

In [179]:
dictionary=fit(x_train,y_train,features)
y_pred=predict(dictionary,x_test,features)

In [None]:
## Now for the subsequent code we are predicting the dataset with the Sklearn's Multinomial Naive Bayes and
## then we are comparing with our own implemented Naive Bayes through classification reports

In [180]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [181]:
clf=MultinomialNB()

In [182]:
clf.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [183]:
y_pred_classifier=clf.predict(x_test)

In [184]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.06      0.09      0.07       319
           1       0.02      0.02      0.02       389
           2       0.14      0.21      0.17       394
           3       0.04      0.02      0.02       392
           4       0.01      0.02      0.02       385
           5       0.12      0.15      0.13       395
           6       0.02      0.02      0.02       390
           7       0.09      0.07      0.08       396
           8       0.04      0.08      0.05       398
           9       0.09      0.05      0.07       397
          10       0.03      0.04      0.04       399
          11       0.06      0.01      0.01       396
          12       0.02      0.00      0.00       393
          13       0.12      0.16      0.14       396
          14       0.03      0.03      0.03       394
          15       0.00      0.00      0.00       398
          16       0.03      0.01      0.02       364
          17       0.12    

In [185]:
print(classification_report(y_test,y_pred_classifier))

              precision    recall  f1-score   support

           0       0.05      0.07      0.06       319
           1       0.02      0.02      0.02       389
           2       0.14      0.21      0.17       394
           3       0.05      0.02      0.03       392
           4       0.01      0.02      0.01       385
           5       0.12      0.14      0.13       395
           6       0.03      0.02      0.02       390
           7       0.09      0.06      0.07       396
           8       0.05      0.08      0.06       398
           9       0.07      0.03      0.04       397
          10       0.03      0.04      0.03       399
          11       0.06      0.01      0.02       396
          12       0.02      0.01      0.01       393
          13       0.12      0.13      0.12       396
          14       0.03      0.03      0.03       394
          15       0.00      0.00      0.00       398
          16       0.04      0.02      0.02       364
          17       0.13    