In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ROHAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Converting the text data into a feature matrix

In [2]:
#Loading Text data into X and targets into Y.

import os

def load_text_corpus(rootdir,encoding="latin-1"):
    """Read every file under ``rootdir`` and label it with its folder.

    Parameters
    ----------
    rootdir : str
        Directory whose sub-folders each hold the articles of one class.
    encoding : str, optional
        Text encoding used to decode the files. ``latin-1`` maps every
        possible byte to a character, so decoding never fails and does not
        depend on the platform's default encoding.

    Returns
    -------
    (list of str, list of str)
        The file contents and, in the same order, the folder of each file
        expressed relative to ``rootdir`` (e.g. ``"alt.atheism"``).
    """
    texts=[]
    labels=[]
    for subdir, dirs, files in os.walk(rootdir):
        dirs.sort()                          #Sorting in place makes os.walk visit folders in a fixed order.
        for file_name in sorted(files):
            path=os.path.join(subdir, file_name)
            #The context manager closes the handle even if read() fails.
            with open(path,"r",encoding=encoding) as handle:
                texts.append(handle.read())
            #relpath works with any path separator, unlike stripping a hard-coded prefix.
            labels.append(os.path.relpath(subdir, rootdir))
    return texts, labels

rootdir = 'C:/20_newsgroups'
X,Y=load_text_corpus(rootdir)

In [3]:
stop_words={*stopwords.words("english")}       #Set of all the English stop words provided by the nltk library.
def removestopwords(x, stopword_set=None):
    """Return ``x`` with every stop word removed.

    The text is split on single spaces, words whose lower-cased form is a
    stop word are dropped, and the remaining pieces are joined back with
    single spaces. Empty pieces produced by consecutive spaces are kept.

    Parameters
    ----------
    x : str
        Text to filter.
    stopword_set : collection of str, optional
        Lower-case words to drop. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        The filtered text.
    """
    if stopword_set is None:
        stopword_set=stop_words
    filtered_words=[w for w in x.split(" ") if w.lower() not in stopword_set]
    return " ".join(filtered_words)

In [4]:
#To replace digits, punctuation, newlines and tabs with spaces.
_SPECIAL_CHARS="0123456789~`!@#$%^&*()-_+={[]};:'<,>./?\"|\n\t"
#Translation table built once: every listed character maps to a single space.
_SPECIAL_CHAR_TABLE=str.maketrans({char:" " for char in _SPECIAL_CHARS})

def removespecialcharacter(x):
    """Return ``x`` with each digit/punctuation/newline/tab replaced by a space.

    Every listed character becomes exactly one space, so the length of the
    text is unchanged. Characters that are not listed (letters, spaces, the
    backslash, ...) are left as they are.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    #A single translate() pass replaces one full-string replace() per character.
    return x.translate(_SPECIAL_CHAR_TABLE)

In [5]:
#First replace digits/punctuation in every article, as those are not useful to us ...
X1=[removespecialcharacter(article) for article in X]
#... then drop all the stop words from the cleaned articles.
X2=[removestopwords(cleaned) for cleaned in X1]

In [6]:
#Split each article on single spaces, giving one list of words per article.
X3=[article.split(" ") for article in X2]

In [7]:
#To remove the empty strings (left behind by consecutive spaces) from each word list.
for i in range(len(X3)):
    #One filtering pass per article; repeated `"" in list` + remove() rescans the list for every empty string.
    #Slice assignment keeps the same list object, as the original in-place removal did.
    X3[i][:]=[word for word in X3[i] if word!=""]

In [8]:
#To get the list of features: the distinct words of X3 that occur often enough in the whole corpus.
from collections import Counter

MIN_WORD_COUNT=200      #Words seen fewer than this many times across all articles are dropped.

#dtype=object keeps one Python list per article. Without it, recent NumPy versions
#raise ValueError when asked to build an array from lists of different lengths.
X3=np.array(X3,dtype=object)

#Flatten every article into one list of words and count the corpus-wide occurrences.
X4=[item for article in X3 for item in article]
coun=Counter(X4)

#Keep only the frequent words. Sorting gives a reproducible feature (column) order,
#whereas the iteration order of a set of strings can change between interpreter runs.
features=sorted(word for word,occurrences in coun.items() if occurrences>=MIN_WORD_COUNT)

In [9]:
#To make the matrix of word counts (one row per article, one column per feature):-
row=len(Y)
col=len(features)
X=np.zeros((row,col),int)              #Start from an all-zero matrix of the required size.

feature_index={word:k for k,word in enumerate(features)}   #word -> column number

for i in range(row):
    coun_words=Counter(X3[i])
    #Only visit the words that actually occur in this article instead of probing
    #every feature; cells of absent features simply stay at zero.
    for word,occurrences in coun_words.items():
        k=feature_index.get(word)
        if k is not None:
            X[i,k]=occurrences       #Count of the feature in its respective row.

In [10]:
#The data is now in the right format. X_data and Y_data are new names for the
#same objects as X and Y (no copy is made).
X_data,Y_data=X,Y

In [11]:
#To turn continuous data into labelled (binned) data:-

def makeLabelled(column):
    """Bin the values of ``column`` into the labels 0-5, in place.

    The bin edges are multiples of the column mean ``m``: 0.25*m, 0.5*m, m,
    1.5*m and 2.5*m. A value gets the label of the first edge it is strictly
    below (0 for the first edge ... 4 for the fifth) and 5 if it is below
    none of them.

    Parameters
    ----------
    column : numpy.ndarray
        1-D array (or array view) of numbers. It is overwritten with the labels.

    Returns
    -------
    numpy.ndarray
        The same ``column`` object, now holding the labels.
    """
    third_limit=column.mean()
    second_limit=0.5*third_limit
    first_limit=0.5*second_limit
    fourth_limit=1.5*third_limit
    fifth_limit=2.5*third_limit

    values=np.asarray(column)
    #All comparisons are evaluated on the original values before anything is written back.
    below_limit=[values<first_limit,
                 values<second_limit,
                 values<third_limit,
                 values<fourth_limit,
                 values<fifth_limit]
    #np.select takes the first matching condition, exactly like an if/elif chain,
    #but labels the whole column at once instead of one element at a time.
    labels=np.select(below_limit,[0,1,2,3,4],default=5)
    column[:]=labels
    return column

In [12]:
#Replace every feature column of X_data by its labelled version, one column at a time.
n_columns=X_data.shape[-1]
for column_index in range(n_columns):
    X_data[:,column_index]=makeLabelled(X_data[:,column_index])

In [13]:
#Split the features and labels into train and test parts; random_state=1 is passed so the split is seeded.
X_train,X_test,Y_train,Y_test = train_test_split(X_data,Y_data,random_state=1)

In [14]:
#APPLYING THE MULTINOMIAL CLASSIFIER IN SKLEARN:-
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB().fit(X_train,Y_train)     #fit() returns the fitted estimator itself.
Y_pred=clf.predict(X_test)

# Naive Bayes From Scratch

In [18]:
#To make the dictionary of counts that the from-scratch classifier is built on.

def make_dictionary(X_train,Y_train):
    """Count, per class, how often each feature takes each of its values.

    Parameters
    ----------
    X_train : array-like, shape (n_samples, n_features)
        Discrete feature values.
    Y_train : array-like, shape (n_samples,)
        Class label of every row. Plain lists are accepted.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training rows. For every
        class ``c``, ``result[c]["total_count"]`` is the number of rows of
        that class and ``result[c][j][v]`` (features numbered from 1) is the
        number of rows of class ``c`` whose feature ``j`` equals ``v``. Every
        value a feature takes anywhere in ``X_train`` gets an entry, with a
        count of 0 where the class never shows it.
    """
    #Converting to arrays makes `Y_train==current_class` an element-wise mask.
    #With a plain list the comparison is a single False and every count is wrong.
    X_train=np.asarray(X_train)
    Y_train=np.asarray(Y_train)
    num_features=X_train.shape[1]

    result={"total_data":len(Y_train)}     #Total number of rows, stored in the first layer of the dictionary.

    #The possible values of a feature do not depend on the class, so compute them once.
    #tolist() turns the NumPy scalars into plain Python values used as dictionary keys.
    feature_values=[np.unique(X_train[:,j]).tolist() for j in range(num_features)]

    for current_class in np.unique(Y_train).tolist():
        current_class_rows=(Y_train==current_class)     #Mask of the rows belonging to the current class.
        X_train_current=X_train[current_class_rows]
        class_stats={"total_count":int(current_class_rows.sum())}   #Number of data points in the class.
        for j in range(1,num_features+1):
            column=X_train_current[:,j-1]
            #Count of each possible value of feature j inside the current class.
            class_stats[j]={value:int((column==value).sum()) for value in feature_values[j-1]}
        result[current_class]=class_stats
    return result

In [19]:
#To calculate the log-probability of a data point belonging to a class:-

def probabilty(dictionary,x,current_class):
    """Return the log-probability score of ``x`` for ``current_class``.

    The score is log(class prior) plus, for every feature, the log of the
    add-one smoothed fraction of the class's rows that share the feature value.

    Parameters
    ----------
    dictionary : dict
        Count tables laid out as ``dictionary["total_data"]``,
        ``dictionary[c]["total_count"]`` and ``dictionary[c][j][value]``
        with features numbered from 1.
    x : sequence
        Feature values of one data point; ``x[j-1]`` belongs to feature ``j``.
    current_class : hashable
        Class whose score is wanted.

    Returns
    -------
    float
        The (unnormalised) log-probability.
    """
    class_stats=dictionary[current_class]
    class_total=class_stats["total_count"]
    output=np.log(class_total)-np.log(dictionary["total_data"])
    num_features=len(class_stats.keys())-1       #Every key except "total_count" is a feature number.
    for j in range(1,num_features+1):
        value_counts=class_stats[j]
        #.get(..., 0): a value never seen in training counts as zero occurrences
        #instead of raising KeyError; the +1 smoothing keeps the log finite.
        count_current_class_with_value_xj=value_counts.get(x[j-1],0)+1
        count_current_class=class_total+len(value_counts.keys())
        current_xj_probability=np.log(count_current_class_with_value_xj)-np.log(count_current_class)
        output=output+current_xj_probability #Log probabilities are summed because the raw probabilities are so small.
    return output

In [20]:
#To calculate the output for a single test data point:-

def predictSinglePoint(dictionary,x):
    """Return the class with the highest log-probability score for ``x``.

    Parameters
    ----------
    dictionary : dict
        Count tables; every key other than ``"total_data"`` is a class.
    x : sequence
        Feature values of one data point.

    Returns
    -------
    The best-scoring class (the first one found in case of a tie), or ``-1``
    when the dictionary holds no class.
    """
    best_p=-100
    best_class=-1
    first_run=True
    for current_class in dictionary.keys():
        if (current_class=="total_data"):
            continue
        p_current_class=probabilty(dictionary,x,current_class)
        if(first_run or p_current_class>best_p):
            best_p=p_current_class
            best_class=current_class
        #The flag must be cleared after the first class is scored. Left at True,
        #every class overwrites the best one and the last class always wins.
        first_run=False
    return best_class

In [21]:
#To calculate the output for every test data point:-

def predict(dictionary,X_test):
    """Return the list of predicted classes, one per row of ``X_test``.

    Each row is classified independently with ``predictSinglePoint`` and the
    results are returned in the order of the rows.
    """
    return [predictSinglePoint(dictionary,sample) for sample in X_test]

In [22]:
#Build the per-class count tables from the training split.
dictionary=make_dictionary(X_train,Y_train)

In [None]:
#Predict a class for every row of the test split with the from-scratch classifier.
Y_pred_from_scratch_NB=predict(dictionary,X_test)

# Comparing the Results

In [15]:
#Classification report for the predictions of sklearn's MultinomialNB (Y_pred) on the test split.

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(Y_test,Y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.81      0.82      0.82       261
           comp.graphics       0.81      0.85      0.83       248
 comp.os.ms-windows.misc       0.92      0.96      0.94       253
comp.sys.ibm.pc.hardware       0.91      0.93      0.92       260
   comp.sys.mac.hardware       0.94      0.95      0.94       266
          comp.windows.x       0.95      0.88      0.92       265
            misc.forsale       0.89      0.94      0.92       252
               rec.autos       0.92      0.91      0.92       223
         rec.motorcycles       0.98      0.95      0.97       293
      rec.sport.baseball       0.99      0.98      0.99       245
        rec.sport.hockey       0.99      0.98      0.99       247
               sci.crypt       0.96      0.96      0.96       248
         sci.electronics       0.92      0.97      0.94       239
                 sci.med       0.97      0.94      0.96       236
         

In [16]:
#Confusion matrix of the true test labels (Y_test) against the sklearn predictions (Y_pred).
print(confusion_matrix(Y_test,Y_pred))

[[215   2   0   0   0   1   0   0   0   1   0   1   2   0   1   1   0   0
    2  35]
 [  0 211   6   6   7   7   3   0   0   0   0   3   1   2   2   0   0   0
    0   0]
 [  0   3 243   3   0   3   1   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   4   3 242   5   0   4   0   0   0   0   0   2   0   0   0   0   0
    0   0]
 [  0   2   2   7 253   0   1   0   0   0   0   0   0   1   0   0   0   0
    0   0]
 [  0  16   9   3   0 233   2   1   0   0   0   0   0   0   0   0   1   0
    0   0]
 [  0   0   0   2   2   0 237   4   1   0   0   0   5   1   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   9 204   2   0   0   0   4   0   0   0   3   0
    1   0]
 [  0   0   0   0   0   0   3  10 279   0   0   0   0   0   0   0   0   0
    1   0]
 [  0   0   0   0   0   0   1   0   1 241   2   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   0   0   1   1 243   0   0   0   0   0   0   0
    2   0]
 [  0   6   0   0   0   0   1   0   0   0   0 237   2   0   1   0

In [17]:
#Accuracy score of the sklearn MultinomialNB predictions on the test split.
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,Y_pred)

0.9126

In [None]:
#Classification report for the predictions of the from-scratch Naive Bayes (Y_pred_from_scratch_NB).

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(Y_test,Y_pred_from_scratch_NB))

In [None]:
#Confusion matrix of the true test labels (Y_test) against the from-scratch predictions.
print(confusion_matrix(Y_test,Y_pred_from_scratch_NB))

In [None]:
#Accuracy score of the from-scratch Naive Bayes predictions on the test split.
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test,Y_pred_from_scratch_NB))