In [1]:
import itertools                                                    # for slicing vocabulary dictionary into required length
import math as ma                                                   # for the logarithms used in the probability computation

import numpy as np                                                  # to perform numerical operations
import pandas as pd                                                 # for dataset manipulation
from nltk.corpus import stopwords                                   # for removing stopwords
from nltk.tokenize import word_tokenize                             # for tokenizing
from sklearn import datasets                                        # to import required dataset
from sklearn import model_selection                                 # to split data into training and testing data
from sklearn.metrics import accuracy_score                          # for the overall accuracy of the predictions
from sklearn.metrics import classification_report, confusion_matrix # for getting info of result precision and accuracy
from sklearn.naive_bayes import MultinomialNB                       # library Naive Bayes used as the comparison baseline

In [2]:
stop_words=set(stopwords.words('english'))                                            # English stopwords, dropped while building the vocabulary
news=datasets.fetch_20newsgroups()                                                    # load data into news
# A fixed random_state makes the split (and therefore every metric printed later)
# identical on each Restart & Run All; without it the split is re-drawn on every run.
x_train,x_test,y_train,y_test=model_selection.train_test_split(
    news.data,news.target,test_size=0.1,random_state=42)                              # hold out 10% of the documents for testing

In [3]:
print(stop_words)                                                   # Inspect the stopwords; stop_words is a set, so the printed order is arbitrary

{'i', 'wasn', 'weren', 'very', 'itself', 's', "hadn't", 'has', 'me', "you'll", 'down', 'both', 'aren', 'hasn', 'them', 'won', 'wouldn', 'into', 'haven', 'did', 'is', 'ours', 'your', 'during', 'or', 'nor', "you're", 'mustn', 're', 'no', 'such', 'against', "haven't", 'been', 'off', 'with', 'ourselves', 'further', 'just', 'as', 'does', 'y', 'this', 'while', "should've", "isn't", 'will', 'other', "wouldn't", "you've", 'mightn', "you'd", 'couldn', 'needn', 'my', "didn't", "wasn't", 'own', 'all', "she's", "shan't", 'from', 'have', 'same', 'which', 'before', 'himself', 'should', 'was', "mightn't", 'you', 'by', 'a', 'few', 'myself', 'we', 'it', 'he', 'because', 'until', 'after', 'hers', "aren't", 'they', 'isn', 'on', 'once', "don't", "that'll", 'here', 'below', 'but', 'there', 'yours', 'in', 'doing', "hasn't", 'any', 'him', 'yourself', 'her', 'd', 't', 'be', 'again', 'that', 'about', 'if', 'do', 'who', 'to', 'those', 'their', 'having', 've', "weren't", 'themselves', "mustn't", 'for', 'didn', "

In [4]:
#import nltk
#nltk.download('punkt')                                             # needed to be done first time
VOCAB_SIZE=6000                                                     # number of most frequent tokens kept as features
d={}                                                                # token -> number of occurrences over all training documents
for data in x_train:                                                # one training document at a time
    words=word_tokenize(data)                                       # tokenize the document
    for ele in words:
        # The stopword set holds lowercase entries, so compare case-insensitively;
        # an exact match would let capitalised stopwords ("The", "I", ...) into the vocabulary.
        if ele.lower() in stop_words:
            continue
        d[ele]=d.get(ele,0)+1
# Keep the VOCAB_SIZE most frequent tokens. sorted() is stable, so tokens with equal
# counts stay in first-seen order, and the dict preserves the descending-count order.
new_dict=dict(sorted(d.items(), key=lambda kv: kv[1], reverse=True)[:VOCAB_SIZE])
features=list(new_dict)                                             # vocabulary tokens, most frequent first; list position = feature index

In [5]:
print(dict(itertools.islice(new_dict.items(),20)))                  # preview the 20 most frequent vocabulary entries instead of dumping the whole dictionary
print(len(features))                                                # features is a plain list (it has no .shape), so report its length

In [None]:
def updatedRow(words,features,i):
    """Build the word-count feature vector for one tokenized document.

    Parameters
    ----------
    words : list of str
        Tokens of the document.
    features : list of str
        Vocabulary; entry k of the result counts the occurrences of features[k].
    i : int
        Row number of the document. Unused; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        Integer vector with one entry per feature (length len(features)).
        Tokens that are not in features are ignored.
    """
    # Map every token to its first position in features (the position
    # features.index would report) so each lookup is a dict access
    # instead of a scan over the whole vocabulary list.
    position_of={}
    for position,token in enumerate(features):
        position_of.setdefault(token,position)
    lst=np.zeros(len(features),dtype=int)                           # sized from the vocabulary rather than a fixed 6000
    for ele in words:
        position=position_of.get(ele)
        if position is not None:
            lst[position]+=1                                        # ends up equal to words.count(ele)
    return lst

def updateXTrain(x_train,features):
    """Turn raw documents into a table of per-document word counts.

    Parameters
    ----------
    x_train : sequence of str
        Raw text documents (used for both the training and the test split).
    features : list of str
        Vocabulary tokens; they become the column labels of the result.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per feature, filled with the
        count vector produced by updatedRow for that document (stored as floats).
    """
    # Fill a plain NumPy array and wrap it once at the end: writing row by row
    # through DataFrame.iloc is far slower. The width follows len(features)
    # rather than a fixed 6000, so smaller or larger vocabularies work too.
    counts=np.zeros((len(x_train),len(features)))
    for i,curr_x in enumerate(x_train):                             # curr_x is the text of document i
        words=word_tokenize(curr_x)                                 # split the document into tokens
        counts[i,:]=updatedRow(words,features,i)                    # count vector for document i
    return pd.DataFrame(counts,columns=features)

# Convert the raw training documents into the word-count table used for fitting.
x_train_updated=updateXTrain(x_train,features)                      # DataFrame: one row per training document, one column per feature
x_train_updated.columns                                             # cell output: the column labels, i.e. the vocabulary tokens

In [None]:
def fit(X_train, Y_train):
    """Collect the per-class counts used by the Naive Bayes predictor.

    Parameters
    ----------
    X_train : array-like, shape (n_samples, n_features)
        Feature matrix (per-document word counts in this notebook).
    Y_train : array-like, shape (n_samples,)
        Class label of every row of X_train.

    Returns
    -------
    dict
        result["total"]     -> number of training rows
        result[c]["count"]  -> number of training rows labelled c
        result[c][j]        -> sum of feature column j over the rows labelled c
        result[c]["total"]  -> sum of all feature columns over the rows labelled c
    """
    # Accept lists / DataFrames as well: boolean-mask indexing below needs arrays.
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)
    result = {}
    for current_class in set(Y_train):                              # one entry per distinct label
        current_class_rows = (Y_train == current_class)             # boolean mask of the rows of this class
        # All column sums in one vectorised call instead of one Python-level sum per column.
        feature_totals = X_train[current_class_rows].sum(axis=0)
        class_stats = {"count": int(current_class_rows.sum())}      # number of rows (documents) in this class
        for j, feature_total in enumerate(feature_totals):
            class_stats[j] = feature_total                          # total of feature j within this class
        class_stats['total'] = feature_totals.sum()                 # total of all features within this class
        result[current_class] = class_stats
    result["total"] = len(Y_train)                                  # total number of training rows
    return result


x_train_updated=np.array(x_train_updated)                           # Change pandas dataframe to numpy array (fit indexes columns as X[:,j])
d=fit(x_train_updated,y_train)                                      # Fit the training data; d now holds the dictionary returned by fit (it previously held the raw token counts)

In [None]:
def probability(dictionary, x, current_class):
    """Return the log-score of row x for current_class.

    The score is log P(class) plus, for every feature that is present in x
    (x[j] != 0), the log of the Laplace-smoothed share of that feature in the
    class. Each present feature contributes once; the size of x[j] is not
    used as a weight.

    Parameters
    ----------
    dictionary : dict
        Structure returned by fit.
    x : sequence of numbers
        Feature values of one row, indexed by feature number.
    current_class : hashable
        Class key of dictionary to score against.

    Returns
    -------
    float
        Log-score; larger means the class fits the row better.
    """
    class_stats = dictionary[current_class]
    # Class prior = rows of this class / all rows. The per-class "total" entry is
    # the summed feature counts, not a row count, and it can be 0 (log would fail),
    # so the prior must be built from "count".
    output = ma.log(class_stats["count"]) - ma.log(dictionary["total"])
    num_features = len(class_stats.keys()) - 2                       # every key except "count" and "total" is a feature index
    count_current_class = class_stats['total'] + num_features        # smoothed denominator, identical for every feature
    for j in range(num_features):
        if x[j] == 0:                                                # feature absent from this row
            continue
        count_current_class_with_value_j = class_stats[j] + 1        # add-one smoothing
        output = output + (ma.log(count_current_class_with_value_j) - ma.log(count_current_class))
    return output

def doSinglePrediction(x,dictionary):
    """Pick the class with the highest score for a single row.

    Every key of dictionary except "total" is treated as a class and scored
    with probability(). On equal scores the class met first wins. If there
    is no class key at all, the placeholder -100 is returned.
    """
    winner = -100                                                      # placeholder until a class has been scored
    top_score = None                                                   # None means "nothing scored yet"
    for label in dictionary:
        if label == "total":                                           # bookkeeping entry, not a class
            continue
        score = probability(dictionary, x, label)
        if top_score is None or score > top_score:                     # first class, or a strictly better one
            top_score, winner = score, label
    return winner

def y_predict(x_test,d,features):
    """Predict a class for every document in x_test.

    x_test   : raw documents to classify
    d        : fitted model dictionary, passed through to doSinglePrediction
    features : vocabulary used to turn the documents into count rows

    Returns a float NumPy array with one predicted label per document.
    """
    predictions = np.zeros(len(x_test))                                 # one slot per test document
    feature_matrix = np.array(updateXTrain(x_test, features))           # count table as a plain NumPy array
    for row_number in range(len(x_test)):
        predictions[row_number] = doSinglePrediction(feature_matrix[row_number, :], d)
    return predictions


In [None]:
y_pred=y_predict(x_test,d,features)                                     # Predict a class for every test document with the hand-written model (d is the dictionary returned by fit)
print(classification_report(y_test,y_pred))                             # Print classification report for the hand-written model
print(confusion_matrix(y_test,y_pred))                                  # Print confusion matrix of the true labels against the predictions


In [None]:
print(accuracy_score(y_test,y_pred))                                    # Overall accuracy of the predictions currently stored in y_pred

In [None]:
print('\n\n\t\tCOMPARING WITH LIBRARY IMPLEMENTATION')      # Compare built model with the sklearn's inbuilt model

clf = MultinomialNB()                                       # Create object of MultinomialNB
clf.fit(x_train_updated, y_train)                           # Fit the training data
x_test_updated=updateXTrain(x_test,features)                # Update x_test so that model can operate on it
x_test_updated=np.array(x_test_updated)                     # Convert to np array
y_pred = clf.predict(x_test_updated)                        # Do predictions (y_pred now holds the library model's predictions)
print(classification_report(y_test,y_pred))                 # Print classification report
print(confusion_matrix(y_test,y_pred))                      # Print confusion matrix
print(accuracy_score(y_test,y_pred))                        # Print overall accuracy of the library model

Comparing the results above, we can see that our implementation performs better than the library implementation of Naive Bayes (scikit-learn's `MultinomialNB`).