In [12]:
from collections import OrderedDict

import numpy as np
from nltk.corpus import stopwords
from sklearn.datasets import load_files
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

Load the newsgroup files into `data_dictionary`, with the documents in shuffled order.

In [2]:
# Read every file under the "20_newsgroups" folder; the documents come back as
# raw bytes in data_dictionary['data'] (they are decoded later, where used).
# random_state=0 is load_files' documented default and is spelled out here only
# so that the shuffled order (and therefore the train/test split) is visibly fixed.
data_dictionary=load_files(r"20_newsgroups",shuffle=True,random_state=0)

`data_dictionary.keys()` returns `dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])`.


Every subfolder in the given dataset corresponds to one class, so `target_names` stores the list of all classes.
`data[i]` is a document and `target[i]` stores the index of the subfolder/class that `data[i]` belongs to (look the index up in the `target_names` list to get the class name).

In [3]:
# Example lookups on the loaded data (kept commented out):
#data_dictionary['target_names'][data_dictionary['target'][0]]
#data_dictionary['data'][data_dictionary['target'][0]]
#len(data_dictionary['data']) == 19997

In [9]:
def build_vocab(data_dictionary, count_of_train):
    """Count how often each non-stop-word token occurs in the training documents.

    Parameters
    ----------
    data_dictionary : mapping
        Must provide data_dictionary['data'], an indexable collection of
        documents stored as bytes.
    count_of_train : int
        Number of leading documents (indices 0 .. count_of_train-1) to scan.

    Returns
    -------
    dict
        Token -> number of occurrences, in order of first appearance.
    """
    ignored = set(stopwords.words('english'))
    frequencies = {}
    for doc_index in range(count_of_train):
        # Documents are bytes; undecodable sequences become U+FFFD instead of raising.
        text = data_dictionary['data'][doc_index].decode('utf-8', 'replace')
        # split() with no argument tokenises on any run of whitespace.
        for token in text.split():
            # The stop-word test is an exact, case-sensitive string match.
            if token not in ignored:
                frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies


In [10]:
def find_features(vocab,top_limit):
    """Return the most frequent vocabulary words, most frequent first.

    Parameters
    ----------
    vocab : dict
        Word -> occurrence count.
    top_limit : int
        Maximum number of words to return.

    Returns
    -------
    list
        Up to top_limit words ordered by descending count; words with equal
        counts keep the order in which they appear in vocab (stable sort).
    """
    ranked=sorted(vocab.items(),key=lambda item: item[1],reverse=True)
    selected=[]
    for position,(word,_occurrences) in enumerate(ranked):
        # Stop as soon as top_limit words have been collected.
        if position==top_limit:
            break
        selected.append(word)
    return selected

In [14]:
def convert_to_2D_general_input_format(data_dictionary,feature_names,start,end):
    """Turn a run of documents into a word-count matrix and a label vector.

    Parameters
    ----------
    data_dictionary : mapping
        Must provide 'data' (documents as bytes) and 'target' (class number
        per document), both indexable by document position.
    feature_names : list
        Words that become the columns of X, in this order.
    start : int
        Index of the first document to convert.
    end : int
        NOTE: the loop runs over range(0, end-start-1), so the documents
        converted are start .. end-2 (end-start-1 of them). To include the
        document at index k as the last one, pass end = k + 2. This is kept
        as-is because existing callers already pass `end` that way.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        X with one row per document and one column per feature word holding
        that word's occurrence count, and Y with the class numbers.
    """
    # Build word -> column once so each token costs one dict lookup instead of
    # two scans of the feature list. setdefault keeps the first position of a
    # repeated word, which is what list.index() would have returned.
    column_of={}
    for column,feature in enumerate(feature_names):
        column_of.setdefault(feature,column)
    num_features=len(feature_names)
    X=[]
    Y=[]
    for docno in range(0,end-start-1):
        current_data=data_dictionary['data'][start+docno]
        # Documents are bytes; undecodable sequences become U+FFFD instead of raising.
        current_data=current_data.decode('utf-8','replace')
        row=[0]*num_features
        # Count every occurrence of each feature word in this document.
        for word in current_data.split():
            column=column_of.get(word)
            if column is not None:
                row[column]+=1
        X.append(row)
        Y.append(data_dictionary['target'][start+docno]) # class number of this document
    return np.array(X),np.array(Y)
    

In [15]:
# Split of the 19997 shuffled documents: the first 14997 (indices 0..14996) are
# the training set, the remaining 5000 (indices 14997..19996) the test set.
# The vocabulary is built from the training documents only.
vocab=build_vocab(data_dictionary,14997)

# Keep the 2000 most frequent vocabulary words as the feature columns.
feature_names=find_features(vocab,2000)

# convert_to_2D_general_input_format iterates range(0,end-start-1), so `end`
# is passed as (last wanted index + 2): 14998 for the training rows and 19998
# for the test rows.
X_train,Y_train=convert_to_2D_general_input_format(data_dictionary,feature_names,0,14998)
X_test,Y_test=convert_to_2D_general_input_format(data_dictionary,feature_names,14997,19998)

# Sanity check: one shape per line, test first and then train.
print(X_test.shape,Y_test.shape,X_train.shape,Y_train.shape,sep="\n")

(5000, 2000)
(5000,)
(14997, 2000)
(14997,)


DATA PREPROCESSING DONE

In [16]:
def fit(X_train,Y_train):
    """Build the per-class word-count table used by the classifier.

    Parameters
    ----------
    X_train : numpy.ndarray
        2-D count matrix, one row per document and one column per feature word.
    Y_train : numpy.ndarray
        1-D array with the class label of every row of X_train.

    Returns
    -------
    dict
        count['total_data'] is len(Y_train). For every class c found in
        Y_train, count[c] is a dict with count[c]['total_words'] (sum of all
        feature counts over the rows of class c) and count[c][j] (sum of
        column j over the rows of class c) for each feature index j.
        'total_data' is always present, also when Y_train is empty.
    """
    count={}
    # Set once, outside the class loop, so the key exists even with no classes.
    count['total_data']=len(Y_train)
    for current_class in set(Y_train):
        # Rows of the training matrix that belong to the current class.
        X_train_current=X_train[Y_train==current_class]
        # One reduction gives the per-feature totals for this class.
        column_sums=X_train_current.sum(axis=0)
        class_counts={'total_words':0}
        for wordi,word_total in enumerate(column_sums):
            class_counts[wordi]=word_total
            class_counts['total_words']+=word_total
        count[current_class]=class_counts
    return count
            

In [17]:
def probability(count,x,current_class):
    """Return the log-score of current_class for one document.

    Parameters
    ----------
    count : dict
        Table with count['total_data'] and, for current_class, a dict holding
        'total_words' plus one entry per feature index.
    x : sequence of numbers
        Feature counts of the document; x[j] is the count of feature j.
    current_class : hashable
        Key of the class in count.

    Returns
    -------
    float
        log(total_words) - log(total_data) for the class, plus, for every
        feature j, x[j] * log((count[class][j] + 1) / (total_words + num_features)).

    NOTE(review): the leading term divides the class's total word count by the
    number of training documents. A Naive Bayes class prior is normally the
    class's document count over the total document count; the table read here
    does not contain per-class document counts, so this is left unchanged --
    confirm whether it is intended.
    """
    class_counts=count[current_class]
    prob=np.log(class_counts["total_words"]) - np.log(count["total_data"])
    # Every key of the class dict except 'total_words' is a feature index.
    num_features=len(class_counts.keys())-1
    # The Laplace-smoothed denominator is the same for every feature: compute its log once.
    log_total_words_in_current_class=np.log(class_counts['total_words']+num_features)
    # A feature with x[j] == 0 contributes exactly zero, so only the non-zero
    # entries are visited (in ascending index order, as before).
    for wordi in np.flatnonzero(np.asarray(x)[:num_features]):
        current_prob_wordi=np.log(class_counts[wordi]+1)-log_total_words_in_current_class # +1: Laplace correction
        prob=prob+(current_prob_wordi)*x[wordi] # one term per occurrence of the word
    return prob

In [18]:
def predictSinglePoint(count,x):
    """Return the class with the highest score for one document.

    Parameters
    ----------
    count : dict
        Count table; every key other than 'total_data' is treated as a class.
    x : sequence of numbers
        Feature counts of the document, passed through to probability().

    Returns
    -------
    The class key whose probability() value is largest. When several classes
    share the largest value the one that comes first in count wins. Returns -1
    when count contains no class at all.
    """
    # 'total_data' is bookkeeping, not a class.
    candidates=[label for label in count.keys() if not (label=='total_data')]
    if not candidates:
        return -1
    scores=[probability(count,x,label) for label in candidates]
    # max() keeps the earliest position among equal scores.
    winner=max(range(len(candidates)),key=scores.__getitem__)
    return candidates[winner]

In [19]:
def predict(count,X_test):
    """Classify every row of X_test.

    Parameters
    ----------
    count : dict
        Count table handed unchanged to predictSinglePoint().
    X_test : iterable
        One feature-count row per document.

    Returns
    -------
    list
        The predicted class of each row, in input order.
    """
    return [predictSinglePoint(count,row) for row in X_test]

FITTING THE TRAINING DATA

In [20]:
# Build the per-class word-count table from the training matrix and labels.
count=fit(X_train,Y_train)

PREDICTING USING OWN NAIVE_BAYES PREDICT FUNCTION

In [22]:
# Classify every test row with the hand-written predict().
# Estimated time to predict: about 15 minutes.
Y_pred_own=predict(count,X_test)

REPORT OF SELF-MADE CLASSIFIER

In [24]:
# Per-class precision/recall/F1 and the confusion matrix for the hand-written
# classifier. classification_report and confusion_matrix are imported in the
# notebook's top import cell.
print(classification_report(Y_test,Y_pred_own))
print(confusion_matrix(Y_test,Y_pred_own))

             precision    recall  f1-score   support

          0       0.69      0.72      0.71       279
          1       0.74      0.71      0.72       263
          2       0.81      0.78      0.79       236
          3       0.87      0.82      0.84       277
          4       0.84      0.88      0.86       260
          5       0.88      0.74      0.80       242
          6       0.65      0.87      0.75       247
          7       0.75      0.91      0.82       242
          8       0.80      0.91      0.85       224
          9       0.79      0.93      0.86       247
         10       0.94      0.70      0.80       226
         11       0.94      0.82      0.88       247
         12       0.70      0.83      0.76       240
         13       0.87      0.80      0.83       239
         14       0.81      0.88      0.84       267
         15       0.95      0.98      0.97       234
         16       0.66      0.81      0.73       245
         17       0.94      0.71      0.81   

PREDICTING USING INBUILT MULTINOMIAL_NB

In [25]:
# Baseline for comparison: scikit-learn's MultinomialNB fitted on the same
# training matrix and evaluated on the same test matrix. MultinomialNB is
# imported in the notebook's top import cell.
clf=MultinomialNB()
clf.fit(X_train,Y_train)
Y_pred=clf.predict(X_test)

REPORT OF INBUILT CLASSIFIER

In [26]:
# Same two reports as for the hand-written classifier, now for the
# MultinomialNB predictions, so the numbers can be compared side by side.
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

             precision    recall  f1-score   support

          0       0.70      0.72      0.71       279
          1       0.75      0.71      0.73       263
          2       0.81      0.78      0.80       236
          3       0.87      0.82      0.84       277
          4       0.84      0.88      0.86       260
          5       0.88      0.71      0.79       242
          6       0.63      0.87      0.73       247
          7       0.75      0.90      0.82       242
          8       0.79      0.92      0.85       224
          9       0.79      0.93      0.85       247
         10       0.94      0.70      0.81       226
         11       0.95      0.81      0.88       247
         12       0.68      0.83      0.75       240
         13       0.87      0.80      0.83       239
         14       0.82      0.88      0.85       267
         15       0.95      0.98      0.97       234
         16       0.66      0.80      0.72       245
         17       0.94      0.70      0.80   