In [30]:
#path for the data to be loaded
data = "C:/Users/Hp India/Desktop/20_newsgroups"

In [31]:
from sklearn.datasets import load_files

#load the data using load_files function
content = load_files(data, encoding="utf-8", decode_error="replace")

In [7]:
import numpy as np
from sklearn import model_selection

In [8]:
#split of content data and target fields
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(content.data, content.target)

In [9]:
len(X_test), len(X_train)

(5000, 14997)

In [10]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import re

In [11]:
#function for getting feature set from the loaded documents

def Features(X_train):
    #dictionary to obtain feature_set
    feature_set ={}
    stop_words = set(stopwords.words('english')) 

    for doc in X_train:
        #remove all stopwords, numbers, special symbols from each documents
        removedNumber = re.sub(r'[0-9]+', ' ', doc)
        cleanString = re.sub(r"[^a-zA-Z0-9]+", ' ', removedNumber)
        clean = re.sub(r'\b\w{1,3}\b', ' ', cleanString)
        
        #tokenized each document
        word_tokens = word_tokenize(clean) 
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        for word in filtered_sentence:
            
            #check for each world check if is a part of dictionary or not
            #initialize each world in dictionary with one if not already present
            #increase the frequency of world by one if already present
            if word not in feature_set:
                feature_set[word] = 1
            else:
                feature_set[word] += 1
    
    #order the unorderd dictionary in reverse order
    #to get most frequent words
    data = []
    features = []
    for keys, values in feature_set.items():
        data.append((values,keys))
    data.sort(reverse=True)
    print(data[0])
    
    #select top 500 words as features
    for i in range(0,500):
        features.append(data[i][1])
        
    return features
        

In [12]:
#DataSet function for converting X_test/X_train in m*n array form 

def DataSet(X, features):
    stop_words = set(stopwords.words('english'))
    #define a numpy array as data_x
    data_x = np.zeros((len(X),len(features)))
    i = 0
    for doc in X:
        #remove all stopwords, numbers, special symbols from each documents
        
        removedNumber = re.sub(r'[0-9]+', ' ', doc)
        cleanString = re.sub(r"[^a-zA-Z0-9]+", ' ', removedNumber)
        clean = re.sub(r'\b\w{1,3}\b', ' ', cleanString)
        word_tokens = word_tokenize(clean) 
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        for word in filtered_sentence:
            
            #go through each word in filtered_sentence and convert data
            #in numpy array according to frequency of word in document
            for sets, j in zip(features, range(len(features))):
                if word == sets:
                    data_x[i][j] += 1
                    break
        i += 1
        
    return data_x

In [13]:
#fit function for obtaining the sum of frequencies of all the words
#in each class of Y_train
def fit(X1, Y_train, feature):
    #dictionary for all values of each_class
    count = {}
    class_value = set(Y_train)
    for each_class in class_value:
        #for each class create dictionary for keeping the sum
        #of each words in a feature set
        count[each_class] = {}
        count["total_count"] = len(Y_train)
        current_class_rows = (Y_train == each_class)
        X_train_current = X1[current_class_rows]
        Y_train_current = Y_train[current_class_rows]
        count[each_class]["total_count1"] = len(Y_train_current)
        count[each_class]["total_data"] = X_train_current[:,:].sum()
        
        for i in range(1,len(feature)+1):
            #obtain sum of all the frequencies for each feature
            #in a feature_set
            count[each_class][i] = X_train_current[:,i-1].sum()
            
    return count

In [14]:
#probability function to return probability value for each class
def probability(dictionary, x, current_class):
    #inital output value for each class
    output = np.log(dictionary[current_class]["total_count1"]) - np.log(dictionary["total_count"])
    number = len(dictionary[current_class].keys())-2
    #for loop over all features  
    for j in range(1,number+1):
        #xj for getting if the word is part of the feature_set
        xj = x[j-1]
        #if it is the part of the feature set obtain probability value for it
        #by formula (number of that word in each class)/(sum of all the words in that class) 
        #use log probability
        #add output values to obtain final probability value for each class
        if xj != 0:
            count_current_class_with_value_xj = dictionary[current_class][j] + 1
            count_current_class = dictionary[current_class]["total_data"] + len(dictionary[current_class].keys()) - 2
            current_xj_probablity = np.log(count_current_class_with_value_xj) - np.log(count_current_class)
            output = output + current_xj_probablity
    return output

In [15]:
#predictSinglePoint function that return the best class 
def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if (current_class == "total_count"):
            continue
        #for each class call probability function to obtain probability values
        p_current_class = probability(dictionary, x, current_class)
        #compare probability value obtain with best_p and assign the new value
        #to best_p
        if (first_run or p_current_class > best_p):
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class

In [16]:
#definr predict function that return the predicited values for X_test
def predict(dictionary, X_test):
    y_pred = []
    for x in X_test:
        #for each X in X_test call predictSinglePoint
        x_class = predictSinglePoint(dictionary, x)
        y_pred.append(x_class)
    return y_pred

In [17]:
#feature_set obtain from X_train 
feature_Set = Features(X_train)

(22912, 'news')


In [18]:
#converting X_train in m*n numpy array
X1 = DataSet(X_train, feature_Set)

In [19]:
#converting X_test in m*n numpy array
X2 = DataSet(X_test, feature_Set)

In [20]:
#call fit function
dictionary = fit(X1,Y_train, feature_Set)

In [21]:
#call for predict function
Y_pred1 = predict(dictionary,X2)

##classification report using inbulid sklearn MultinomialNB()

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
clf = MultinomialNB()
clf.fit(X1, Y_train)
Y_pred = clf.predict(X2)
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.80      0.66      0.72       274
           1       0.88      0.74      0.80       235
           2       0.72      0.90      0.80       241
           3       0.69      0.59      0.64       248
           4       0.64      0.76      0.70       245
           5       0.89      0.68      0.77       261
           6       0.79      0.95      0.86       238
           7       0.80      0.90      0.85       237
           8       0.81      0.95      0.87       259
           9       0.95      0.92      0.93       247
          10       0.94      0.96      0.95       236
          11       0.97      0.93      0.95       245
          12       0.79      0.88      0.83       256
          13       0.84      0.56      0.67       247
          14       0.89      0.86      0.88       259
          15       0.96      1.00      0.98       240
          16       0.78      0.91      0.84       258
          17       0.94    

##classification_report for predicted values using naive bayes code

In [23]:
print(classification_report(Y_test,Y_pred1))
print(confusion_matrix(Y_test,Y_pred1))

              precision    recall  f1-score   support

           0       0.71      0.76      0.73       274
           1       0.93      0.73      0.82       235
           2       0.72      0.88      0.79       241
           3       0.70      0.60      0.65       248
           4       0.65      0.80      0.72       245
           5       0.84      0.67      0.74       261
           6       0.77      0.92      0.84       238
           7       0.86      0.86      0.86       237
           8       0.80      0.97      0.88       259
           9       0.96      0.96      0.96       247
          10       0.97      0.95      0.96       236
          11       0.98      0.92      0.95       245
          12       0.73      0.93      0.82       256
          13       0.78      0.60      0.68       247
          14       0.93      0.83      0.88       259
          15       1.00      1.00      1.00       240
          16       0.72      0.93      0.81       258
          17       0.93    