Implementing process:
1. Make vocab and choose top k words as the features
   :-  there is a makeDictionary function for this
   :- while making vocab stop words are removed
   :- lammatozer is used to have meaningfull words
   :- now dictionary is sorted in reverse order
   :- vocab is the list of top k words from the dictionary
   :- I have used k = 12000
2. Fit the data
   :- there is a fit function for this
   :- fit function takes two parameters path( where all 20 news groups are present ) and vocab
   :- fit function returns train_data in the form of nested dictionary as
   
                                                                            |---[word_1]---count
                                                                            |---[word_2]---count
                                                                            |
                                                                            |
                                                                            |---[word_n]---count
                                                                            |
                                                        |---[folder1_name]--|---[documents]---count
                                                        |                   |---[total_words]---count
                                                        |
                                          train_data----|---[folder2_name]--|--
                                                        |---[folder3_name]--|--
                                                        |
                                                        |
                                                        |---[folder20_name]--|--
                                                        |
                                                        |---[total_documents]---count
3. Predict the data
   :- there is a predict function for this
   :- predict function takes two parameters train_data and test_data
   :- testData function returns the test_data..test_data have the words for a perticular file in a folder
   :- predict function predicts the best class
   :- probability function returns the probability of a given file for a perticular class

In [99]:
import nltk
from nltk.corpus import stopwords
import os
import string
import re
from nltk.stem import WordNetLemmatizer
import operator
import pandas as pd
import numpy as np
import math
from sklearn.naive_bayes import MultinomialNB
from time import time

In [6]:
nltk.download('wordnet')
nltk.download("stopwords")
stop_words=set(stopwords.words("english"))
stop_words|={"", "," , ".", "-"," ","also", "even","to","it","hi","ha", "know","use","however", "xref", "cantaloupesrvcscmuedu"
            ,"wa","gmt","jan","feb","mar","apr","may","june","july","aug","sep","oct","nov","dec","mon","tue","wed","thu","fri",
            "sat","sun","nntp","doe"}

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Abhishek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abhishek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
table = str.maketrans('', '', string.punctuation)
wnl=WordNetLemmatizer()

In [8]:
def makeDictionary(path):
    word_count = {}
    for folder in os.listdir(path):
        path_for_files_in_folder=path+"\\"+folder
        for file in os.listdir(path_for_files_in_folder):
            absolute_file_path = path_for_files_in_folder + "\\" + file
            doc_file = open( absolute_file_path , "r" , errors="ignore" )
            doc_text = re.split(' |,|-|\n' , doc_file.read().strip() )
            for word in doc_text:
                if word.isalpha():
                    my_word = wnl.lemmatize(word.strip(" ").translate(table).lower())
                    word_length = len(my_word)
                    if my_word in stop_words or word_length < 2 or word_length > 15:
                        continue
                    elif my_word in word_count:
                        word_count[my_word] += 1
                    else:
                        word_count[my_word] = 1
    return word_count

In [22]:
path = r"C:\Users\Abhishek\Desktop\myDataSets\text_classification\20_newsgroups"
dictionary=makeDictionary(path)
sorted_dictionary=sorted(dictionary.items(), key=operator.itemgetter(1),reverse=True)

In [129]:
vocab=list()
for i in range(12000):
    vocab.append(sorted_dictionary[i][0]) # vocab contains top 12001 words

In [25]:
def fit( path , vocab ):
    train_data = {}
    total_doc_count = 0  #count of total documents in all the folder
    for folder in os.listdir(path):
        train_data[folder] = {}
        doc_count = 0  #count of documents in a folder
        total_words = 0  #count of words in a folder 
        path_for_files_in_folder = path + "\\" + folder
        for file in os.listdir(path_for_files_in_folder):
            doc_count += 1
            absolute_file_path = path_for_files_in_folder + "\\" + file
            doc_file = open( absolute_file_path, "r" , errors = "ignore" )
            doc_text = re.split(' |,|-|\n' , doc_file.read().strip() ) # splits the document about these characters
            for word in doc_text:
                if word.isalpha():  #check if word is alphabet or not
                    my_word = wnl.lemmatize(word.strip(" ").translate(table).lower())
                    if my_word in vocab: #check if the word is present in vocab or not
                        total_words += 1
                        if my_word in train_data[folder]:
                            train_data[folder][my_word] += 1
                        else:
                            train_data[folder][my_word] = 1
        total_doc_count += doc_count
        train_data[folder]["doc_count"] = doc_count
        train_data[folder]["total_words"] = total_words
    train_data["total_doc_count"] = total_doc_count
    return train_data

In [26]:
def probability(train_data , current_class , test_data):
    class_prob = np.log(train_data[current_class]["doc_count"])-np.log(train_data["total_doc_count"])
    for word in test_data:
        if word == "total_words":
            continue
        else:
            if word in train_data[current_class]:
                word_count_in_current_class = train_data[current_class][word] + 1 # 1 is added as laplase correction
            else:
                word_count_in_current_class = 1
            total_words_in_current_class = train_data[current_class]["total_words"] + test_data["total_words"] # laplase correction
            word_prob = np.log(word_count_in_current_class) - np.log(total_words_in_current_class)
            class_prob = class_prob + word_prob
    return class_prob

In [27]:
def predict(train_data,test_data):
    best_prob = -1000
    best_class = ""
    first_run = True
    classes = train_data.keys()
    for current_class in classes:
        if current_class == "total_doc_count":
            continue
        prob = probability(train_data , current_class , test_data)
        #print("prob ",prob,"current_class ",current_class)
        if first_run or (prob > best_prob):
            best_prob = prob
            best_class = current_class
        first_run = False
    return best_class

In [28]:
def testData( path , vocab ):
    test_data={}
    test_data["total_words"]=0
    #for file in os.listdir(path):
        #file_path = path + "\\" + file
    doc_file = open( path, "r" , errors = "ignore" )
    doc_text = re.split(' |,|-|\n' , doc_file.read().strip() )
    for word in doc_text:
        if word.isalpha():
            my_word = wnl.lemmatize(word.strip(" ").translate(table).lower())
            if my_word in vocab:
                if my_word in test_data:
                    continue
                else:
                    test_data[my_word] = 1
                    test_data["total_words"] += 1
    return test_data

In [131]:
train_data_path = r"C:\Users\Abhishek\Desktop\myDataSets\text_classification\20_newsgroups"
test_data_path = r"C:\Users\Abhishek\Desktop\myDataSets\text_classification\mini_newsgroups"

In [30]:
train_data = fit(train_data_path , vocab)

In [132]:
count=1
Y_pred = list()
for folder in os.listdir(test_data_path):
    file_path = test_data_path + "\\" + folder
    for file in os.listdir(file_path):
        file_text_path = file_path + "\\" + file 
        test_data = testData( file_text_path , vocab)
        predicted_class = predict(train_data , test_data)
        #print(predicted_class)
        if predicted_class == folder:
            Y_pred.append(int(1))
        else:
            Y_pred.append(int(0))

In [133]:
one = list()
for i in range(len(Y_pred)):
    one.append(int(1))
Y_pred = np.array(Y_pred)
one = np.array(one)
print("my_socre = ",np.mean(Y_pred == one))

my_socre =  0.8855


In [78]:
# function to to make the dictionary so as to convert this dictionary into dataframe..
def makeData(path , features):
    train_data = {}
    for folder in os.listdir(path):
        s=time()
        file_path = path + "\\" + folder
        for file in os.listdir(file_path):
            train_data[file] = {}
            for w in features:
                train_data[file][w] = 0
            text_path = file_path + "\\" + file
            doc_text = open( text_path, "r" , errors = "ignore" )
            doc_words = re.split(' |,|-|\n' , doc_text.read().strip() )
            for word in doc_words:
                if word.isalpha():
                    my_word = wnl.lemmatize(word.strip(" ").translate(table).lower())
                    if my_word in stop_words:
                        continue
                    if my_word in features:
                        if my_word in train_data[file]:
                            train_data[file][my_word] += 1
                        else:
                            train_data[file][my_word] = 1
            train_data[file]["zz_class"] = folder
        e = time()
        print(e-s)
    return train_data

In [126]:
features={}
for i in range(12000):
    features[sorted_dictionary[i][0]] = 1 # features contains top 2000 words

In [109]:
train_data_path = r"C:\Users\Abhishek\Desktop\myDataSets\text_classification\20_newsgroups"
test_data_path2 = r"C:\Users\Abhishek\Desktop\myDataSets\text_classification\mini_newsgroups"

In [None]:
#train_data2 = makeData(train_data_path , features)
train_df = pd.DataFrame(train_data2).T.fillna(0)
#test_data2 = makeData(test_data_path2 , features)
#test_df = pd.DataFrame(test_data2).T.fillna(0)
#train_df.shape
#test_df.shape

In [100]:
clf = MultinomialNB().fit(train_df.iloc[:,0:12000], train_df.iloc[:,12000])

In [125]:
y_pred = clf.predict(test_df.iloc[:,0:12000])
print("socre = " , np.mean(y_pred == test_df.iloc[: , 12000]))

socre =  0.772773797339
