In [None]:
import os
import numpy as np

### READING DOCUMENTS FROM NEWSGROUPS

In [None]:
docs=[]             #storing text in each document
classes=[]          #storing the class(newsgroup) of each document

basepath = None       #Enter path of folder where data is stored
if basepath is None:
    # Fail early with a readable message instead of an opaque TypeError from the os module.
    raise ValueError("Set `basepath` to the folder that contains one sub-folder per newsgroup")

# The working directory is deliberately left untouched: every path is built with
# os.path.join, so a relative `basepath` keeps working and re-running the cell is safe.
# sorted() makes the document order reproducible (os.listdir returns arbitrary order).
folders=sorted(os.listdir(basepath))

for folder in folders:
    current_path=os.path.join(basepath,folder)
    if not os.path.isdir(current_path):
        continue    #stray files next to the class folders are not newsgroups
    for doc in sorted(os.listdir(current_path)):
        doc_path=os.path.join(current_path,doc)
        if not os.path.isfile(doc_path):
            continue
        # latin-1 can decode any byte sequence, so a post that is not valid in the platform
        # default encoding no longer aborts the load.
        # NOTE(review): confirm latin-1 suits your copy of the data set.
        with open(doc_path,"r",encoding="latin-1") as f:
            docs.append(f.read())
        classes.append(folder)

### DATA CLEANING AND MAKING VOCABULARY

In [None]:
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import string

# Stop list: NLTK's English stop words followed by every ASCII punctuation character.
punc = list(string.punctuation)
stops = stopwords.words('english')
stops.extend(punc)

# Tokenize every raw document; `words` holds one token list per document.
words = list(map(word_tokenize, docs))

#cleaning words

def get_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    The decision is made on the first character of `tag`: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty string) falls back to noun.
    """
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_prefix.get(tag[:1], wordnet.NOUN)
    
lemmatizer = WordNetLemmatizer()
stop_set = set(stops)     # set membership is O(1); `stops` itself is left unchanged
clean_words = []
for doc in words:
    curr_doc = []
    # Tag the whole token list in one call: the tagger sees each word in context and it is
    # one call per document instead of one per token. Tagging uses the original casing.
    for w, tag in pos_tag(doc):
        lowered = w.lower()
        # Lower-case BEFORE filtering and lemmatizing: the stop list is lower-case, so
        # "This"/"There" previously slipped through, and capitalized words were not
        # reduced to their lemma.
        if(lowered not in stop_set and len(lowered) > 3 and lowered.isalpha()):
            simple_word = lemmatizer.lemmatize(lowered, pos = get_pos(tag))
            curr_doc.append(simple_word)
    clean_words.append(curr_doc)


#clean_words contains 2000 entries each corresponding to clean words in a document

In [None]:
from collections import Counter

# `vocabulary` maps every cleaned word to the number of times it occurs in the corpus.
# NOTE(review): the notebook never defined `vocabulary`, so this cell raised NameError on a
# fresh kernel; counting over all cleaned documents is assumed to be the original intent.
vocabulary = Counter(word for doc in clean_words for word in doc)

#sorting dictionary in descending order of words based on frequency into a list of tuples
vocab=sorted(vocabulary.items(), key = lambda x : x[1], reverse=True)
new_vocab=vocab[:20000]   #considering the top 20000 words as features
#assigning an index to each word: the most frequent word gets column 0, the next column 1, ...
features={word: index for index, (word, _) in enumerate(new_vocab)}

In [None]:
from sklearn.model_selection import train_test_split as tts

#converting words in a documents to sentences (for count vectorizer)
new_document = [" ".join(doc) for doc in clean_words]

# A fixed random_state makes the split (and every score reported below) reproducible
# across kernel restarts; without it each run shuffles differently.
x_train, x_test, y_train, y_test = tts(new_document, classes, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 20000, max_df = 0.6)    #max_df = 0.6 means that ignore words that come in more than 60% of the documents

# The vocabulary is learned from the training documents only; the test documents are
# encoded with that same vocabulary.
a = cv.fit_transform(x_train)
# toarray() returns a plain ndarray; todense() returned the legacy np.matrix type,
# which scikit-learn estimators do not accept.
x_train_documents = a.toarray()
x_test_documents = cv.transform(x_test).toarray()

### Converting input data into a 2d array

In [None]:
def transform(x):
    """Turn documents into a dense bag-of-words count matrix.

    `x` is a sequence of strings, each split on whitespace. The result is a
    float array with one row per document and one column per entry of the
    module-level `features` dict (word -> column index); each cell holds the
    number of times that word occurs in the document. Words that are not in
    `features` are ignored.
    """
    counts = np.zeros((len(x), len(features)))
    for row, text in enumerate(x):
        for token in text.split():
            col = features.get(token)
            if col is not None:
                counts[row, col] += 1
    return counts

# Labels become a NumPy array so they can be compared element-wise against a class name.
y_train = np.asarray(y_train)

# Word-count matrices for both splits, using the column order defined by `features`.
x_train_new = transform(x_train)
x_test_new = transform(x_test)

### INBUILT MULTINOMIAL NB

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report

# scikit-learn's MultinomialNB trained on the count matrix, used as the reference result.
clf = MultinomialNB().fit(x_train_new, y_train)
y_pred = clf.predict(x_test_new)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

### SELF IMPLEMENTED MULTINOMIAL NB

In [None]:
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier with add-one (Laplace) smoothing.

    The training data is passed to the constructor; call `fit()` once and then
    `predict()` / `get_output()`.

    Attributes:
        x: 2-D array of per-document word counts (documents x features).
        y: 1-D array of class labels, one per row of `x`.
        words: after `fit()`, maps each class to a dict holding
            "sum"   - total word count of the class,
            "total" - number of training documents in the class,
            and, for every feature index, the count of that word in the class.
    """

    def __init__(self,x_train,y_train):
        # asarray lets plain lists be passed in; boolean masking and .shape need arrays.
        self.x=np.asarray(x_train)
        self.y=np.asarray(y_train)
        self.words={}
        # Filled in by fit(); used for vectorized scoring.
        self._classes=None
        self._log_prior=None
        self._log_likelihood=None

    def fit(self):
        """Collect per-class word counts and precompute the log-probability tables."""
        num_features=self.x.shape[1]
        num_docs=len(self.y)

        # np.unique returns the labels sorted, which gives a deterministic class order
        # (iteration order of a set of strings can change between interpreter runs).
        self._classes=np.unique(self.y)
        self.words={}
        log_prior=[]
        log_likelihood=[]

        for curr_class in self._classes:
            x_curr=self.x[self.y == curr_class]
            feature_counts=x_curr.sum(axis=0)    #count of every word for current class
            class_sum=feature_counts.sum()       #all words in the current class
            class_total=x_curr.shape[0]          #total number of inputs in current class

            stats={"sum":class_sum,"total":class_total}
            stats.update(enumerate(feature_counts))
            self.words[curr_class]=stats

            log_prior.append(np.log(class_total)-np.log(num_docs))
            # Add-one smoothing: (count + 1) / (words in class + number of features).
            log_likelihood.append(np.log(feature_counts+1)-np.log(class_sum+num_features))

        self._log_prior=np.array(log_prior)
        self._log_likelihood=np.array(log_likelihood).reshape(len(self._classes),num_features)

    def get_output(self,x):
        """Return the most probable class for one document.

        `x` is a 1-D vector of word counts with the same feature order as the
        training matrix. The score of a class is its log prior plus the
        count-weighted sum of its smoothed log word probabilities. When several
        classes tie, the first one in sorted label order is returned.

        Raises:
            RuntimeError: if `fit()` has not been called yet.
        """
        if self._log_likelihood is None:
            raise RuntimeError("fit() must be called before get_output()/predict()")

        x=np.asarray(x,dtype=float).ravel()
        scores=self._log_prior+self._log_likelihood.dot(x)
        return self._classes[int(np.argmax(scores))]

    def predict(self,x_test):
        """Return a list with the predicted class for every row of `x_test`."""
        return [self.get_output(x) for x in x_test]

In [None]:
# Hand-written classifier on the same count matrices, scored on the same test split.
clf1 = MultinomialNaiveBayes(x_train_new, y_train)
clf1.fit()

y_pred1 = clf1.predict(x_test_new)
print(classification_report(y_test, y_pred1))