In [1]:
from __future__ import print_function
import os
from nltk.corpus import stopwords
# English stop words from NLTK, held in a set for fast membership tests.
# Requires the NLTK "stopwords" corpus to be available locally.
stopset = set(stopwords.words('english'))
import logging
import numpy as np
import pandas as pd
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt
from collections import Counter
import string
from sklearn.model_selection import train_test_split

In [2]:
# this function fetches every file path under the dataset directory and puts it into the list x
# y holds the corresponding newsgroup label for each path: an integer from 0 to 19, one per newsgroup (20 in total)
def get_data(path):
    """Collect every document path under ``path`` together with its class label.

    ``path`` is expected to contain one sub-directory per newsgroup, each
    holding that group's document files.

    Parameters
    ----------
    path : str
        Root directory of the corpus.

    Returns
    -------
    (list of str, list of int)
        ``x`` is the list of document file paths; ``y`` is the parallel list of
        integer labels, where label ``k`` is the k-th sub-directory in sorted
        name order.
    """
    x = []
    y = []
    # os.listdir returns entries in arbitrary order, so sort to keep the
    # label <-> newsgroup mapping reproducible across runs and machines.
    # Stray non-directory entries (e.g. .DS_Store) are skipped, because they
    # would otherwise crash the inner listdir and consume a label id.
    groups = sorted(
        name for name in os.listdir(path)
        if os.path.isdir(os.path.join(path, name))
    )
    for label, group in enumerate(groups):
        group_dir = os.path.join(path, group)
        for filename in sorted(os.listdir(group_dir)):
            x.append(os.path.join(group_dir, filename))
            y.append(label)
    return x, y

In [3]:
# this function removes punctuation from a document and then tokenizes it
def get_tokens(doc):
    """Strip ASCII punctuation from ``doc`` and split the result into tokens.

    Parameters
    ----------
    doc : str
        Text of one document.

    Returns
    -------
    list of str
        Tokens produced by ``nltk.word_tokenize`` on the punctuation-free text.
    """
    # Mapping a code point to None makes str.translate delete that character.
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))
    stripped = doc.translate(punctuation_table)
    return nltk.word_tokenize(stripped)

In [4]:
# this function tokenizes all documents into one combined list (token_data) and also puts the tokens of every
# individual newsgroup into a separate list (token_list)
def tokenization(x_train, y_train, n_classes=20):
    """Tokenize every training document, pooled and grouped by class.

    Each file is read, lower-cased, stripped of punctuation and tokenized via
    ``get_tokens``, then filtered against the module-level ``stopset``.

    Parameters
    ----------
    x_train : sequence of str
        Paths of the training documents.
    y_train : sequence of int
        Class label of each document, parallel to ``x_train``; every label
        must be in ``range(n_classes)``.
    n_classes : int, optional
        Number of class buckets to create (default 20).

    Returns
    -------
    (list of str, list of list of str)
        ``token_data`` holds the filtered tokens of all documents in order;
        ``token_list[k]`` holds the filtered tokens of the documents labelled k.
    """
    token_data = []
    token_list = [[] for _ in range(n_classes)]
    for i, doc_path in enumerate(x_train):
        # Context manager closes the handle; the original leaked one file
        # descriptor per document.
        with open(doc_path, 'r') as handle:
            doc = handle.read()
        tokens = get_tokens(doc.lower())
        filter_token = [w for w in tokens if w not in stopset]
        token_data.extend(filter_token)
        token_list[y_train[i]].extend(filter_token)
    return token_data, token_list

In [5]:
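# this function creates a 2-D NumPy array of shape 20 x 2001: columns 0-1999 store the frequency of each feature word
# for each individual newsgroup, and the last column stores the sum of those frequencies for that newsgroup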
def get_dictionary(token_list, feature_name, n_classes=20, n_features=2000):
    """Build the per-class feature-word count matrix.

    Parameters
    ----------
    token_list : sequence of list of str
        ``token_list[k]`` is the list of tokens seen in class ``k``; it must
        have at least ``n_classes`` entries.
    feature_name : list of str
        Feature vocabulary; a word's position in this list is its column.
        Must not contain more than ``n_features`` entries that actually occur.
    n_classes : int, optional
        Number of rows / classes (default 20).
    n_features : int, optional
        Number of feature columns (default 2000).

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_classes, n_features + 1)``. Column ``j``
        (``j < n_features``) is how often feature word ``j`` occurs in the
        class; the last column is the sum of the row's feature counts.
    """
    # One hash lookup per token instead of two linear scans of the feature
    # list (`in` + `.index`). setdefault keeps the FIRST position of a
    # repeated word, matching list.index.
    column_of = {}
    for position, word in enumerate(feature_name):
        column_of.setdefault(word, position)

    counts = np.zeros((n_classes, n_features), dtype=int)
    for class_id in range(n_classes):
        row = counts[class_id]  # view: updates write through to `counts`
        for word in token_list[class_id]:
            column = column_of.get(word)
            if column is not None:
                row[column] += 1

    # Extra trailing column: total number of feature-word occurrences per class.
    totals = counts.sum(axis=1).reshape(n_classes, 1)
    return np.append(counts, totals, axis=1)

In [6]:
# this function selects the 2000 most frequent words from the word counts
def get_feature_name(count):
    """Return the words with the highest counts, most frequent first.

    Parameters
    ----------
    count : collections.Counter
        Word -> occurrence count.

    Returns
    -------
    list of str
        At most 2000 words, ordered as ``Counter.most_common`` returns them.
    """
    ranked = count.most_common(2000)
    feature_name = []
    for word, _freq in ranked:
        feature_name.append(word)
    return feature_name

In [7]:
# this function calculates the log-probability score of a document's words for one class, with Laplace correction
def probablity(dictionary, x, feature_name, current_class):
    """Log-score of a document under one class, with Laplace (add-one) smoothing.

    Starts from the uniform log-prior ``log(1) - log(20)`` and, for every
    distinct word of the document that is a feature word, adds
    ``log(count + 1) - log(class_total + 2000)``. Repeated words are counted
    once; non-feature words are ignored.

    Parameters
    ----------
    dictionary : numpy.ndarray
        Count matrix whose row ``current_class`` holds the per-feature counts
        and whose last column holds the class total.
    x : iterable of str
        Tokens of the document.
    feature_name : list of str
        Feature vocabulary; a word's position is its column in ``dictionary``.
    current_class : int
        Row of ``dictionary`` to score against.

    Returns
    -------
    numpy.float64
        The accumulated log-score (higher means more likely).
    """
    # Hash lookup replaces the two linear scans (`in` + `.index`) the list
    # needed per word; setdefault keeps the first position, like list.index.
    column_of = {}
    for position, word in enumerate(feature_name):
        column_of.setdefault(word, position)

    class_row = dictionary[current_class]
    # Loop-invariant: smoothed denominator. The 2000 is the hard-coded
    # vocabulary size, and the 20 below the hard-coded number of classes.
    log_denominator = np.log(class_row[dictionary.shape[1] - 1] + 2000)

    output = np.log(1) - np.log(20)
    seen = set()
    # Iterate in document order so the floating-point accumulation order is
    # unchanged from a plain first-occurrence walk.
    for word in x:
        if word in seen:
            continue
        seen.add(word)
        column = column_of.get(word)
        if column is not None:
            output += np.log(class_row[column] + 1) - log_denominator
    return output

In [8]:
# this function tokenizes the words of a single file
def tokenize_word(path):
    """Read one document and return its filtered, lower-cased tokens.

    The text is lower-cased, stripped of punctuation and tokenized via
    ``get_tokens``; tokens present in the module-level ``stopset`` are dropped.

    Parameters
    ----------
    path : str
        Path of the document file.

    Returns
    -------
    list of str
        Tokens of the document in order, without stop words.
    """
    # Context manager closes the handle; the original never closed the file.
    with open(path, 'r') as handle:
        doc = handle.read()
    tokens = get_tokens(doc.lower())
    filter_token = [w for w in tokens if w not in stopset]
    return filter_token

In [9]:
# this function provides the predicted y value (class label) for every test file
def predict(dictionary, feature_name, x_test):
    """Predict a class label for every document in ``x_test``.

    Each document is tokenized with ``tokenize_word`` and scored against all
    20 classes with ``probablity``; the first class with the strictly highest
    score wins.

    Parameters
    ----------
    dictionary : numpy.ndarray
        Per-class count matrix passed through to ``probablity``.
    feature_name : list of str
        Feature vocabulary passed through to ``probablity``.
    x_test : sequence of str
        Paths of the documents to classify.

    Returns
    -------
    list of int
        One predicted label per document; ``-1`` if no class scores above
        the ``-1e9`` starting threshold.
    """
    y_pred = []
    for position in range(len(x_test)):
        words = tokenize_word(x_test[position])
        scores = [
            probablity(dictionary, words, feature_name, class_id)
            for class_id in range(20)
        ]
        winner, top_score = -1, -1e9
        for class_id, score in enumerate(scores):
            # Strict '>' keeps the earliest class on ties.
            if score > top_score:
                winner, top_score = class_id, score
        y_pred.append(winner)
    return y_pred

In [10]:
# Corpus location, given relative to the notebook's working directory.
path='../datasets/20_newsgroups'
# x: list of document file paths; y: parallel list of integer newsgroup labels.
x,y=get_data(path)

In [11]:
# Hold out 25% of the documents for testing; random_state=42 pins the shuffle so the split is repeatable.
x_train,x_test,y_train,y_test=train_test_split(x,y,shuffle=True,test_size=0.25,random_state=42)

In [12]:
# Tokenize the training files: token_data pools every token, token_list groups tokens by class label.
token_data,token_list=tokenization(x_train,y_train)

In [13]:
# Count how often each token occurs across the whole training set.
count=Counter(token_data)

In [14]:
# Keep the most frequent training tokens (at most 2000) as the feature vocabulary.
feature_name=get_feature_name(count)

In [15]:
# Per-class feature-word counts; the last column holds each class's total.
dictionary=get_dictionary(token_list,feature_name)

In [16]:
# Predict a class label for every held-out document.
y_pred=predict(dictionary,feature_name,x_test)

In [17]:
# NOTE(review): sklearn.metrics is already imported as `metrics` in the first cell;
# this late import could move there so all dependencies are declared in one place.
from sklearn.metrics import classification_report,confusion_matrix
# Per-class precision / recall / F1 on the held-out documents.
print(classification_report(y_test,y_pred))
# Confusion matrix of true labels (y_test) against predicted labels (y_pred).
print(confusion_matrix(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.71      0.75      0.73       237
          1       0.80      0.77      0.78       237
          2       0.87      0.81      0.84       271
          3       0.88      0.82      0.85       247
          4       0.86      0.91      0.88       253
          5       0.97      0.83      0.89       237
          6       0.75      0.88      0.81       256
          7       0.80      0.93      0.86       260
          8       0.83      0.95      0.88       255
          9       0.90      0.96      0.93       246
         10       0.99      0.87      0.92       240
         11       0.95      0.83      0.88       255
         12       0.70      0.92      0.80       259
         13       0.89      0.90      0.89       276
         14       0.90      0.90      0.90       253
         15       0.98      0.99      0.98       243
         16       0.72      0.83      0.77       244
         17       0.94      0.81      0.87   