In [3]:
import numpy as np
import os
import pandas as pd
import string
from nltk.corpus import stopwords
from collections import Counter

In [4]:
PATH='20_newsgroups'

In [5]:
list_classes=os.listdir(PATH)

In [6]:
print (list_classes)

['sci.space', 'comp.sys.ibm.pc.hardware', 'rec.motorcycles', 'talk.politics.guns', 'misc.forsale', 'alt.atheism', 'talk.politics.mideast', 'comp.os.ms-windows.misc', 'rec.sport.baseball', 'comp.windows.x', 'sci.electronics', 'talk.religion.misc', 'sci.crypt', 'sci.med', 'comp.sys.mac.hardware', 'talk.politics.misc', 'rec.sport.hockey', 'rec.autos', 'comp.graphics', 'soc.religion.christian']


In [7]:
stop_words=stopwords.words('english')
stop_words.extend(string.punctuation)
stop_words.extend(['1','2','3','4','5','6','7','8','9','one','two','three','four','five','six','seven','eight','nine','----------------------------------------------------------------------',"max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>ax>'"])

## DATA CLEANING

In [8]:
for current_class in list_classes:
    files_path=os.path.join(PATH,current_class)
    print (current_class,":",len(os.listdir(files_path)))

sci.space : 1000
comp.sys.ibm.pc.hardware : 1000
rec.motorcycles : 1000
talk.politics.guns : 1000
misc.forsale : 1000
alt.atheism : 1000
talk.politics.mideast : 1000
comp.os.ms-windows.misc : 1000
rec.sport.baseball : 1000
comp.windows.x : 1000
sci.electronics : 1000
talk.religion.misc : 1000
sci.crypt : 1000
sci.med : 1000
comp.sys.mac.hardware : 1000
talk.politics.misc : 1000
rec.sport.hockey : 1000
rec.autos : 1000
comp.graphics : 1000
soc.religion.christian : 997


## We split the list of documents and their class into train and test as we don't want to use test documents for creating the vocabulary

In [9]:
files_class_list=[]

for current_class in list_classes:
    files_path=os.path.join(PATH,current_class)
    for current_class_file in os.listdir(files_path):
        current_file_path=os.path.join(files_path,current_class_file)
        current_file_class_name=[]
        current_file_class_name.append(current_class)
        current_file_class_name.append(current_class_file)
        
        files_class_list.append(current_file_class_name)

In [10]:
len(files_class_list)

19997

## CLEANING ALL THE DOCUMENTS INITIALLY

In [106]:
def cleanWords(text):
    
    words=text.strip().split()
    final_words_list=[]

    for word in words:
        if ((word.lower() not in stop_words) and (len(word)>5)):
            final_words_list.append(word.lower())
                    
    return final_words_list

In [107]:
def readLines(list_lines):
    
    final_words_list=[]
    
    for current_line in list_lines:
        line_to_clean=current_line
        
        if (current_line[0]=="\'" and current_line[len(current_line)-1]=="\'"):
                line_to_clean=current_line[1:len(current_line)-1]
        elif (current_line[0]=="\'"):
                line_to_clean=current_line[1:]
        elif (current_line[len(current_line)-1]=="\'"):
            line_to_clean=current_line[:len(current_line)-1]
            
        final_words_list.extend(cleanWords(line_to_clean))
    
    return final_words_list

In [117]:
clean_files_list=[]

for current_file in files_class_list:
    current_file_path=os.path.join(os.path.join(PATH,current_file[0]),current_file[1])
        
    current_file_class_words=[]
    current_file_class_words.append(current_file[0])
    try:
        file=open(current_file_path,'r')
        current_file_class_words.append(readLines(file.readlines()))
            
    except:
        file=open(current_file_path,'r',encoding='latin-1')
        current_file_class_words.append(readLines(file.readlines()))
    
    clean_files_list.append(current_file_class_words)

In [123]:
import random

In [125]:
# Shuffled the list of documents so that while splitting we get data points from each class

random.shuffle(clean_files_list)

In [126]:
train_files_list=clean_files_list[:15000]
test_files_list=clean_files_list[15000:]

In [127]:
print (len(train_files_list))
print (len(test_files_list))

15000
4997


In [None]:
def readWords(text):
    
    final_words_list=[]
    for word in text:
        final_words_list.append(word)
    return final_words_list

In [None]:
final_words_list=[]
for current_class,current_file_words in train_files_list:
    final_words_list.extend(readWords(current_file_words))

In [None]:
words_frequency_list_new=Counter(final_words_list_new).most_common(5000)

In [140]:
words_frequency_list

[('subject:', 15404),
 ('newsgroups:', 15074),
 ('message-id:', 15048),
 ('lines:', 15030),
 ('organization:', 14451),
 ('writes:', 10546),
 ('references:', 9403),
 ('article', 9089),
 ('sender:', 8154),
 ('nntp-posting-host:', 6509),
 ('people', 6440),
 ('university', 6157),
 ('cantaloupe.srv.cs.cmu.edu', 4520),
 ('distribution:', 3334),
 ('anyone', 2977),
 ('really', 2664),
 ('believe', 2495),
 ('computer', 2403),
 ('system', 2403),
 ('something', 2369),
 ('reply-to:', 2322),
 ('please', 2248),
 ('without', 2106),
 ('someone', 1962),
 ('information', 1948),
 ('government', 1938),
 ('better', 1938),
 ('problem', 1880),
 ('another', 1877),
 ('number', 1858),
 ('things', 1791),
 ("that's", 1780),
 ("max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'", 1769),
 ('little', 1732),
 ('available', 1689),
 ('windows', 1650),
 ('question', 1618),
 ('different', 1617),
 ('probably', 1584),
 ('around', 1581),
 ('(usenet', 1578),
 ('public', 1514),
 ('program', 1502),
 ('software', 1492

In [131]:
len(final_words_list)

1637623

In [18]:
# taking top 5000 words as our vocabulary

words_frequency_list=Counter(final_words_list).most_common(5000)

In [19]:
words_frequency_list

[('subject:', 15404),
 ('newsgroups:', 15074),
 ('message-id:', 15048),
 ('lines:', 15030),
 ('organization:', 14451),
 ('writes:', 10546),
 ('references:', 9403),
 ('article', 9089),
 ('sender:', 8154),
 ('nntp-posting-host:', 6509),
 ('people', 6440),
 ('university', 6157),
 ('cantaloupe.srv.cs.cmu.edu', 4520),
 ('distribution:', 3334),
 ('anyone', 2977),
 ('really', 2664),
 ('believe', 2495),
 ('computer', 2403),
 ('system', 2403),
 ('something', 2369),
 ('reply-to:', 2322),
 ('please', 2248),
 ('without', 2106),
 ('someone', 1962),
 ('information', 1948),
 ('government', 1938),
 ('better', 1938),
 ('problem', 1880),
 ('another', 1877),
 ('number', 1858),
 ('things', 1791),
 ("that's", 1780),
 ("max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'", 1769),
 ('little', 1732),
 ('available', 1689),
 ('windows', 1650),
 ('question', 1618),
 ('different', 1617),
 ('probably', 1584),
 ('around', 1581),
 ('(usenet', 1578),
 ('public', 1514),
 ('program', 1502),
 ('software', 1492

In [20]:
vocabulary=[]

for word_freq in words_frequency_list:
        vocabulary.append(word_freq[0])

## TRAINING

In [61]:
def readWords_for_making_dictionary(text,current_class,train_word_freq_dict):
    
    words=text.strip().split()
    
    for word in words:
        train_word_freq_dict[current_class]['total_class_words']+=1
        if (word in train_word_freq_dict[current_class]):
            train_word_freq_dict[current_class][word.lower()]+=1
                    
    return train_word_freq_dict

In [62]:
def readLines_for_making_dictionary(lines_list,current_class,train_word_freq_dict):
    
    for current_line in lines_list:
        line_to_clean=current_line
        
        if (current_line[0]=="\'" and current_line[len(current_line)-1]=="\'"):
            line_to_clean=current_line[1:len(current_line)-1]
            
        elif (current_line[0]=="\'"):
            line_to_clean=current_line[1:]
            
        elif (current_line[len(current_line)-1]=="\'"):
            line_to_clean=current_line[:len(current_line)-1]
            
        train_word_freq_dict=readWords_for_making_dictionary(line_to_clean,current_class,train_word_freq_dict)
    
    return train_word_freq_dict

In [63]:
def fit():
    train_word_freq_dict={}
    train_word_freq_dict['total_train_files']=len(train_files_list)
    
    for current_class,current_file in train_files_list:
        if ((current_class in train_word_freq_dict)==False):
            train_word_freq_dict[current_class]={}
            
            for word in vocabulary:
                train_word_freq_dict[current_class][word]=0
            
            train_word_freq_dict[current_class]['total_class_words']=0
            train_word_freq_dict[current_class]['total_train_files']=0
            
        current_file_path=os.path.join(os.path.join(PATH,current_class),current_file)
        
        train_word_freq_dict[current_class]['total_train_files']+=1
        
        try:
            file=open(current_file_path,'r')
            train_word_freq_dict=readLines_for_making_dictionary(file.readlines(),current_class,train_word_freq_dict)
            
        except:
            file=open(current_file_path,'r',encoding='latin-1')
            train_word_freq_dict=readLines_for_making_dictionary(file.readlines(),current_class,train_word_freq_dict)
        
    return train_word_freq_dict

## TESTING

In [64]:
def get_current_line_probability(text,current_class,train_dict):
    
    words=text.strip().split()
    first_term=0
    for word in words:
        if (word in train_dict[current_class]):
            numerator=np.log(train_dict[current_class][word]+1)
            denominator=np.log(train_dict[current_class]['total_class_words']+len(vocabulary))
            first_term+=(numerator-denominator)
    
    return first_term

In [65]:
def readLines_for_prediction(lines_list,train_dict):
    
    best_class_probability=-1000
    best_class=-1
    first_run=True
    
    for current_class in list_classes:
        
        current_class_probability=(np.log(train_dict[current_class]['total_train_files'])-np.log(train_dict['total_train_files']))
        
        for current_line in lines_list:
            line_to_clean=current_line
        
            if (current_line[0]=="\'" and current_line[len(current_line)-1]=="\'"):
                line_to_clean=current_line[1:len(current_line)-1]
            
            elif (current_line[0]=="\'"):
                line_to_clean=current_line[1:]
            
            elif (current_line[len(current_line)-1]=="\'"):
                line_to_clean=current_line[:len(current_line)-1]
            
            current_class_probability+=get_current_line_probability(line_to_clean,current_class,train_dict)
        
        if (first_run or current_class_probability > best_class_probability):
            best_class=current_class
            best_class_probability=current_class_probability
        
        first_run=False
            
    return best_class

In [66]:
def predict(train_dict,files_list):
    class_predicted=[]
    
    for current_class,current_file in files_list:
        
        current_file_path=os.path.join(os.path.join(PATH,current_class),current_file)
        
        try:
            file=open(current_file_path,'r')
            class_predicted.append(readLines_for_prediction(file.readlines(),train_dict))
            
        except:
            file=open(current_file_path,'r',encoding='latin-1')
            class_predicted.append(readLines_for_prediction(file.readlines(),train_dict))
            
    return class_predicted

In [67]:
train_dict=fit()

In [68]:
yPredicted=predict(train_dict,test_files_list)

In [69]:
len(yPredicted)

4997

In [53]:
from sklearn.metrics import classification_report,confusion_matrix

In [70]:
yTest=[]

for current_class,file in test_files_list:
        yTest.append(current_class)
    

In [71]:
print (classification_report(yTest,yPredicted))

                          precision    recall  f1-score   support

             alt.atheism       0.63      0.87      0.73       230
           comp.graphics       0.72      0.82      0.77       259
 comp.os.ms-windows.misc       0.89      0.77      0.83       252
comp.sys.ibm.pc.hardware       0.84      0.89      0.86       261
   comp.sys.mac.hardware       0.87      0.89      0.88       258
          comp.windows.x       0.88      0.86      0.87       254
            misc.forsale       0.86      0.77      0.81       238
               rec.autos       0.93      0.86      0.89       231
         rec.motorcycles       0.98      0.91      0.94       246
      rec.sport.baseball       0.99      0.96      0.98       259
        rec.sport.hockey       1.00      0.94      0.97       254
               sci.crypt       0.80      0.93      0.86       261
         sci.electronics       0.88      0.89      0.89       240
                 sci.med       0.92      0.93      0.93       248
         

In [72]:
print (confusion_matrix(yTest,yPredicted))

[[199   2   0   0   0   0   0   0   0   0   0   1   0   0   0   2   0   2
    3  21]
 [  1 213  10   3   6   9   1   0   1   0   0   5   3   2   3   0   0   0
    1   1]
 [  1  15 195  12   4  15   1   0   0   0   0   4   0   0   4   0   1   0
    0   0]
 [  1   6   1 231   8   3   5   1   0   0   0   1   2   1   0   0   0   0
    1   0]
 [  0   7   2  10 230   0   4   0   0   0   0   1   3   0   0   0   0   0
    0   1]
 [  1  17   6   2   2 219   0   0   1   0   0   2   1   1   1   0   0   1
    0   0]
 [  0   9   2   9   9   1 183   4   1   0   0   7   5   3   3   1   0   1
    0   0]
 [  1   1   0   1   0   0   5 198   0   0   0   7   2   0   2   0   8   0
    6   0]
 [  2   1   0   0   0   0   2   5 223   0   0   4   3   0   0   2   3   0
    1   0]
 [  2   0   0   0   0   0   3   0   0 249   1   1   2   0   0   0   0   0
    1   0]
 [  0   0   0   0   1   0   0   0   0   2 239   1   0   1   0   3   0   0
    7   0]
 [  1   7   0   1   1   0   1   0   0   0   0 242   1   0   1   0

In [73]:
yPredicted_train=predict(train_dict,train_files_list)

In [74]:
len(yPredicted_train)

15000

In [75]:
yTrain=[]

for current_class,file in train_files_list:
        yTrain.append(current_class)
    

In [76]:
print (classification_report(yTrain,yPredicted_train))

                          precision    recall  f1-score   support

             alt.atheism       0.76      0.95      0.84       770
           comp.graphics       0.82      0.94      0.88       741
 comp.os.ms-windows.misc       0.94      0.86      0.90       748
comp.sys.ibm.pc.hardware       0.89      0.95      0.92       739
   comp.sys.mac.hardware       0.95      0.95      0.95       742
          comp.windows.x       0.93      0.91      0.92       746
            misc.forsale       0.92      0.85      0.88       762
               rec.autos       0.96      0.93      0.94       769
         rec.motorcycles       1.00      0.94      0.97       754
      rec.sport.baseball       1.00      0.98      0.99       741
        rec.sport.hockey       1.00      0.97      0.98       746
               sci.crypt       0.88      0.97      0.92       739
         sci.electronics       0.95      0.95      0.95       760
                 sci.med       0.95      0.94      0.94       752
         

## USING INBUILT MULTINOMIAL NAIVE BAYES

In [179]:
def readWords_for_X(text):
    words=text.strip().split()
    clean_words=[]
    
    for word in words:
        if (word not in stop_words and len(word)>5):
                clean_words.append(word)
    return clean_words

In [180]:
def readLines_for_X(lines_list):
    
    clean_words=[]
    
    for current_line in lines_list:
        line_to_clean=current_line
        
        if (current_line[0]=="\'" and current_line[len(current_line)-1]=="\'"):
            line_to_clean=current_line[1:len(current_line)-1]
            
        elif (current_line[0]=="\'"):
            line_to_clean=current_line[1:]
            
        elif (current_line[len(current_line)-1]=="\'"):
            line_to_clean=current_line[:len(current_line)-1]
            
        clean_words.extend(readWords_for_X(line_to_clean))
    
    return clean_words

In [181]:
def get_data_for_inbuilt():

    y=[]
    x=[]

    for current_class,current_file in files_class_list:
        
        current_file_path=os.path.join(os.path.join(PATH,current_class),current_file)
        y.append(current_class)
        
        try:
            file=open(current_file_path,'r')
            x.append(readLines_for_X(file.readlines()))
            
        except:
            file=open(current_file_path,'r',encoding='latin-1')
            x.append(readLines_for_X(file.readlines()))
                     
    return x,y

In [182]:
X,Y=get_data_for_inbuilt()

In [183]:
len(X),len(Y)

(19997, 19997)

In [186]:
from sklearn.model_selection import train_test_split

In [187]:
xTrain,xTest,yTrain,yTest=train_test_split(X,Y)

In [188]:
xTrain=[" ".join(currDoc) for currDoc in xTrain]
xTest=[" ".join(currDoc) for currDoc in xTest]

In [192]:
from sklearn.feature_extraction.text import CountVectorizer

In [198]:
count_vec=CountVectorizer(max_features=5000)
xTrain_sklearn_format=count_vec.fit_transform(xTrain)

In [199]:
count_vec.get_feature_names()

['00',
 '000',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '0d',
 '0g',
 '0q',
 '0t',
 '10',
 '100',
 '1024x768',
 '11',
 '12',
 '128',
 '129',
 '13',
 '130',
 '14',
 '144',
 '145',
 '147',
 '15',
 '150',
 '16',
 '17',
 '18',
 '19',
 '192',
 '1992',
 '1993',
 '1993apr1',
 '1993apr10',
 '1993apr12',
 '1993apr13',
 '1993apr14',
 '1993apr15',
 '1993apr16',
 '1993apr17',
 '1993apr18',
 '1993apr19',
 '1993apr2',
 '1993apr20',
 '1993apr21',
 '1993apr22',
 '1993apr23',
 '1993apr24',
 '1993apr25',
 '1993apr26',
 '1993apr27',
 '1993apr28',
 '1993apr29',
 '1993apr3',
 '1993apr30',
 '1993apr4',
 '1993apr5',
 '1993apr6',
 '1993apr7',
 '1993mar31',
 '1993may12',
 '1993may13',
 '1d',
 '1d9',
 '1qvfik',
 '1t',
 '20',
 '200',
 '202',
 '21',
 '214',
 '22',
 '23',
 '24',
 '241',
 '24e',
 '25',
 '250',
 '256',
 '26',
 '27',
 '28',
 '29',
 '2di',
 '2pl',
 '2tm',
 '30',
 '300',
 '301',
 '31',
 '32',
 '32bis',
 '33',
 '34',
 '34u',
 '35',
 '36',
 '37',
 '38',
 '386',
 '386bsd',
 '39',
 '

In [200]:
xTest_sklearn_format=count_vec.transform(xTest)

In [201]:
from sklearn.svm import SVC

In [202]:
svc=SVC()
svc.fit(xTrain_sklearn_format,yTrain)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [203]:
svc.score(xTest_sklearn_format,yTest)

0.8784

In [204]:
from sklearn.naive_bayes import MultinomialNB

In [205]:
clf_mnb = MultinomialNB()

In [207]:
clf_mnb.fit(xTrain_sklearn_format,yTrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [208]:
clf_mnb.score(xTest_sklearn_format,yTest)

0.8386

In [211]:
yPredicted=clf_mnb.predict(xTest_sklearn_format)

In [213]:
print (confusion_matrix(yTest,yPredicted))

[[269   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0 288   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0  76   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0 329   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0 290   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0 234   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0 286   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   0 269   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   0   0 256   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   0   0   0 244   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   0   0   0   0 264   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   0   0   0   0   0 231   0   0   0   0

In [212]:
print (classification_report(yTest,yPredicted))

                          precision    recall  f1-score   support

             alt.atheism       1.00      1.00      1.00       269
           comp.graphics       1.00      1.00      1.00       288
 comp.os.ms-windows.misc       1.00      1.00      1.00        76
comp.sys.ibm.pc.hardware       1.00      1.00      1.00       329
   comp.sys.mac.hardware       1.00      1.00      1.00       290
          comp.windows.x       1.00      1.00      1.00       234
            misc.forsale       1.00      1.00      1.00       286
               rec.autos       1.00      1.00      1.00       269
         rec.motorcycles       1.00      1.00      1.00       256
      rec.sport.baseball       1.00      1.00      1.00       244
        rec.sport.hockey       1.00      1.00      1.00       264
               sci.crypt       1.00      1.00      1.00       231
         sci.electronics       1.00      1.00      1.00       263
                 sci.med       1.00      1.00      1.00       210
         