In [1]:
import numpy as np
import os
import pandas as pd
import string
from nltk.corpus import stopwords
from collections import Counter

In [2]:
PATH='20_newsgroups'

In [3]:
list_classes=os.listdir(PATH)

In [4]:
print (list_classes)

['sci.space', 'comp.sys.ibm.pc.hardware', 'rec.motorcycles', 'talk.politics.guns', 'misc.forsale', 'alt.atheism', 'talk.politics.mideast', 'comp.os.ms-windows.misc', 'rec.sport.baseball', 'comp.windows.x', 'sci.electronics', 'talk.religion.misc', 'sci.crypt', 'sci.med', 'comp.sys.mac.hardware', 'talk.politics.misc', 'rec.sport.hockey', 'rec.autos', 'comp.graphics', 'soc.religion.christian']


In [5]:
stop_words=stopwords.words('english')
stop_words.extend(string.punctuation)
stop_words.extend(['1','2','3','4','5','6','7','8','9','one','two','three','four','five','six','seven','eight','nine','----------------------------------------------------------------------',"max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>ax>'"])

## DATA CLEANING

In [6]:
for current_class in list_classes:
    files_path=os.path.join(PATH,current_class)
    print (current_class,":",len(os.listdir(files_path)))

sci.space : 1000
comp.sys.ibm.pc.hardware : 1000
rec.motorcycles : 1000
talk.politics.guns : 1000
misc.forsale : 1000
alt.atheism : 1000
talk.politics.mideast : 1000
comp.os.ms-windows.misc : 1000
rec.sport.baseball : 1000
comp.windows.x : 1000
sci.electronics : 1000
talk.religion.misc : 1000
sci.crypt : 1000
sci.med : 1000
comp.sys.mac.hardware : 1000
talk.politics.misc : 1000
rec.sport.hockey : 1000
rec.autos : 1000
comp.graphics : 1000
soc.religion.christian : 997


## We split the list of documents and their class into train and test as we don't want to use test documents for creating the vocabulary

In [7]:
files_class_list=[]

for current_class in list_classes:
    files_path=os.path.join(PATH,current_class)
    for current_class_file in os.listdir(files_path):
        current_file_path=os.path.join(files_path,current_class_file)
        current_file_class_name=[]
        current_file_class_name.append(current_class)
        current_file_class_name.append(current_class_file)
        
        files_class_list.append(current_file_class_name)

In [8]:
import random

In [9]:
random.shuffle(files_class_list)

In [10]:
train_files_list=files_class_list[:7000]
test_files_list=files_class_list[7000:]

In [11]:
# Gets a line from a file and returns the list of cleaned up words

def cleanWords(text):
    
    words=text.strip().split()
    final_words_list=[]

    for word in words:
        if ((word.lower() not in stop_words) and (len(word)>5)):
            final_words_list.append(word.lower())
                    
    return final_words_list

In [12]:
# Gets the list of lines from a file and returns the list of cleaned up words

def readLines(list_lines):
    
    final_words_list=[]
    
    for current_line in list_lines:
        line_to_clean=current_line
        
        if (current_line[0]=="\'" and current_line[len(current_line)-1]=="\'"):
                line_to_clean=current_line[1:len(current_line)-1]
        elif (current_line[0]=="\'"):
                line_to_clean=current_line[1:]
        elif (current_line[len(current_line)-1]=="\'"):
            line_to_clean=current_line[:len(current_line)-1]
            
        final_words_list.extend(cleanWords(line_to_clean))
    
    return final_words_list

In [13]:
final_words_list=[]

for current_file in train_files_list:
        current_file_path=os.path.join(os.path.join(PATH,current_file[0]),current_file[1])
        
        try:
            file=open(current_file_path,'r')
            final_words_list.extend(readLines(file.readlines()))
            
        except:
            file=open(current_file_path,'r',encoding='latin-1')
            final_words_list.extend(readLines(file.readlines()))

In [14]:
words_frequency_list=Counter(final_words_list).most_common(2000)

In [15]:
words_frequency_list

[('subject:', 7133),
 ('newsgroups:', 7028),
 ('message-id:', 7016),
 ('lines:', 7016),
 ('organization:', 6704),
 ('writes:', 4855),
 ('references:', 4377),
 ('article', 4293),
 ('sender:', 3867),
 ('nntp-posting-host:', 3058),
 ('university', 2878),
 ('people', 2798),
 ('cantaloupe.srv.cs.cmu.edu', 2087),
 ('distribution:', 1510),
 ('anyone', 1322),
 ('really', 1219),
 ('believe', 1145),
 ('something', 1145),
 ('please', 1105),
 ('reply-to:', 1098),
 ("max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'", 1093),
 ('computer', 1079),
 ('system', 989),
 ('someone', 929),
 ('without', 920),
 ('another', 902),
 ('problem', 875),
 ('number', 866),
 ('government', 853),
 ('things', 828),
 ('information', 807),
 ('better', 804),
 ('question', 796),
 ('little', 789),
 ("that's", 766),
 ('(usenet', 734),
 ('windows', 730),
 ('public', 702),
 ('different', 699),
 ('anything', 684),
 ('around', 683),
 ('probably', 676),
 ('available', 666),
 ('program', 659),
 ('support', 651),
 ('sci

In [16]:
vocabulary=[]

for word_freq in words_frequency_list:
        vocabulary.append(word_freq[0])

## TRAINING

In [40]:
def readWords_for_making_dictionary(text,current_class,train_word_freq_dict):
    
    words=text.strip().split()
    
    for word in words:
        train_word_freq_dict[current_class]['total_class_words']+=1
        if (word in train_word_freq_dict[current_class]):
            train_word_freq_dict[current_class][word.lower()]+=1
                    
    return train_word_freq_dict

In [41]:
def readLines_for_making_dictionary(lines_list,current_class,train_word_freq_dict):
    
    for current_line in lines_list:
        line_to_clean=current_line
        
        if (current_line[0]=="\'" and current_line[len(current_line)-1]=="\'"):
            line_to_clean=current_line[1:len(current_line)-1]
            
        elif (current_line[0]=="\'"):
            line_to_clean=current_line[1:]
            
        elif (current_line[len(current_line)-1]=="\'"):
            line_to_clean=current_line[:len(current_line)-1]
            
        train_word_freq_dict=readWords_for_making_dictionary(line_to_clean,current_class,train_word_freq_dict)
    
    return train_word_freq_dict

In [77]:
def fit():
    train_word_freq_dict={}
    train_word_freq_dict['total_train_files']=7000
    
    for current_class,current_file in train_files_list:
        if ((current_class in train_word_freq_dict)==False):
            train_word_freq_dict[current_class]={}
            
            for word in vocabulary:
                train_word_freq_dict[current_class][word]=0
            
            train_word_freq_dict[current_class]['total_class_words']=0
            train_word_freq_dict[current_class]['total_train_files']=0
            
        current_file_path=os.path.join(os.path.join(PATH,current_class),current_file)
        
        train_word_freq_dict[current_class]['total_train_files']+=1
        
        try:
            file=open(current_file_path,'r')
            train_word_freq_dict=readLines_for_making_dictionary(file.readlines(),current_class,train_word_freq_dict)
            
        except:
            file=open(current_file_path,'r',encoding='latin-1')
            train_word_freq_dict=readLines_for_making_dictionary(file.readlines(),current_class,train_word_freq_dict)
        
    return train_word_freq_dict

In [93]:
def get_current_line_probability(text,current_class,train_dict):
    
    words=text.strip().split()
    first_term=0
    for word in words:
        if (word in train_dict[current_class]):
            numerator=np.log(train_dict[current_class][word]+1)
            denominator=np.log(train_dict[current_class]['total_class_words']+2000)
            first_term+=(numerator-denominator)
    
    return first_term

In [94]:
def readLines_for_prediction(lines_list,train_dict):
    
    best_class_probability=-1000
    best_class=-1
    first_run=True
    
    for current_class in list_classes:
        
        current_class_probability=(np.log(train_dict[current_class]['total_train_files'])-np.log(train_dict['total_train_files']))
        
        for current_line in lines_list:
            line_to_clean=current_line
        
            if (current_line[0]=="\'" and current_line[len(current_line)-1]=="\'"):
                line_to_clean=current_line[1:len(current_line)-1]
            
            elif (current_line[0]=="\'"):
                line_to_clean=current_line[1:]
            
            elif (current_line[len(current_line)-1]=="\'"):
                line_to_clean=current_line[:len(current_line)-1]
            
            current_class_probability+=get_current_line_probability(line_to_clean,current_class,train_dict)
        
        if (first_run or current_class_probability > best_class_probability):
            best_class=current_class
            best_class_probability=current_class_probability
        
        first_run=False
            
    return best_class

In [95]:
def predict(train_dict):
    class_predicted=[]
    
    for current_class,current_file in test_files_list:
        
        current_file_path=os.path.join(os.path.join(PATH,current_class),current_file)
        
        try:
            file=open(current_file_path,'r')
            class_predicted.append(readLines_for_prediction(file.readlines(),train_dict))
            
        except:
            file=open(current_file_path,'r',encoding='latin-1')
            class_predicted.append(readLines_for_prediction(file.readlines(),train_dict))
            
    return class_predicted

In [68]:
train_dict=fit()

In [96]:
yPredicted=predict(train_dict)

In [98]:
from sklearn.metrics import classification_report,confusion_matrix

In [99]:
yTest=[]

for current_class,file in test_files_list:
        yTest.append(current_class)
    

In [101]:
print (classification_report(yTest,yPredicted))

                          precision    recall  f1-score   support

             alt.atheism       0.61      0.84      0.71       644
           comp.graphics       0.72      0.84      0.78       669
 comp.os.ms-windows.misc       0.87      0.67      0.76       637
comp.sys.ibm.pc.hardware       0.79      0.84      0.81       657
   comp.sys.mac.hardware       0.77      0.89      0.83       647
          comp.windows.x       0.85      0.83      0.84       641
            misc.forsale       0.88      0.69      0.77       639
               rec.autos       0.88      0.85      0.86       660
         rec.motorcycles       0.98      0.87      0.92       643
      rec.sport.baseball       0.94      0.94      0.94       681
        rec.sport.hockey       0.99      0.90      0.94       627
               sci.crypt       0.70      0.89      0.78       651
         sci.electronics       0.83      0.85      0.84       686
                 sci.med       0.88      0.83      0.86       631
         

In [103]:
print (confusion_matrix(yTest,yPredicted))

[[540   2   0   0   0   1   0   0   0   0   0   3   0   1   1   8   1   1
    7  79]
 [  3 564  11  16  13  20   2   2   1   1   0  20   2   5   4   1   0   0
    2   2]
 [  4  50 426  30  24  40   8   2   1   0   0  19   9   3   9   1   0   1
    4   6]
 [  2  11   9 552  45   9   5   2   0   0   0   5  12   0   1   1   0   0
    3   0]
 [  0   7   7  23 575   5   5   0   1   0   0   6  11   2   1   1   0   0
    2   1]
 [  4  37  20  10   6 535   1   2   0   0   0  12   5   1   1   2   0   0
    3   2]
 [  3  21   8  41  41   8 439  13   2   0   5  20  20   6   4   1   0   0
    5   2]
 [  3   4   4   2   5   1   7 561   2   1   0  11   9   3   8   1   9   0
   27   2]
 [  8   0   0   1   5   0   5  18 560   2   0   5   9   7   3   2   5   1
    9   3]
 [  6   3   0   1   1   2   3   5   2 640   0   4   4   3   1   0   0   0
    5   1]
 [  2   0   0   0   2   0   0   5   1  26 565   6   2   1   3   4   1   1
    6   2]
 [  9   5   1   1   4   4   0   0   1   1   0 577   3   2   5   1

In [24]:
x={'Ankit':1,'Ankita':2}
if ('Ankitaa' in x):
    print ('Yes')