In [1]:
import os
import collections
import csv
import re #Regular Expressions
import operator # Majorly for sorting tf-idf dictionary on basis of values
import nltk
from nltk.corpus import stopwords # For removing stop words, use ntlk.download() to completely install this package
import pandas as pd
import numpy as np
from sklearn import model_selection as cv
from sklearn import naive_bayes
from sklearn.metrics import classification_report, confusion_matrix

# Parameters to customize according to use case

In [2]:
base_dir = "/home/aakash/Drive/Dropbox/ML/data/20_newsgroups/" #Set  this according to your machine
class_names = os.listdir(base_dir)

In [14]:
#Use small numbers for first time, articles =  100, classes = 2 , features = 100 
max_articles_of_each_class = 1000
no_of_classes = 2
no_of_features = 5000

# Caclulating tf-idf metric for articles of selected classes

In [15]:
def preprocess_file_text(file):
    text = file.read().lower() # Converting all to lowercase as lowercase and uppercase words should be considered same word
    text = re.sub('[^A-Za-z ]+', '', text) # Removing non-aplha characters
    text = re.sub('\s+', ' ', text)  # Condense all whitespace
    return text

In [16]:
words = set()
tf = {}
idf = {}
selected_classes = []
articles_read = {} # To keep track of which all articles were read while extracting features
for i in range(0,len(class_names)):
    if(i >= no_of_classes):
        break
    current_class = class_names[i]
    
    # 2 updates for tracking
    selected_classes.append(current_class)
    articles_read[current_class] = []
    
    class_dir = base_dir + current_class
    all_articles = os.listdir(class_dir)
    for j in range(0,len(all_articles)):
        if(j >= max_articles_of_each_class):
            break
        current_file = class_dir + "/" + all_articles[j]
        articles_read[current_class].append(all_articles[j]) 
        file = open(current_file, encoding = "ISO-8859-1")
        text = preprocess_file_text(file)
        file.close() # Always close a file after using it to free up system resources
        file_words = text.split()
        
        # Updating term-frequency dictionary
        word_count = collections.Counter(file_words)
        for word,freq in word_count.items():
            if(word in tf):
                tf[word] = tf[word] + freq
            else:
                tf[word] = freq
        
        #Updating (inverse document frequency) dictionary
        word_set = set(file_words)
        for word in word_set:
            if(word in idf):
                idf[word] = idf[word] + 1
            else:
                idf[word] = 1

In [17]:
tf_by_idf = {}
for key in tf.keys():
    tf_by_idf[key] = tf[key]/idf[key]

# Removing Stop Words from data dictionary

In [18]:
for stop_word in stopwords.words("english"):
    if(stop_word in tf_by_idf.keys()):
        tf_by_idf.pop(stop_word)

# Selecting top x words with max tf-idf value as Features

In [19]:
tf_by_idf = sorted(tf_by_idf.items(), key=operator.itemgetter(1))
tf_by_idf.reverse()
#tf_by_idf

In [20]:
features = set()
for i in range(0,no_of_features):
    features.add(tf_by_idf[i][0])

# Creating Dataframe by reading the articles again on the basis of selected features

In [21]:
# It takes time,so be patient (Reduce feature count to reduce this time but score will be affected accordingly)

columns = list(features)
total_articles_to_process = 0
for class_name in articles_read.keys():
    total_articles_to_process += len(articles_read[class_name])
data = []

articles_processed = 0
for current_class in selected_classes:
    class_dir = base_dir + current_class
    for article in articles_read[current_class]:
        articles_processed += 1
        if(articles_processed%500 == 0):
            print(articles_processed,"articles are processed out of",total_articles_to_process,"articles")
        current_file = class_dir + "/" + article
        file = open(current_file, encoding = "ISO-8859-1")
        text = preprocess_file_text(file)
        file.close() # Always close a file after using it to free up system resources
        file_words = text.split()
        
        word_count = collections.Counter(file_words)
        training_data = [0]*(len(columns) + 1) # +1 because last column is of output class
        for i in range(0,len(columns)): 
            feature = columns[i]
            if(feature in word_count.keys()):
                training_data[i] = word_count[feature]
        training_data[-1] = current_class
        data.append(training_data)

columns.append("class")
df = pd.DataFrame(data,columns=columns)

500 articles are processed out of 2000 articles
1000 articles are processed out of 2000 articles
1500 articles are processed out of 2000 articles
2000 articles are processed out of 2000 articles


In [258]:
#df.to_csv("data.csv",encoding="UTF-8")

In [22]:
X = df.values[:,:-1]
Y = df.values[:,-1]

In [23]:
X_train,X_test,Y_train,Y_test = cv.train_test_split(X,Y,test_size=0.25,random_state=0)

# Using Inbuilt Multinomial Naive Bayes

In [24]:
gnb = naive_bayes.GaussianNB()
gnb.fit(X_train,Y_train)
Y_pred = gnb.predict(X_test)
print("Size of Y_test = ",len(Y_test)," and wrong results = ",(Y_pred != Y_test).sum())
print("Score is ",gnb.score(X_test,Y_test))
#print(gnb.predict_proba(X_test[Y_test != Y_pred]))

Size of Y_test =  500  and wrong results =  15
Score is  0.97


# Using my implementation (Incomplete)

In [50]:
def calculatePriorProbabilities(Y):
    classes = set(Y)
    result = {}
    for i in classes:
        result[i] = (len(Y[Y==i])/len(Y))
    return result

In [267]:
def naiveProbability(priorProbability,current_class,X_train,Y_train,X_test_sample):
    result = priorProbability
    #Modifying X_train for current class only
    class_samples = (Y_train == current_class)
    Y_train = Y_train[class_samples]
    X_train = X_train[class_samples]
    dr = len(Y_train)
    for i in range(0,X_train.shape[-1]):
        nr = len(X_train[X_train[:,i]==X_test_sample[i]])
        result = result * (nr/dr)
    return result

In [268]:
#For Labelled(non-continous) Features only
def naiveBayesPredict(X_train,Y_train,X_test,priorProbabilities={}):
    classes = set(Y_train)
    test_samples = X_test.shape[0]
    y_pred = np.zeros(test_samples)
    
    #Assuming this condition is sufficient
    if (len(priorProbabilities) == 0) :
        priorProbabilities = calculatePriorProbabilities(Y_train)
    #print(priorProbabilities)
    for i in range(0,test_samples):
        probabilities = {}
        for current_class in classes:
            probabilities[current_class] = naiveProbability(priorProbabilities[current_class],current_class,X_train,Y_train,X_test[i,:])
        #print("For sample",X_test[i,:]," probabilities are = ",probabilities)
        y_pred[i] = max(probabilities,key=probabilities.get)
    return y_pred