In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import glob
import io
import os
import re
import math
import numpy as np
import string
%matplotlib inline
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.collections import defaultdict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
# preprocessing the text data....
def preprocess(data):
  
  # converting to lowercase...
  data = data.lower()

  # remove tags...
  remove_tag= re.compile(r'<[^>]+>')
  data = remove_tag.sub(' ',data)

  #punctuations and digits removal..
  punc_dig_list = string.punctuation + string.digits
  data = ' '.join(data.translate(str.maketrans(punc_dig_list , ' '*len(punc_dig_list ))).split())

  # removing URL's...
  data = re.sub(r'http\S+', ' ',data)

  # removing non alphanumeric characters
  token = word_tokenize(data)
  text = ""
  text = text + ' '.join([word for word in token if word.isalnum()])
  data = text
  
  # lemmatization..
  lemmatizer = WordNetLemmatizer()
  token = word_tokenize(data)
  text = ""
  text = text + ' '.join([lemmatizer.lemmatize(word) for word in token ])
  data = text

  # removing stopwords and single,double letter words ..
  text = ""
  stopword = nltk.corpus.stopwords.words('english')
  token = word_tokenize(data)
  text = text + ' '.join([word for word in token if word not in stopword and len(word)>2])
  data = text

  # removing unwanted spaces..
  data = re.sub(' +', ' ', data)

  data = word_tokenize(data)

  return data

In [4]:
# splitting the document dataset into train,test..
def split(dataset,split_size):
  permuted_dataset = dataset.sample(frac=1)
  train_size = int(len(permuted_dataset) * split_size)
  train_set = permuted_dataset[:train_size]
  test_set = permuted_dataset[train_size:]
  return train_set['Documents'].tolist(),test_set['Documents'].tolist()

In [5]:
# function to create a dataframe of docs from the path pased as input...
def get_dataframe(path):
  docs=[]
  for doc in os.listdir(path):
    docs.append(doc)
  df = pd.DataFrame(docs,columns=['Documents'])
  return df

In [6]:
# function that will calculate Term Frequency of each word in each set of documents in the training set, given as input..
# returns the dictionary of unique words with their TF found..
def findTF(path,train_set):
  vocab={}  
  for docname in glob.glob(os.path.join(path,'*')):
      if(os.path.basename(docname) in train_set):
        with open(docname,'r',encoding="utf8",errors='ignore') as f:
          doc_text = f.read()
          processed_list = preprocess(doc_text)
          for word in processed_list:
            if(word not in vocab):
                vocab[word]=1
            else:
                vocab[word] = vocab[word] + 1
  return vocab

In [7]:
# function to calculate the TF-ICF for each word in the dictionary passed as input....
def calculate_TFICF(dictionary,vocab2,vocab3,vocab4,vocab5):
  tficf_dict={}
  for word in dictionary:
    TF = dictionary[word]
    CF = 1
    if(word in vocab2):
      CF = CF + 1 
    if(word in vocab3):
      CF = CF + 1 
    if(word in vocab4):
      CF = CF + 1 
    if(word in vocab5):
      CF = CF + 1 
    ICF = math.log(5/CF)
    TF_ICF = TF * ICF
    tficf_dict[word] = TF_ICF

  return tficf_dict

In [8]:
# function to create the effective vocabulaty containing top k features of each class...
def effective_Vocab(dict1,dict2,dict3,dict4,dict5,k):
  vocab=[]
  for word in (list(dict1.keys())[:k]):
    if(word not in vocab):
      vocab.append(word)
  for word in (list(dict2.keys())[:k]):
    if(word not in vocab):
      vocab.append(word)
  for word in (list(dict3.keys())[:k]):
    if(word not in vocab):
      vocab.append(word)
  for word in (list(dict4.keys())[:k]):
    if(word not in vocab):
      vocab.append(word)
  for word in (list(dict5.keys())[:k]):
    if(word not in vocab):
      vocab.append(word)
  return vocab

In [9]:
# function to find the word occurance of the top k features in each document in the path and train_set provided as input...
def get_wordOccurance(path,train_set,effective_features):
  word_count=[]
  for docname in glob.glob(os.path.join(path,'*')):
      if(os.path.basename(docname) in train_set):
        with open(docname,'r',encoding="utf8",errors='ignore') as f:
          doc_text = f.read()
          processed_list = preprocess(doc_text)
          temp=[]
          for word in effective_features:
              if(word in processed_list):
                temp.append(1)
              else:
                temp.append(0)
          word_count.append(temp)
  return(word_count)

In [10]:
# function to compute the accuracy..
def find_accuracy(y_true,y_pred):
  count = 0
  for i in range(0,len(y_true)):
      if(y_true[i] == y_pred[i]):
        count = count + 1
  return (count/len(y_true))

In [11]:
# function implementing Naive Bayes Algorithm..
# Returns the predictions..
def predict_NB(train_data,test_data,effective_features,train_split_ratio):

  train_size = int(train_split_ratio * 1000)

  # prior probabilities of each class in the training dataset
  prior_probab = [(train_size/len(train_data))] * 5;   # '5' is the number of classes in the dataset...

  # dividing the dataset class wise...
  class_wise=[]
  for i in range(0,5):   # '5' is the number of classes in the dataset...
    class_wise.append(train_data[(i*train_size) : ((i+1)*train_size)])

  # computing the conditional probabilities for each class..
  conditional_probab = []
  for cls in range(0,5):   # '5' is the number of classes in the dataset...
    class_data = class_wise[cls]
    yes = [0] * len(effective_features) 
    no = [0] * len(effective_features)
    for i in range(0,len(class_data)):
      for j in range(0,len(effective_features)):
        if(class_data[i][j] == 1):
          yes[j] = yes[j] + 1 
        else:
          no[j] = no[j] + 1

    for k in range(0,len(yes)):
      yes[k] = yes[k] / train_size;

    for k in range(0,len(no)):
      no[k] = no[k] / train_size

    conditional_probab.append([yes,no])

  # generating the predictions..
  predicted_label=[]
  for i in range(0,len(test_data)):
    test_vec = test_data[i]
    class_wise_probab=[]
    for cls in range(0,5):
      probab_val = prior_probab[cls]
      for j in range(0,len(test_vec)):
        if(test_vec[j] == 1):
          probab_val = probab_val * conditional_probab[cls][0][j]
        else:
          probab_val = probab_val * conditional_probab[cls][1][j]
      class_wise_probab.append(probab_val)
    predicted_label.append(class_wise_probab.index(max(class_wise_probab)))

  return predicted_label

In [12]:
def naive_bayes(k,train_split_ratio):

  test_split_ratio  = (1 - train_split_ratio)
  test_split_ratio = round(test_split_ratio,2)

  path1 = "/content/drive/MyDrive/IR Assignment 2/Documents/comp.graphics"
  path2 = "/content/drive/MyDrive/IR Assignment 2/Documents/rec.sport.hockey"
  path3 = "/content/drive/MyDrive/IR Assignment 2/Documents/sci.med"
  path4 = "/content/drive/MyDrive/IR Assignment 2/Documents/sci.space"
  path5 = "/content/drive/MyDrive/IR Assignment 2/Documents/talk.politics.misc"

  dataframe1 = get_dataframe(path1)
  dataframe2 = get_dataframe(path2)
  dataframe3 = get_dataframe(path3)
  dataframe4 = get_dataframe(path4)
  dataframe5 = get_dataframe(path5)

  #print(dataframe1.shape,dataframe2.shape,dataframe3.shape,dataframe4.shape,dataframe5.shape)

  # splitting the dataset into train and test data..
  train_1,test_1 = split(dataframe1,train_split_ratio)
  train_2,test_2 = split(dataframe2,train_split_ratio)
  train_3,test_3 = split(dataframe3,train_split_ratio)
  train_4,test_4 = split(dataframe4,train_split_ratio)
  train_5,test_5 = split(dataframe5,train_split_ratio)

  # getting dictionary of unique words with their TF values for each class in the train data...
  graphics_vocab = findTF(path1,train_1)
  sport_vocab = findTF(path2,train_2)
  scimed_vocab = findTF(path3,train_3)
  scispace_vocab = findTF(path4,train_4)
  politics_vocab = findTF(path5,train_5)


  # TF-ICF values for each class..
  graphics_dict = calculate_TFICF(graphics_vocab,sport_vocab,scimed_vocab,scispace_vocab,politics_vocab)
  # after sorting
  graphics_dict = dict(sorted(graphics_dict.items(), key=lambda item: item[1],reverse=True))

  sport_dict = calculate_TFICF(sport_vocab,graphics_vocab,scimed_vocab,scispace_vocab,politics_vocab)
  # after sorting
  sport_dict = dict(sorted(sport_dict.items(), key=lambda item: item[1],reverse=True))

  scimed_dict = calculate_TFICF(scimed_vocab,graphics_vocab,sport_vocab,scispace_vocab,politics_vocab)
  # after sorting
  scimed_dict = dict(sorted(scimed_dict.items(), key=lambda item: item[1],reverse=True))

  scispace_dict = calculate_TFICF(scispace_vocab,graphics_vocab,sport_vocab,scimed_vocab,politics_vocab)
  # after sorting
  scispace_dict = dict(sorted(scispace_dict.items(), key=lambda item: item[1],reverse=True))

  politics_dict = calculate_TFICF(politics_vocab,graphics_vocab,sport_vocab,scimed_vocab,scispace_vocab)
  # after sorting
  politics_dict = dict(sorted(politics_dict.items(), key=lambda item: item[1],reverse=True))


  # getting the union of top k features of each class.. 
  effective_features = effective_Vocab(graphics_dict,sport_dict,scimed_dict,scispace_dict,politics_dict,k) 


  # creating list of word occurance of top k features for each document of each class for the training set..
  word_occur_train = []
  word_occur_train.extend(get_wordOccurance(path1,train_1,effective_features))
  word_occur_train.extend(get_wordOccurance(path2,train_2,effective_features))
  word_occur_train.extend(get_wordOccurance(path3,train_3,effective_features))
  word_occur_train.extend(get_wordOccurance(path4,train_4,effective_features))
  word_occur_train.extend(get_wordOccurance(path5,train_5,effective_features))

  # creating list of word occurance of top k features for each document of each class for the testing set..
  word_occur_test = []
  word_occur_test.extend(get_wordOccurance(path1,test_1,effective_features))
  word_occur_test.extend(get_wordOccurance(path2,test_2,effective_features))
  word_occur_test.extend(get_wordOccurance(path3,test_3,effective_features))
  word_occur_test.extend(get_wordOccurance(path4,test_4,effective_features))
  word_occur_test.extend(get_wordOccurance(path5,test_5,effective_features))

  # creating class mapping dictionary..
  class_mapping = {0:'comp.graphics',1:'rec.sport.hockey',2:'sci.med',3:'sci.space',4:'talk.politics.misc'}


  # creating the DataFrame class labels for the training set...
  class_labels_train = [0] * int(train_split_ratio*1000)
  class_labels_train.extend([1] * int(train_split_ratio*1000))
  class_labels_train.extend([2] * int(train_split_ratio*1000))
  class_labels_train.extend([3] * int(train_split_ratio*1000))
  class_labels_train.extend([4] * int(train_split_ratio*1000))
  train_y = pd.DataFrame(class_labels_train,columns=['Label'])

  # creating the DataFrame class labels for the testing set...
  class_labels_test = [0] * int(test_split_ratio*1000)
  class_labels_test.extend([1] * int(test_split_ratio*1000))
  class_labels_test.extend([2] * int(test_split_ratio*1000))
  class_labels_test.extend([3] * int(test_split_ratio*1000))
  class_labels_test.extend([4] * int(test_split_ratio*1000))
  test_y = pd.DataFrame(class_labels_test,columns=['Label'])

  # creating the doclist for the training set...
  doclist_train=[]
  doclist_train.extend(train_1)
  doclist_train.extend(train_2)
  doclist_train.extend(train_3)
  doclist_train.extend(train_4)
  doclist_train.extend(train_5)

  # creating the doclist for the test set...
  doclist_test=[]
  doclist_test.extend(test_1)
  doclist_test.extend(test_2)
  doclist_test.extend(test_3)
  doclist_test.extend(test_4)
  doclist_test.extend(test_5)

  #print(len(doclist_test),len(doclist_train))

  # creating the training dataframe for all the classes using the top k vocabulary obtianed..
  train_data = pd.DataFrame(doclist_train,columns=['Documents'])
  for word in effective_features:
    train_data[word] = 0

  # outer loop will be for the total number of documents in the first class..
  # inner loop will iterate for the top k features..
  for i in range(0,len(doclist_train)):
    for j in range(0,len(effective_features)):
      train_data[effective_features[j]][i] = word_occur_train[i][j]

  # creating the testing dataframe for all the classes using the top k vocabulary obtianed..
  test_data = pd.DataFrame(doclist_test,columns=['Documents'])
  for word in effective_features:
     test_data[word] = 0

  # outer loop will be for the total number of documents in the first class..
  # inner loop will iterate for the top k features..
  for i in range(0,len(doclist_test)):
    for j in range(0,len(effective_features)):
      test_data[effective_features[j]][i] = word_occur_test[i][j]

  train_x = train_data.drop(['Documents'],axis=1)
  test_x = test_data.drop(['Documents'],axis=1)

  #print(test_x.shape,train_x.shape,test_y.shape,train_y.shape)

  # Caliing Naive Bayes Function to get predictions...
  predicted = predict_NB(word_occur_train,word_occur_test,effective_features,train_split_ratio)

  pred = np.asarray(predicted)
  pred = pred.reshape(-1,1)
  # Computing Accuracy...
  print("Accuracy : ",find_accuracy(list(test_y['Label']),predicted)*100)
  print()
  print("Confusion Matrix : ")
  print()
  # Confusion Matrix...
  print(confusion_matrix(test_y,pred))
  print()
  print("Classification Report : ")
  print()
  # Classification Report...
  print(classification_report(test_y,pred))

In [21]:
naive_bayes(35,0.8)

Accuracy :  93.0

Confusion Matrix : 

[[200   0   0   0   0]
 [  5 195   0   0   0]
 [  3   0 194   3   0]
 [  6   0  38 152   4]
 [ 11   0   0   0 189]]

Classification Report : 

              precision    recall  f1-score   support

           0       0.89      1.00      0.94       200
           1       1.00      0.97      0.99       200
           2       0.84      0.97      0.90       200
           3       0.98      0.76      0.86       200
           4       0.98      0.94      0.96       200

    accuracy                           0.93      1000
   macro avg       0.94      0.93      0.93      1000
weighted avg       0.94      0.93      0.93      1000



In [25]:
naive_bayes(35,0.7)

Accuracy :  93.46666666666667

Confusion Matrix : 

[[290   0   9   0   1]
 [  5 295   0   0   0]
 [  7   0 288   2   3]
 [  4   0  58 234   4]
 [  4   0   0   1 295]]

Classification Report : 

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       300
           1       1.00      0.98      0.99       300
           2       0.81      0.96      0.88       300
           3       0.99      0.78      0.87       300
           4       0.97      0.98      0.98       300

    accuracy                           0.93      1500
   macro avg       0.94      0.93      0.93      1500
weighted avg       0.94      0.93      0.93      1500



In [24]:
naive_bayes(35,0.5)

Accuracy :  96.56

Confusion Matrix : 

[[487   0   3  10   0]
 [ 13 487   0   0   0]
 [ 17   0 483   0   0]
 [ 17   0   0 474   9]
 [ 17   0   0   0 483]]

Classification Report : 

              precision    recall  f1-score   support

           0       0.88      0.97      0.93       500
           1       1.00      0.97      0.99       500
           2       0.99      0.97      0.98       500
           3       0.98      0.95      0.96       500
           4       0.98      0.97      0.97       500

    accuracy                           0.97      2500
   macro avg       0.97      0.97      0.97      2500
weighted avg       0.97      0.97      0.97      2500

