# Project Text Classification

In [None]:
from google.colab import drive
import os
from nltk import tokenize
import re

In [7]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


##### nltk requirements

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
path = '/content/gdrive/My Drive/Text_classify'
os.listdir(path)

['20_newsgroups',
 'mini_newsgroups',
 'dictionary_docs (1).json',
 'Clean_dict_docs.json']

### Our Training documents are in 20_newsgroups and testing data is in mini_newsgroups

In [10]:
path_20_news = path + '/20_newsgroups'
categories = os.listdir(path_20_news)
categories

['sci.med',
 'sci.crypt',
 'rec.sport.hockey',
 'sci.electronics',
 'talk.politics.misc',
 'talk.politics.mideast',
 'sci.space',
 'soc.religion.christian',
 'talk.religion.misc',
 'talk.politics.guns',
 'comp.sys.ibm.pc.hardware',
 'rec.sport.baseball',
 'alt.atheism',
 'rec.autos',
 'rec.motorcycles',
 'misc.forsale',
 'comp.graphics',
 'comp.sys.mac.hardware',
 'comp.os.ms-windows.misc',
 'comp.windows.x']

In [None]:
files = os.listdir(path_20_news)
path_file = path_20_news + '/' + files[0]
fd = open(path_file + '/' + os.listdir(path_file)[0])
# print(fd.read())

In [12]:
dic = []
for cat in categories :
  files_path = path_20_news + '/' + cat
  files = os.listdir(files_path)
  for fileid in files:
    fd = open(files_path + '/' + fileid,'r',encoding="ISO-8859-1")
    file_words = tokenize.word_tokenize(fd.read())
    dic.append((file_words,cat))
dic[0]

(['Xref',
  ':',
  'cantaloupe.srv.cs.cmu.edu',
  'misc.kids:73984',
  'sci.med:58045',
  'Path',
  ':',
  'cantaloupe.srv.cs.cmu.edu',
  '!',
  'crabapple.srv.cs.cmu.edu',
  '!',
  'fs7.ece.cmu.edu',
  '!',
  'europa.eng.gtefsd.com',
  '!',
  'emory',
  '!',
  'swrinde',
  '!',
  'zaphod.mps.ohio-state.edu',
  '!',
  'ub',
  '!',
  'galileo.cc.rochester.edu',
  '!',
  'rochester',
  '!',
  'fulk',
  'From',
  ':',
  'fulk',
  '@',
  'cs.rochester.edu',
  '(',
  'Mark',
  'Fulk',
  ')',
  'Newsgroups',
  ':',
  'misc.kids',
  ',',
  'sci.med',
  'Subject',
  ':',
  'Re',
  ':',
  'Breech',
  'Baby',
  'Info',
  'Needed',
  'Message-ID',
  ':',
  '<',
  '1993Apr5.184453.8394',
  '@',
  'cs.rochester.edu',
  '>',
  'Date',
  ':',
  '5',
  'Apr',
  '93',
  '18:44:53',
  'GMT',
  'References',
  ':',
  '<',
  'TIGGER.93Apr2110251',
  '@',
  'satyr.Sylvan.COM',
  '>',
  '<',
  '1993Apr3.161757.19612',
  '@',
  'cs.rochester.edu',
  '>',
  '<',
  '1993Apr5.151818.27409',
  '@',
  'trentu.ca'

### Now we need to shuffle our data

In [13]:
import random
random.shuffle(dic)
print(dic[0:5])



In [None]:
import json
fopen = open("dictionary_docs.json",'w+')
json.dump(dic,fopen)

### Now performing the cleaning

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet,stopwords
from nltk import pos_tag
import string

In [15]:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation
stop_words.update(list(punctuation))
lemmatizer = WordNetLemmatizer()

In [None]:
def getsimplepos(tag):
  if tag.startswith('J'):
    return wordnet.ADJ
  elif tag.startswith('V'):
    return wordnet.VERB
  elif tag.startswith('R'):
    return wordnet.ADV
  else:
    return wordnet.NOUN

In [None]:
def cleaning(words):
  output_words = []
  for w in words :
    if w.lower() not in stop_words :
      #applying lemmatization
      pos = pos_tag([w])
      cleanword = lemmatizer.lemmatize(w , pos = getsimplepos(pos[0][1]))
      output_words.append(cleanword.lower())
  return output_words

In [19]:
dic = [(cleaning(doc),cat) for doc,cat in dic]
dic[0][1]

'comp.graphics'

In [None]:
with open('Clean_dict_docs.json','w+') as fopen:
    json.dump(dic,fopen)

### performing the split

In [20]:
dic[0][0][0],dic[0][1]

('xref', 'comp.graphics')

In [21]:
Y = [cat for doc,cat in dic]
Y[0:5]

['comp.graphics',
 'alt.atheism',
 'comp.sys.ibm.pc.hardware',
 'soc.religion.christian',
 'talk.politics.mideast']

In [22]:
X = [" ".join(doc) for doc,cat in dic]
X[0]



In [None]:
import sklearn.model_selection as ms

In [None]:
xtrain,xtest,ytrain,ytest = ms.train_test_split(X,Y,test_size = 0.10,random_state = 2)

### Importing Count Vectorizer from sklearn.feature_extraction.text to apply to the training data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
count_vec = CountVectorizer(max_features = 3000)

In [27]:
x_train_transformed = count_vec.fit_transform(xtrain)
x_train_transformed.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [28]:
x_test_transform = count_vec.transform(xtest)
x_test_transform.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

### Here comes the Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
clf = MultinomialNB()
clf.fit(x_train_transformed,ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
clf.score(x_test_transform,ytest)

0.8302546180728907

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [33]:
y_predict = clf.predict(x_test_transform)
print(classification_report(ytest,y_predict))
print(confusion_matrix(ytest,y_predict))

                          precision    recall  f1-score   support

             alt.atheism       0.83      0.83      0.83       103
           comp.graphics       0.77      0.75      0.76       103
 comp.os.ms-windows.misc       1.00      0.08      0.15       110
comp.sys.ibm.pc.hardware       0.61      0.83      0.70        92
   comp.sys.mac.hardware       0.75      0.90      0.82        99
          comp.windows.x       0.60      0.86      0.71        87
            misc.forsale       0.73      0.93      0.82       103
               rec.autos       0.88      0.90      0.89       110
         rec.motorcycles       0.92      0.97      0.94       100
      rec.sport.baseball       0.97      0.94      0.96       121
        rec.sport.hockey       0.94      0.96      0.95        79
               sci.crypt       0.96      0.94      0.95        95
         sci.electronics       0.87      0.90      0.89        94
                 sci.med       0.98      0.88      0.92        91
         

### implementing own Multinomial Naive Bayes Classifier

In [None]:
import numpy as np

In [35]:
X_train = np.array(x_train_transformed.todense())
Y_train = np.array(ytrain)
X_test = np.array(x_test_transform.todense())
Y_test = np.array(ytest)
X_train.shape,Y_train.shape,X_test.shape,Y_test.shape

((18024, 3000), (18024,), (2003, 3000), (2003,))

In [36]:
X_train,X_test

(array([[0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64), array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [None]:
def fit(x_train,y_train):
  result = {}
  class_vals = set(y_train)
  result['total_count'] = len(y_train)
  
  for curclass in class_vals :
    result[curclass] = {}
    num_features = x_train.shape[-1]
    curr_rows_req = (y_train == curclass)
    X_train_cur = x_train[curr_rows_req]
    Y_train_cur = y_train[curr_rows_req]
    result[curclass]['total_count'] = len(Y_train_cur)
    
    for j in range(num_features):
      result[curclass][j] = {}
      all_possible_vals = set(x_train[:,j])
      for cur_val in all_possible_vals :
        result[curclass][j][cur_val] = (cur_val == X_train_cur[:,j]).sum()
  return result

In [None]:
def probability(dictionary,x,curclass):
  # P(y = ai)
  class_probab = np.log(dictionary[curclass]['total_count']) - np.log(dictionary['total_count'])
  output = class_probab
  
  # P(Xj = xj / y = ai)
  num_features = len(dictionary[curclass].keys()) - 1
  for j in range(num_features):
    xj = x[j]
    #print('xj is :',xj)
    if xj in dictionary[curclass][j].keys():
      count_cur_class_with_val_xj = dictionary[curclass][j][xj] + 1
    else:
      count_cur_class_with_val_xj = 1
    count_cur_class = dictionary[curclass]['total_count'] + len(dictionary[curclass][j].keys())-1
    count_x_y_probab = np.log(count_cur_class_with_val_xj) - np.log(count_cur_class)
    output = output + count_x_y_probab
    
  #total output = Multiplication_for_all_j( P(Xj=xj / y=ai) * P(y=ai) )
  return output

In [None]:
def predictSinglePoint(dictionary,x):
  classes = dictionary.keys()
  best_p = -1000
  best_class = -1
  for curclass in classes:
    if curclass != 'total_count':
      pcurclass = probability(dictionary,x,curclass)
      if pcurclass > best_p:
        best_p = pcurclass
        best_class = curclass
  return best_class

In [None]:
def predict(dictionary,x_test):
  ypred = []
  for x in x_test:
    x_class = predictSinglePoint(dictionary,x)
    ypred.append(x_class)
  return ypred

In [None]:
dictionary = fit(X_train,Y_train)

In [42]:
len(dictionary)

21

In [43]:
X_test.shape,type(X_test)

((2003, 3000), numpy.ndarray)

In [44]:
Y_pred = predict(dictionary,X_test)
len(Y_pred)

2003

In [46]:
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

                          precision    recall  f1-score   support

                      -1       0.00      0.00      0.00         0
             alt.atheism       0.83      0.79      0.81       103
           comp.graphics       0.97      0.88      0.92       103
 comp.os.ms-windows.misc       0.96      0.97      0.97       110
comp.sys.ibm.pc.hardware       0.96      0.93      0.95        92
   comp.sys.mac.hardware       0.98      0.96      0.97        99
          comp.windows.x       0.98      0.97      0.97        87
            misc.forsale       0.80      1.00      0.89       103
               rec.autos       0.98      0.95      0.97       110
         rec.motorcycles       0.96      0.98      0.97       100
      rec.sport.baseball       0.99      0.96      0.97       121
        rec.sport.hockey       0.97      0.96      0.97        79
               sci.crypt       0.98      0.91      0.94        95
         sci.electronics       0.86      0.99      0.92        94
         

  'recall', 'true', average, warn_for)


### Now importing the testing documents : mini_news and testing on it

In [49]:
path_mini = path + '/mini_newsgroups'
categories = os.listdir(path_mini)
len(categories)

20

In [52]:
test_dic = []
for cat in categories :
  files_path = path_mini + '/' + cat
  files = os.listdir(files_path)
  for fileid in files:
    fd = open(files_path + '/' + fileid,'r',encoding="ISO-8859-1")
    file_words = tokenize.word_tokenize(fd.read())
    test_dic.append((file_words,cat))
test_dic[0]

(['Path',
  ':',
  'cantaloupe.srv.cs.cmu.edu',
  '!',
  'crabapple.srv.cs.cmu.edu',
  '!',
  'fs7.ece.cmu.edu',
  '!',
  'europa.eng.gtefsd.com',
  '!',
  'news.ans.net',
  '!',
  'cmcl2',
  '!',
  'arizona',
  '!',
  'ho',
  'From',
  ':',
  'ho',
  '@',
  'cs.arizona.edu',
  '(',
  'Hilarie',
  'Orman',
  ')',
  'Newsgroups',
  ':',
  'sci.crypt',
  'Subject',
  ':',
  'Re',
  ':',
  'Licensing',
  'of',
  'public',
  'key',
  'implementations',
  'Message-ID',
  ':',
  '<',
  '36179',
  '@',
  'optima.cs.arizona.edu',
  '>',
  'Date',
  ':',
  '6',
  'Apr',
  '93',
  '00:12:20',
  'GMT',
  'References',
  ':',
  '<',
  '930403152101.890833',
  '@',
  'DOCKMASTER.NCSC.MIL',
  '>',
  'Sender',
  ':',
  'news',
  '@',
  'cs.arizona.edu',
  'Organization',
  ':',
  'U',
  'of',
  'Arizona',
  ',',
  'CS',
  'Dept',
  ',',
  'Tucson',
  'Lines',
  ':',
  '6',
  'With',
  'regard',
  'to',
  'your',
  'speculations',
  'on',
  'NSA',
  'involvement',
  'in',
  'the',
  'creation',
  'of'

In [53]:
test_dic = [(cleaning(doc),cat) for doc,cat in test_dic]
test_dic[0][1]

'sci.crypt'

In [None]:
Y_test_mini = [cat for doc,cat in test_dic]
X_test_mini = [" ".join(doc) for doc,cat in test_dic]

In [None]:
X_test_mini_transformed = count_vec.transform(X_test_mini)
x_test_to_fed = np.array(X_test_mini_transformed.todense())

In [60]:
Y_pred_mini = predict(dictionary,x_test_to_fed)
len(Y_pred_mini)

2000

In [61]:
print(classification_report(Y_test_mini,Y_pred_mini))
print(confusion_matrix(Y_test_mini,Y_pred_mini))

                          precision    recall  f1-score   support

                      -1       0.00      0.00      0.00         0
             alt.atheism       0.85      0.87      0.86       100
           comp.graphics       0.93      0.93      0.93       100
 comp.os.ms-windows.misc       0.99      0.99      0.99       100
comp.sys.ibm.pc.hardware       1.00      0.97      0.98       100
   comp.sys.mac.hardware       1.00      0.96      0.98       100
          comp.windows.x       0.97      0.98      0.98       100
            misc.forsale       0.85      0.99      0.91       100
               rec.autos       0.99      0.96      0.97       100
         rec.motorcycles       0.98      1.00      0.99       100
      rec.sport.baseball       1.00      0.98      0.99       100
        rec.sport.hockey       0.99      1.00      1.00       100
               sci.crypt       1.00      0.90      0.95       100
         sci.electronics       0.92      0.98      0.95       100
         

  'recall', 'true', average, warn_for)


In [62]:
clf.score(x_test_to_fed,Y_pred_mini)

0.8525