In [None]:
# Mount Google Drive (Colab only) so the '/content/drive/...' paths used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import numpy as np
import math
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Load the annotated article dataset from the mounted Drive (absolute, Colab-specific path).
data=pd.read_excel('/content/drive/MyDrive/Annotated_Article_Dataset.xlsx')
# Drop every row that has a missing value in any column; note that `data` is overwritten.
data=data.dropna()

In [None]:
#Preprocessing the Dataset
def Preprocessing(text):
  """Normalise one article into a space-separated string of content lemmas.

  The text is lower-cased, tokenized with ``nltk.word_tokenize`` and each
  token is lemmatized with ``WordNetLemmatizer``. English stopwords,
  punctuation characters and a few quote/ellipsis tokens are dropped.

  Args:
    text: raw article text (str).

  Returns:
    str: the surviving lemmas joined by single spaces.
  """
  lemmatizer = WordNetLemmatizer()

  # A set gives constant-time membership tests instead of scanning a list per token.
  stop = set(stopwords.words('english')) | set(string.punctuation) | {"''", "``", ".."}

  kept = []
  for token in nltk.word_tokenize(text.lower()):
    lemma = lemmatizer.lemmatize(token)
    # Test the surface token as well as its lemma: lemmatization can rewrite a
    # stopword into a string that is no longer in the stopword list (e.g. "was"
    # is typically reduced to "wa"), which would otherwise slip through.
    if token in stop or lemma in stop:
      continue
    kept.append(lemma)

  return " ".join(kept)


In [None]:
def preprocess(raw_data):
  """Build the cleaned (article, label) frame used by the rest of the notebook.

  Args:
    raw_data: DataFrame with an ``Article`` column (raw text) and an
      ``Annotation`` column (class label).

  Returns:
    pandas.DataFrame with columns ``Article`` (output of ``Preprocessing``)
    and ``label`` (annotation with surrounding whitespace removed, upper-cased).
  """
  # Iterate over the Series themselves rather than over their `.iloc` indexers.
  articles = [Preprocessing(article) for article in raw_data['Article']]
  # Strip before upper-casing so a label with stray whitespace (e.g. "BJP ")
  # still maps onto the expected class name.
  labels = [str(label).strip().upper() for label in raw_data['Annotation']]
  return pd.DataFrame({'Article': articles, 'label': labels})


In [None]:
# Cleaned frame with columns 'Article' (preprocessed text) and 'label' (upper-cased annotation).
df=preprocess(data)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,classification_report,accuracy_score
# Hold out 25% of the articles for testing. The split is shuffled with a fixed seed (42)
# and stratified on the label column.
X_train, X_test, Y_train, Y_test = train_test_split(df['Article'],df['label'], test_size=0.25,random_state=42,shuffle=True,stratify=df['label'])

**Feature Selection**

In [None]:
# Class name <-> integer id used to index every per-class count array below.
class_label={'BJP':0,'CONGRESS':1,'AAP':2,'NONE':3}
reverse_class_label={0:'BJP',1:'CONGRESS',2:'AAP',3:'NONE'}
def table_creation(X_train,Y_train):
  """Collect per-class document and term statistics from the training set.

  Args:
    X_train: iterable of preprocessed articles (whitespace-separated tokens).
    Y_train: iterable of class names; each must be a key of ``class_label``.

  Returns:
    A tuple ``(table, words_count, docs_count, term_count)`` where
      table:       dict term -> array; entry c is the number of class-c
                   articles that contain the term (document frequency).
      words_count: array; entry c is the total number of tokens in class c.
      docs_count:  array; entry c is the number of articles in class c.
      term_count:  dict term -> array; entry c is the number of occurrences
                   of the term in class c (term frequency).

  Raises:
    KeyError: if a label is not present in ``class_label``.
  """
  n_classes = len(class_label)
  table = {}
  term_count = {}
  docs_count = np.zeros(n_classes, dtype=int)
  words_count = np.zeros(n_classes, dtype=int)

  for article, label in zip(X_train, Y_train):
    class_id = class_label[label]
    tokens = article.split()

    docs_count[class_id] += 1
    # Count tokens, not characters: len(article) would be the string length.
    words_count[class_id] += len(tokens)

    # Counter yields each distinct term once together with its frequency, so a
    # single pass updates both the term-frequency and document-frequency tables.
    # Its keys follow first-occurrence order, which keeps the key order of
    # `table` independent of set/hash ordering.
    for term, occurrences in Counter(tokens).items():
      if term not in term_count:
        term_count[term] = np.zeros(n_classes, dtype=int)
        table[term] = np.zeros(n_classes, dtype=int)
      term_count[term][class_id] += occurrences
      table[term][class_id] += 1

  return table, words_count, docs_count, term_count
  
# Per-class statistics of the training split; these feed feature selection and calc_prob below.
table, words_count, docs_count, term_count=table_creation(X_train,Y_train)

In [None]:
#Mutual information between term presence and class membership, for each term and each class: BJP, CONGRESS, AAP, NONE (not biased)
def calculating_Mutual_Info(table,words_count,docs_count):
  """Score every term against every class with mutual information.

  Args:
    table: dict term -> per-class array of document frequencies.
    words_count: accepted for signature compatibility; not used here.
    docs_count: per-class number of articles.

  Returns:
    A list of four lists (one per class id 0..3). List ``c`` holds one score
    per term, in the iteration order of ``table``.
  """
  N = 0
  for n_docs in docs_count:
    N += n_docs
  print("total number of articles are",N)

  def cell(n, marginal_a, marginal_b):
    # One summand of the mutual-information sum; an empty cell contributes 0.
    if n == 0:
      return 0
    return (n/N) * ((np.log(N)+np.log(n)) - (np.log(marginal_a) + np.log(marginal_b)))

  #One score list per class
  mutual_Info_table=[[],[],[],[]]
  for doc_freq in table.values():
    docs_with_term = np.sum(doc_freq)
    for class_ in range(0,4):
      #Docs in the class that contain the term
      N_11 = doc_freq[class_]
      #Docs outside the class that contain the term
      N_10 = docs_with_term - N_11
      #Docs in the class that do not contain the term
      N_01 = docs_count[class_] - N_11
      #Docs outside the class that do not contain the term
      N_00 = N - (N_01+N_10+N_11)

      X = cell(N_11, N_11+N_01, N_11+N_10)
      Y = cell(N_01, N_01+N_00, N_01+N_11)
      Z = cell(N_10, N_10+N_11, N_10+N_00)
      W = cell(N_00, N_00+N_01, N_00+N_10)
      mutual_Info_table[class_].append(X+Y+Z+W)
  return mutual_Info_table
# Mutual-information scores of every training-set term, one list per class id.
mutual_Info_table=calculating_Mutual_Info(table,words_count, docs_count)


total number of articles are 674


In [None]:
#Build a word <-> integer id lookup; ids are positions in the given iterable
def creating_map_of_words(list_of_words):
    """Assign an integer id to each word and build the inverse lookup.

    Args:
        list_of_words: iterable of words.

    Returns:
        (forward_map, reverse_map): ``forward_map`` maps word -> position in
        the iterable (a repeated word keeps its last position);
        ``reverse_map`` maps those ids back to the words.
    """
    forward_map = {word: position for position, word in enumerate(list_of_words)}
    reverse_map = dict((position, word) for word, position in forward_map.items())
    return forward_map,reverse_map
# Ids follow the iteration order of `table`, the same order in which the mutual-information scores were appended.
forward_map,reverse_map=creating_map_of_words(table.keys())


In [None]:
#Select the k best-scoring terms of each class and return the merged vocabulary
def feature_selection(mutual_Info_table,k,forward_map,reverse_map):
    """Pick the top-k terms per class by mutual-information score.

    Args:
        mutual_Info_table: one list of scores per class; position i in a
            list is the score of the word whose id is i.
        k: number of terms to keep for each class.
        forward_map: word -> id mapping (accepted for signature
            compatibility; not used here).
        reverse_map: id -> word mapping.

    Returns:
        (top_k_words, new_vocab): ``top_k_words`` is a list with one array of
        word ids per class, highest score first; ``new_vocab`` is the set of
        all words selected for at least one class.
    """
    # Iterate over whatever classes are present instead of assuming exactly four.
    top_k_words = []
    for class_scores in mutual_Info_table:
        ranked_ids = np.argsort(np.array(class_scores))[::-1]
        top_k_words.append(ranked_ids[:k].copy())

    new_vocab = set()
    for word_ids in top_k_words:
        new_vocab.update(reverse_map[word_id] for word_id in word_ids)
    return top_k_words,new_vocab

# Keep the 200 highest-scoring terms of each class; new_vocab is the union over the classes.
top_k_words,new_vocab=feature_selection(mutual_Info_table,200,forward_map,reverse_map)


In [None]:
#Estimate add-one smoothed term probabilities per class, and the class priors
def calc_prob(new_vocab,term_count,class_word_count,docs_count):
    """Compute per-class term probabilities and class prior probabilities.

    Args:
        new_vocab: collection of selected words.
        term_count: dict word -> per-class occurrence counts.
        class_word_count: per-class total used in the denominator.
        docs_count: per-class number of articles.

    Returns:
        (cprobability, tprobability): ``cprobability`` is a list with each
        class's share of all articles; ``tprobability`` maps word -> list of
        four values ``(count + 1) / (class_total + len(new_vocab))``.
    """
    beta = len(new_vocab)
    tprobability = {
        word: [(term_count[word][cls]+1)/(class_word_count[cls]+beta) for cls in range(0,4)]
        for word in new_vocab
    }
    N = np.sum(docs_count)
    cprobability = [doc_count/N for doc_count in docs_count]
    return cprobability,tprobability
# Class priors and smoothed term probabilities over the selected vocabulary.
cprobability,tprobability=calc_prob(new_vocab,term_count,words_count,docs_count)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
def count_vector(X_data,vocab,term_count):
  """Turn each document into a vector of raw term counts over ``vocab``.

  Args:
    X_data: iterable of documents; each is a string of space-separated tokens.
    vocab: collection of words defining the feature columns. A set/frozenset
      is sorted so the column order is deterministic; any other iterable
      keeps its own order.
    term_count: accepted for signature compatibility; not used here.

  Returns:
    list of lists: one row per document, one count per vocabulary word.
  """
  # Set iteration order for strings can differ between interpreter runs, which
  # would silently permute the feature columns; sorting pins the order.
  if isinstance(vocab, (set, frozenset)):
    ordered_vocab = sorted(vocab)
  else:
    ordered_vocab = list(vocab)

  data_count = []
  for document in X_data:
    # Count every token once instead of rescanning the document per vocabulary word.
    occurrences = Counter(document.split(' '))
    data_count.append([occurrences[word] for word in ordered_vocab])
  return data_count

In [None]:
# Raw term-count vectors over the selected vocabulary for the train and test articles.
x_train=count_vector(X_train,new_vocab,term_count)
x_test=count_vector(X_test,new_vocab,term_count)

In [None]:
# Re-weight the raw counts with TF-IDF. The transformer is fitted on the training
# matrix only and then applied unchanged to the test matrix.
vector=TfidfTransformer()
x_train=vector.fit_transform(x_train)
x_test=vector.transform(x_test)

**SVM**

In [None]:
from sklearn.svm import SVC
# Display names of the SVC kernels, indexed by the ktype code accepted by getClassifier
kernels = ['Polynomial', 'RBF', 'Sigmoid','Linear']

# SVC constructor arguments for each ktype code (same order as `kernels`)
_SVC_KERNEL_PARAMS = {
    0: {'kernel': 'poly', 'degree': 8, 'gamma': 'auto'},   # Polynomial kernel
    1: {'kernel': 'rbf', 'gamma': 'auto'},                 # Radial Basis Function kernel
    2: {'kernel': 'sigmoid', 'gamma': 'auto'},             # Sigmoid kernel
    3: {'kernel': 'linear', 'gamma': 'auto'},              # Linear kernel
}

def getClassifier(ktype):
    """Return an unfitted SVC for the given kernel code.

    Args:
        ktype: 0 (polynomial, degree 8), 1 (RBF), 2 (sigmoid) or 3 (linear).

    Returns:
        sklearn.svm.SVC configured for that kernel with gamma="auto".

    Raises:
        ValueError: if ``ktype`` is not one of the supported codes. Failing
            here avoids handing ``None`` to the caller, which would only
            surface later as an AttributeError on ``.fit``.
    """
    if ktype not in _SVC_KERNEL_PARAMS:
        raise ValueError(
            "Unknown kernel type %r; expected one of %s"
            % (ktype, sorted(_SVC_KERNEL_PARAMS))
        )
    return SVC(**_SVC_KERNEL_PARAMS[ktype])

In [None]:
# Train and evaluate one SVC per kernel code (0-3); kernels[i] is the matching display name.
for i in range(4):
    svclassifier = getClassifier(i) 
    # Fit on the TF-IDF training matrix
    svclassifier.fit(x_train, Y_train)
    # Make prediction
    y_pred = svclassifier.predict(x_test)
    # Evaluate our model
    print("Evaluation:", kernels[i], "kernel")
    print(classification_report(Y_test,y_pred))
# NOTE(review): in the recorded output the polynomial, RBF and sigmoid models predict only
# NONE (37% accuracy) while the linear kernel reaches 87%. Presumably gamma="auto" is too
# small for these TF-IDF features -- confirm, e.g. by trying gamma="scale".

Evaluation: Polynomial kernel
              precision    recall  f1-score   support

         AAP       0.00      0.00      0.00        22
         BJP       0.00      0.00      0.00        72
    CONGRESS       0.00      0.00      0.00        48
        NONE       0.37      1.00      0.54        83

    accuracy                           0.37       225
   macro avg       0.09      0.25      0.13       225
weighted avg       0.14      0.37      0.20       225



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation: RBF kernel
              precision    recall  f1-score   support

         AAP       0.00      0.00      0.00        22
         BJP       0.00      0.00      0.00        72
    CONGRESS       0.00      0.00      0.00        48
        NONE       0.37      1.00      0.54        83

    accuracy                           0.37       225
   macro avg       0.09      0.25      0.13       225
weighted avg       0.14      0.37      0.20       225



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation: Sigmoid kernel
              precision    recall  f1-score   support

         AAP       0.00      0.00      0.00        22
         BJP       0.00      0.00      0.00        72
    CONGRESS       0.00      0.00      0.00        48
        NONE       0.37      1.00      0.54        83

    accuracy                           0.37       225
   macro avg       0.09      0.25      0.13       225
weighted avg       0.14      0.37      0.20       225

Evaluation: Linear kernel
              precision    recall  f1-score   support

         AAP       0.95      0.95      0.95        22
         BJP       0.79      0.89      0.84        72
    CONGRESS       0.90      0.79      0.84        48
        NONE       0.90      0.87      0.88        83

    accuracy                           0.87       225
   macro avg       0.89      0.88      0.88       225
weighted avg       0.87      0.87      0.87       225



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Classifier Algorithm**

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from matplotlib.colors import ListedColormap
import seaborn as sns
import warnings; warnings.filterwarnings('ignore')
def run_classifier(clf, param_grid, title):
    """Tune ``clf`` with a randomized search and report test-set metrics.

    Reads the notebook-level ``x_train``, ``Y_train``, ``x_test`` and
    ``Y_test`` variables. Prints the best parameters found and the accuracy,
    weighted precision and weighted recall of the best estimator on the test
    split.

    Args:
        clf: unfitted scikit-learn classifier.
        param_grid: dict of parameter name -> candidate values, passed to
            ``RandomizedSearchCV`` as ``param_distributions``.
        title: human-readable model name; currently not used in the body.

    Returns:
        None.
    """
    # -----------------------------------------------------
    cv = StratifiedKFold(n_splits= 3, shuffle = True, random_state= 123)
    # Randomized grid search; seeded so the sampled candidates (and therefore
    # the reported best parameters) are the same on every run.
    n_iter_search = 10
    gs = RandomizedSearchCV(clf,
                            param_distributions = param_grid,
                            n_iter = n_iter_search,
                            cv = cv,
                            scoring= 'accuracy',
                            random_state = 123)
    # -----------------------------------------------------
    # Train model
    gs.fit(x_train, Y_train)
    print("The best parameters are %s" % (gs.best_params_))
    # Predict on test set
    y_pred = gs.best_estimator_.predict(x_test)
    # The probability estimates were never used, and calling predict_proba
    # fails for estimators that do not provide it, so that call was dropped.
    # -----------------------------------------------------
    print('Accuracy score: %.2f%%' %(accuracy_score(Y_test, y_pred)*100))
    print('Precision score: %.2f%%' % (precision_score(Y_test, y_pred, average= 'weighted')*100))
    print('Recall score: %.2f%%' % (recall_score(Y_test, y_pred, average= 'weighted')*100))

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings; the search only varies the solver (penalty is fixed to l2).
lr = LogisticRegression()
param_grid = {'penalty': ['l2'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
run_classifier(lr, param_grid, 'Logistic Regression')

The best parameters are {'solver': 'saga', 'penalty': 'l2'}
Accuracy score: 83.11%
Precision score: 83.75%
Recall score: 83.11%


**KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours; the search varies k (1-14), the vote weighting, the leaf size and the search algorithm.
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1,15), 
             'weights': ['uniform', 'distance'],
             'leaf_size':[1, 3, 5],
             'algorithm':['auto', 'kd_tree']}
run_classifier(knn, param_grid, 'Nearest Neighbors')

The best parameters are {'weights': 'distance', 'n_neighbors': 4, 'leaf_size': 3, 'algorithm': 'auto'}
Accuracy score: 75.11%
Precision score: 75.44%
Recall score: 75.11%


**Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
# Candidate values for the randomized search.
# 'auto' is intentionally not listed for max_features: for classifiers it was an
# alias of 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3, where
# it makes every sampled candidate that uses it fail to fit.
param_grid = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': np.arange(1, 20, 2),
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              'max_features': ['sqrt', 'log2', None]}
run_classifier(dtree, param_grid, "Decision Tree")

The best parameters are {'splitter': 'random', 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 13, 'criterion': 'gini'}
Accuracy score: 79.11%
Precision score: 79.00%
Recall score: 79.11%


**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
# Candidate values for the randomized search.
# 'auto' is intentionally not listed for max_features: for classifiers it was an
# alias of 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3, where
# it makes every sampled candidate that uses it fail to fit.
param_grid = {'n_estimators': [100, 200],
              'max_depth': [10, 20, 100, None],
              'max_features': ['sqrt', None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy']}
run_classifier(rf, param_grid, 'Random Forest')

The best parameters are {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 100, 'criterion': 'gini', 'bootstrap': True}
Accuracy score: 89.33%
Precision score: 89.77%
Recall score: 89.33%


**Random Forest - One Vs All MultiClass Classifier**

In [None]:
from sklearn.multiclass import OneVsRestClassifier
# Random forest with fixed hyper-parameters (the same values as the best parameters printed
# by the random-forest search output earlier in this notebook), wrapped one-vs-rest.
# NOTE(review): no random_state is set, so the scores may differ slightly between runs.
model =RandomForestClassifier(n_estimators= 200, min_samples_split= 2, min_samples_leaf= 1, max_features= 'sqrt', max_depth= 100, criterion= 'gini', bootstrap= True)
ovr = OneVsRestClassifier(model)
ovr.fit(x_train, Y_train)
y_pred = ovr.predict(x_test)
# Rows are true labels and columns are predicted labels.
cm = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix : \n", cm)
print('Accuracy score: %.2f%%' %(accuracy_score(Y_test, y_pred)*100))  
print('Precision score: %.2f%%' % (precision_score(Y_test, y_pred, average= 'weighted')*100))
print('Recall score: %.2f%%' % (recall_score(Y_test, y_pred, average= 'weighted')*100))

Confusion Matrix : 
 [[22  0  0  0]
 [ 0 66  3  3]
 [ 0  5 41  2]
 [ 0  7  1 75]]
Accuracy score: 90.67%
Precision score: 90.88%
Recall score: 90.67%


**Dumping the Model**

In [None]:
import pickle
filename = '/content/drive/MyDrive/IR PROJECT/RFC_OneVsAll.sav'
# Persist the fitted one-vs-rest model. The context manager guarantees the file
# is flushed and closed even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(ovr, model_file)

**Random Forest - One Vs One MultiClass Classifier**

In [None]:
from sklearn.multiclass import OneVsOneClassifier
# Same fixed random-forest configuration as the one-vs-rest cell, here wrapped one-vs-one.
# NOTE(review): no random_state is set, so the scores may differ slightly between runs.
model =RandomForestClassifier(n_estimators= 200, min_samples_split= 2, min_samples_leaf= 1, max_features='sqrt', max_depth= 100, criterion= 'gini', bootstrap= True)
ovo = OneVsOneClassifier(model)
ovo.fit(x_train, Y_train)
y_pred = ovo.predict(x_test)
# Rows are true labels and columns are predicted labels.
cm = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix : \n", cm)
print('Accuracy score: %.2f%%' %(accuracy_score(Y_test, y_pred)*100))  
print('Precision score: %.2f%%' % (precision_score(Y_test, y_pred, average= 'weighted')*100))
print('Recall score: %.2f%%' % (recall_score(Y_test, y_pred, average= 'weighted')*100))

Confusion Matrix : 
 [[22  0  0  0]
 [ 0 67  2  3]
 [ 0  5 40  3]
 [ 0  8  1 74]]
Accuracy score: 90.22%
Precision score: 90.54%
Recall score: 90.22%
