In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import numpy as np
import math
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
data=pd.read_excel('/content/drive/MyDrive/Annotated_Article_Dataset.xlsx')
data=data.dropna()

In [None]:
#Preprocessing the Dataset
def Preprocessing(text):

  #converting into lower case
  text=text.lower()

  #lemmatization
  lemmatizer = WordNetLemmatizer()
  tokens = nltk.word_tokenize(text)
  new_tokens=[] 
  #applying lemmatization
  for token in tokens:
    new_tokens.append(lemmatizer.lemmatize(token))

  #Removing stopwords and Punctuation
  stop = stopwords.words('english') + list(string.punctuation) + ["''","``",".."]
  preprocessed = " ".join(i for i in new_tokens if i not in stop)
  return preprocessed


In [None]:
def preprocess(raw_data):
  #articles
  articles=[]
  #label assigned to them
  labels=[]
  #Iterating through articles & label
  for article,label in zip(raw_data.Article.iloc,raw_data.Annotation.iloc):    
    article=Preprocessing(article)
    articles.append(article)  
    labels.append(label.upper())
  Tuples = list(zip(articles, labels))  
  return pd.DataFrame(Tuples, columns = ['Article', 'label']) 


In [None]:
df=preprocess(data)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,classification_report,accuracy_score
X_train, X_test, Y_train, Y_test = train_test_split(df['Article'],df['label'], test_size=0.25,random_state=42,shuffle=True,stratify=df['label'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
#tf idf vectorizer
vector=TfidfVectorizer(analyzer='word',ngram_range=(1,1))

In [None]:
x_train=vector.fit_transform(X_train)
x_test=vector.transform(X_test)

**SVM**

In [None]:
kernels = ['Polynomial', 'RBF', 'Sigmoid','Linear']#A function which returns the corresponding SVC model
def getClassifier(ktype):
    if ktype == 0:
        # Polynomial kernal
        return SVC(kernel='poly', degree=8, gamma="auto")
    elif ktype == 1:
        # Radial Basis Function kernal
        return SVC(kernel='rbf', gamma="auto")
    elif ktype == 2:
        # Sigmoid kernal
        return SVC(kernel='sigmoid', gamma="auto")
    elif ktype == 3:
        # Linear kernal
        return SVC(kernel='linear', gamma="auto")

In [None]:
for i in range(4):
    svclassifier = getClassifier(i) 
    svclassifier.fit(x_train, Y_train)# Make prediction
    y_pred = svclassifier.predict(x_test)# Evaluate our model
    print("Evaluation:", kernels[i], "kernel")
    print(classification_report(Y_test,y_pred))

Evaluation: Polynomial kernel
              precision    recall  f1-score   support

         AAP       0.00      0.00      0.00        22
         BJP       0.00      0.00      0.00        72
    CONGRESS       0.00      0.00      0.00        48
        NONE       0.37      1.00      0.54        83

    accuracy                           0.37       225
   macro avg       0.09      0.25      0.13       225
weighted avg       0.14      0.37      0.20       225

Evaluation: RBF kernel
              precision    recall  f1-score   support

         AAP       0.00      0.00      0.00        22
         BJP       0.00      0.00      0.00        72
    CONGRESS       0.00      0.00      0.00        48
        NONE       0.37      1.00      0.54        83

    accuracy                           0.37       225
   macro avg       0.09      0.25      0.13       225
weighted avg       0.14      0.37      0.20       225

Evaluation: Sigmoid kernel
              precision    recall  f1-score   supp

**Classifier Algorithm**

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from matplotlib.colors import ListedColormap
import seaborn as sns
import warnings; warnings.filterwarnings('ignore')
def run_classifier(clf, param_grid, title):
    # -----------------------------------------------------
    cv = StratifiedKFold(n_splits= 3, shuffle = True, random_state= 123)
    # Randomized grid search
    n_iter_search = 10
    gs = RandomizedSearchCV(clf, 
                            param_distributions = param_grid,
                            n_iter = n_iter_search, 
                            cv = cv,                 
                            scoring= 'accuracy')
    # -----------------------------------------------------
    # Train model
    gs.fit(x_train, Y_train)  
    print("The best parameters are %s" % (gs.best_params_)) 
    # Predict on test set
    y_pred = gs.best_estimator_.predict(x_test)
    # Get Probability estimates
    y_prob = gs.best_estimator_.predict_proba(x_test)[:, 1]
    # -----------------------------------------------------
    print('Accuracy score: %.2f%%' %(accuracy_score(Y_test, y_pred)*100))  
    print('Precision score: %.2f%%' % (precision_score(Y_test, y_pred, average= 'weighted')*100))
    print('Recall score: %.2f%%' % (recall_score(Y_test, y_pred, average= 'weighted')*100))

**Logisitic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
param_grid = {'penalty': ['l2'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
run_classifier(lr, param_grid, 'Logistic Regression')

The best parameters are {'solver': 'saga', 'penalty': 'l2'}
Accuracy score: 76.44%
Precision score: 77.89%
Recall score: 76.44%


**KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1,15), 
             'weights': ['uniform', 'distance'],
             'leaf_size':[1, 3, 5],
             'algorithm':['auto', 'kd_tree']}
run_classifier(knn, param_grid, 'Nearest Neighbors')

The best parameters are {'weights': 'uniform', 'n_neighbors': 14, 'leaf_size': 5, 'algorithm': 'auto'}
Accuracy score: 76.44%
Precision score: 77.85%
Recall score: 76.44%


**Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
param_grid = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': np.arange(1, 20, 2),
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              'max_features': ['auto', 'sqrt', 'log2', None]}
run_classifier(dtree, param_grid, "Decision Tree")

The best parameters are {'splitter': 'random', 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 9, 'criterion': 'gini'}
Accuracy score: 75.11%
Precision score: 75.93%
Recall score: 75.11%


**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
param_grid = {'n_estimators': [100, 200],
              'max_depth': [10, 20, 100, None],
              'max_features': ['auto', 'sqrt', None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy']}
run_classifier(rf, param_grid, 'Random Forest')

The best parameters are {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 100, 'criterion': 'gini', 'bootstrap': True}
Accuracy score: 84.89%
Precision score: 85.18%
Recall score: 84.89%


**Random Forest - One Vs All MultiClass Classifier**

In [None]:
from sklearn.multiclass import OneVsRestClassifier
model =RandomForestClassifier(n_estimators= 200, min_samples_split= 2, min_samples_leaf= 2, max_features= None, max_depth= 100, criterion= 'gini', bootstrap= True)
ovr = OneVsRestClassifier(model)
ovr.fit(x_train, Y_train)
y_pred = ovr.predict(x_test)
cm = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix : \n", cm)
print('Accuracy score: %.2f%%' %(accuracy_score(Y_test, y_pred)*100))  
print('Precision score: %.2f%%' % (precision_score(Y_test, y_pred, average= 'weighted')*100))
print('Recall score: %.2f%%' % (recall_score(Y_test, y_pred, average= 'weighted')*100))

Confusion Matrix : 
 [[22  0  0  0]
 [ 0 67  3  2]
 [ 0  3 38  7]
 [ 1  8  3 71]]
Accuracy score: 88.00%
Precision score: 88.00%
Recall score: 88.00%


In [None]:
import pickle
filename = '/content/drive/MyDrive/IR PROJECT/RFC_SplitDataset.sav'
pickle.dump(ovr, open(filename, 'wb'))

**Random Forest - One Vs One MultiClass Classifier**

In [None]:
from sklearn.multiclass import OneVsOneClassifier
model =RandomForestClassifier(n_estimators= 200, min_samples_split= 2, min_samples_leaf= 2, max_features= None, max_depth= 100, criterion= 'gini', bootstrap= True)
ovo = OneVsOneClassifier(model)
ovo.fit(x_train, Y_train)
y_pred = ovo.predict(x_test)
cm = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix : \n", cm)
print('Accuracy score: %.2f%%' %(accuracy_score(Y_test, y_pred)*100))  
print('Precision score: %.2f%%' % (precision_score(Y_test, y_pred, average= 'weighted')*100))
print('Recall score: %.2f%%' % (recall_score(Y_test, y_pred, average= 'weighted')*100))

Confusion Matrix : 
 [[21  0  0  1]
 [ 0 53  8 11]
 [ 0  5 39  4]
 [ 0  8  2 73]]
Accuracy score: 82.67%
Precision score: 82.71%
Recall score: 82.67%


**Dumping the Model**

In [None]:
X=df['Article']
Y=df['label']
X=vector.fit_transform(X)

In [None]:
ovr.fit(X, Y)

OneVsRestClassifier(estimator=RandomForestClassifier(max_depth=100,
                                                     max_features=None,
                                                     min_samples_leaf=2,
                                                     n_estimators=200))

In [None]:
import pickle
filename = '/content/drive/MyDrive/IR PROJECT/TrainingDataset.sav'
pickle.dump(X, open(filename, 'wb'))

In [None]:
import pickle
filename = '/content/drive/MyDrive/IR PROJECT/RFC_Dataset.sav'
pickle.dump(ovr, open(filename, 'wb'))