In [31]:
import csv
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [14]:
df = pd.read_excel('CleanedCombinedLabelled.xlsx')
df

Unnamed: 0,TEXT,TAG
0,sir without mask vaccine keise le sakte hai,NAG
1,nimsjosh ghanta gopal modi teri chodi,OAG
2,tum se yuwao ko bahut pareshani ho rahi,NAG
3,aaj khud virus ne vaccine li,CAG
4,sir kbhi apne promise k bare m v baat kar lij...,CAG
...,...,...
3454,ju madarchod ubeisa mulla sala harami kutte k...,OAG
3455,jo ram mandir machhali ke khilaf mein bol raha...,CAG
3456,modi ko kya kisi ko nahi jana cahiye kyu bhai ...,NAG
3457,chal bhaag bhosree k jyada gyan na pel khu...,OAG


In [18]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

df["TOKENS"] = df["TEXT"].map(tokenizer.tokenize)

In [19]:
from sklearn.model_selection import train_test_split

X = df['TEXT']
y = df['TAG']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

In [20]:
print(X_train.shape[0])
print(X_test.shape[0])

2767
692


# Applying TF-IDF character n-gram for Feature Extraction

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3,4))

X_train_tfidf_34 = vectorizer.fit_transform(X_train)
X_test_tfidf_34 = vectorizer.transform(X_test)

print(X_train.shape)
print(X_train_tfidf_34.shape)

(2767,)
(2767, 28234)


# Applying Support Vector Machine

In [25]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],'kernel': ['rbf']} 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

grid.fit(X_train_tfidf_34, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.477, total=  14.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.5s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.477, total=  13.9s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   28.4s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.477, total=  14.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.477, total=  14.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.479, total=  14.2s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.477, total=  12.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.477, total=  12.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.477, total=  12.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.477, total=  13.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.480, total=  12.6s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.496, total=  13.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.490, total=  12.9s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.486, total=  12.7s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.492, total=  12.9s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.477, total=  12.4s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.477, total=  12.6s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.575, total=  13.5s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.609, total=  14.1s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.480, total=  14.2s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.496, total=  12.8s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.494, total=  12.8s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.486, total=  12.8s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.494, total=  12.9s


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 27.2min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [27]:
grid_svm = grid

In [30]:
grid_predictions_svm = grid_svm.predict(X_test_tfidf_34)

In [32]:
print(accuracy_score(y_test, grid_predictions_svm))
print(confusion_matrix(y_test, grid_predictions_svm))
print(classification_report(y_test, grid_predictions_svm))

0.6127167630057804
[[ 65 122  18]
 [ 38 305   4]
 [ 46  40  54]]
              precision    recall  f1-score   support

         CAG       0.44      0.32      0.37       205
         NAG       0.65      0.88      0.75       347
         OAG       0.71      0.39      0.50       140

    accuracy                           0.61       692
   macro avg       0.60      0.53      0.54       692
weighted avg       0.60      0.61      0.59       692



In [71]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],'kernel': ['linear']} 
grid_l = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

grid_l.fit(X_train_tfidf_34, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.477, total=  13.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.0s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.477, total=  13.6s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   26.6s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.477, total=  12.4s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.477, total=  12.8s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.479, total=  12.9s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.477, total=  12.5s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.477, total=  12.3s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.477, total=  12.5s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.477, total=  12.5s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .

[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.542, total=  13.0s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.561, total=  13.1s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.591, total=  13.3s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.588, total=  13.3s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.608, total=  13.5s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.542, total=  13.1s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.561, total=  13.1s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .

[CV] .. C=1000, gamma=0.001, kernel=linear, score=0.582, total=  13.2s
[CV] C=1000, gamma=0.001, kernel=linear ..............................
[CV] .. C=1000, gamma=0.001, kernel=linear, score=0.584, total=  13.4s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.527, total=  13.0s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.574, total=  13.1s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.566, total=  13.2s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.582, total=  13.2s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.584, total=  13.3s


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 27.1min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear']},
             verbose=3)

In [72]:
grid_svm_linear = grid_l
grid_svm_linear

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear']},
             verbose=3)

In [75]:
grid_predictions_svm_l = grid_svm_linear.predict(X_test_tfidf_34)

In [76]:
print(accuracy_score(y_test, grid_predictions_svm_l))
print(confusion_matrix(y_test, grid_predictions_svm_l))
print(classification_report(y_test, grid_predictions_svm_l))

0.611271676300578
[[ 72 114  19]
 [ 48 296   3]
 [ 48  37  55]]
              precision    recall  f1-score   support

         CAG       0.43      0.35      0.39       205
         NAG       0.66      0.85      0.75       347
         OAG       0.71      0.39      0.51       140

    accuracy                           0.61       692
   macro avg       0.60      0.53      0.55       692
weighted avg       0.60      0.61      0.59       692



# Applying XGBoost

In [34]:
from xgboost import XGBClassifier

clf1 = XGBClassifier()
clf1.fit(X_train_tfidf_34, y_train)

xg_predictions = clf1.predict(X_test_tfidf_34)



In [35]:
print(accuracy_score(y_test, xg_predictions))
print(confusion_matrix(y_test, xg_predictions))
print(classification_report(y_test, xg_predictions))

0.5852601156069365
[[ 70 112  23]
 [ 65 276   6]
 [ 51  30  59]]
              precision    recall  f1-score   support

         CAG       0.38      0.34      0.36       205
         NAG       0.66      0.80      0.72       347
         OAG       0.67      0.42      0.52       140

    accuracy                           0.59       692
   macro avg       0.57      0.52      0.53       692
weighted avg       0.58      0.59      0.57       692



# Applying Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', max_iter=10000)
lr.fit(X_train_tfidf_34, y_train)

lr_predictions = lr.predict(X_test_tfidf_34)

In [38]:
print(accuracy_score(y_test, lr_predictions))
print(confusion_matrix(y_test, lr_predictions))
print(classification_report(y_test, lr_predictions))

0.6098265895953757
[[ 55 133  17]
 [ 28 316   3]
 [ 42  47  51]]
              precision    recall  f1-score   support

         CAG       0.44      0.27      0.33       205
         NAG       0.64      0.91      0.75       347
         OAG       0.72      0.36      0.48       140

    accuracy                           0.61       692
   macro avg       0.60      0.51      0.52       692
weighted avg       0.60      0.61      0.57       692



# Applying KNeighbors

In [41]:
from sklearn.neighbors import KNeighborsClassifier

knear_clf_5 = KNeighborsClassifier(n_neighbors=5)
knear_clf_5.fit(X_train_tfidf_34, y_train)

knear_predictions_5 = knear_clf_5.predict(X_test_tfidf_34)

In [42]:
print(accuracy_score(y_test, knear_predictions_5))
print(confusion_matrix(y_test, knear_predictions_5))
print(classification_report(y_test, knear_predictions_5))

0.5245664739884393
[[ 24 176   5]
 [ 25 321   1]
 [ 23  99  18]]
              precision    recall  f1-score   support

         CAG       0.33      0.12      0.17       205
         NAG       0.54      0.93      0.68       347
         OAG       0.75      0.13      0.22       140

    accuracy                           0.52       692
   macro avg       0.54      0.39      0.36       692
weighted avg       0.52      0.52      0.44       692



# Applying Multinomial Naive Bayes

In [46]:
from sklearn.naive_bayes import MultinomialNB

mnb_clf = MultinomialNB()
mnb_clf.fit(X_train_tfidf_34, y_train)
mnb_predictions = mnb_clf.predict(X_test_tfidf_34)

In [47]:
print(accuracy_score(y_test, mnb_predictions))
print(confusion_matrix(y_test, mnb_predictions))
print(classification_report(y_test, mnb_predictions))

0.5274566473988439
[[  8 197   0]
 [  2 345   0]
 [ 17 111  12]]
              precision    recall  f1-score   support

         CAG       0.30      0.04      0.07       205
         NAG       0.53      0.99      0.69       347
         OAG       1.00      0.09      0.16       140

    accuracy                           0.53       692
   macro avg       0.61      0.37      0.31       692
weighted avg       0.56      0.53      0.40       692



# Applying Stocastic Gradient Descent

In [50]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=100)
sgd_clf.fit(X_train_tfidf_34, y_train)
sgd_predictions_100_l2_mh = sgd_clf.predict(X_test_tfidf_34)

In [51]:
print(accuracy_score(y_test, sgd_predictions_100_l2_mh))
print(confusion_matrix(y_test, sgd_predictions_100_l2_mh))
print(classification_report(y_test, sgd_predictions_100_l2_mh))

0.5953757225433526
[[ 88  89  28]
 [ 72 262  13]
 [ 49  29  62]]
              precision    recall  f1-score   support

         CAG       0.42      0.43      0.43       205
         NAG       0.69      0.76      0.72       347
         OAG       0.60      0.44      0.51       140

    accuracy                           0.60       692
   macro avg       0.57      0.54      0.55       692
weighted avg       0.59      0.60      0.59       692



In [52]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=10)
sgd_clf.fit(X_train_tfidf_34, y_train)
sgd_predictions_10_l2_mh = sgd_clf.predict(X_test_tfidf_34)

In [53]:
print(accuracy_score(y_test, sgd_predictions_10_l2_mh))
print(confusion_matrix(y_test, sgd_predictions_10_l2_mh))
print(classification_report(y_test, sgd_predictions_10_l2_mh))

0.6011560693641619
[[ 82  96  27]
 [ 64 273  10]
 [ 48  31  61]]
              precision    recall  f1-score   support

         CAG       0.42      0.40      0.41       205
         NAG       0.68      0.79      0.73       347
         OAG       0.62      0.44      0.51       140

    accuracy                           0.60       692
   macro avg       0.58      0.54      0.55       692
weighted avg       0.59      0.60      0.59       692

