In [1]:
import csv
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
df = pd.read_excel('CleanedCombinedLabelled.xlsx')
df

Unnamed: 0,TEXT,TAG
0,sir without mask vaccine keise le sakte hai,NAG
1,nimsjosh ghanta gopal modi teri chodi,OAG
2,tum se yuwao ko bahut pareshani ho rahi,NAG
3,aaj khud virus ne vaccine li,CAG
4,sir kbhi apne promise k bare m v baat kar lij...,CAG
...,...,...
3454,ju madarchod ubeisa mulla sala harami kutte k...,OAG
3455,jo ram mandir machhali ke khilaf mein bol raha...,CAG
3456,modi ko kya kisi ko nahi jana cahiye kyu bhai ...,NAG
3457,chal bhaag bhosree k jyada gyan na pel khu...,OAG


In [3]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

df["TOKENS"] = df["TEXT"].map(tokenizer.tokenize)

In [4]:
from sklearn.model_selection import train_test_split

X = df['TEXT']
y = df['TAG']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

In [5]:
print(X_train.shape[0])
print(X_test.shape[0])

2767
692


# Applying Count Vectorizer character n-gram for Feature Extraction

In [7]:
from sklearn.feature_extraction.text import CountVectorizer


vectorizer = CountVectorizer(analyzer='char', ngram_range=(4,5))
X_train_count_22 = vectorizer.fit_transform(X_train)
X_test_count_22 = vectorizer.transform(X_test)


# Applying Support Vector Machine

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01, 0.001],'kernel': ['rbf']} 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

grid.fit(X_train_count_22, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.477, total=  17.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.5s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.477, total=  17.3s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   34.8s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.477, total=  16.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.477, total=  15.7s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.479, total=  14.8s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.477, total=  15.2s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.477, total=  16.4s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.477, total=  15.2s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.477, total=  15.3s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 17.3min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf']},
             verbose=3)

In [9]:
grid_svm_rbf = grid

In [10]:
grid_predictions_svm_rbf = grid_svm_rbf.predict(X_test_count_22)

In [11]:
print(accuracy_score(y_test, grid_predictions_svm_rbf))
print(confusion_matrix(y_test, grid_predictions_svm_rbf))
print(classification_report(y_test, grid_predictions_svm_rbf))

0.6141618497109826
[[ 76 112  17]
 [ 47 296   4]
 [ 43  44  53]]
              precision    recall  f1-score   support

         CAG       0.46      0.37      0.41       205
         NAG       0.65      0.85      0.74       347
         OAG       0.72      0.38      0.50       140

    accuracy                           0.61       692
   macro avg       0.61      0.53      0.55       692
weighted avg       0.61      0.61      0.59       692



In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01, 0.001],'kernel': ['linear']} 
grid_l = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

grid_l.fit(X_train_count_22, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.567, total=  14.9s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.8s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.565, total=  14.5s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   29.3s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.580, total=  15.3s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.571, total=  14.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.622, total=  14.0s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.567, total=  14.7s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.565, total=  13.5s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.580, total=  13.4s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.571, total=  13.4s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 13.5min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['linear']},
             verbose=3)

In [13]:
grid_svm_linear = grid_l

In [14]:
grid_predictions_svm_l = grid_svm_linear.predict(X_test_count_22)

In [15]:
print(accuracy_score(y_test, grid_predictions_svm_l))
print(confusion_matrix(y_test, grid_predictions_svm_l))
print(classification_report(y_test, grid_predictions_svm_l))

0.5939306358381503
[[ 91  89  25]
 [ 68 270   9]
 [ 54  36  50]]
              precision    recall  f1-score   support

         CAG       0.43      0.44      0.44       205
         NAG       0.68      0.78      0.73       347
         OAG       0.60      0.36      0.45       140

    accuracy                           0.59       692
   macro avg       0.57      0.53      0.54       692
weighted avg       0.59      0.59      0.58       692



# Applying XGBoost

In [19]:
from xgboost import XGBClassifier

xg_clf = XGBClassifier()
xg_clf.fit(X_train_count_22, y_train)

xg_predictions = xg_clf.predict(X_test_count_22)





In [20]:
print(accuracy_score(y_test, xg_predictions))
print(confusion_matrix(y_test, xg_predictions))
print(classification_report(y_test, xg_predictions))

0.5982658959537572
[[ 68 116  21]
 [ 48 288  11]
 [ 43  39  58]]
              precision    recall  f1-score   support

         CAG       0.43      0.33      0.37       205
         NAG       0.65      0.83      0.73       347
         OAG       0.64      0.41      0.50       140

    accuracy                           0.60       692
   macro avg       0.57      0.53      0.54       692
weighted avg       0.58      0.60      0.58       692



# Applying Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', max_iter=10000)
lr.fit(X_train_count_22, y_train)

lr_predictions = lr.predict(X_test_count_22)

In [22]:
print(accuracy_score(y_test, lr_predictions))
print(confusion_matrix(y_test, lr_predictions))
print(classification_report(y_test, lr_predictions))

0.619942196531792
[[ 88  94  23]
 [ 55 283   9]
 [ 44  38  58]]
              precision    recall  f1-score   support

         CAG       0.47      0.43      0.45       205
         NAG       0.68      0.82      0.74       347
         OAG       0.64      0.41      0.50       140

    accuracy                           0.62       692
   macro avg       0.60      0.55      0.57       692
weighted avg       0.61      0.62      0.61       692



# Applying KNeighbors

In [25]:
from sklearn.neighbors import KNeighborsClassifier

knear_clf_5 = KNeighborsClassifier(n_neighbors=5)
knear_clf_5.fit(X_train_count_22, y_train)

knear_predictions_5 = knear_clf_5.predict(X_test_count_22)

In [26]:
print(accuracy_score(y_test, knear_predictions_5))
print(confusion_matrix(y_test, knear_predictions_5))
print(classification_report(y_test, knear_predictions_5))

0.5072254335260116
[[  4 201   0]
 [  1 346   0]
 [  3 136   1]]
              precision    recall  f1-score   support

         CAG       0.50      0.02      0.04       205
         NAG       0.51      1.00      0.67       347
         OAG       1.00      0.01      0.01       140

    accuracy                           0.51       692
   macro avg       0.67      0.34      0.24       692
weighted avg       0.60      0.51      0.35       692



# Applying Multinomial Naive Bayes

In [29]:
from sklearn.naive_bayes import MultinomialNB

mnb_clf = MultinomialNB()
mnb_clf.fit(X_train_count_22, y_train)
mnb_predictions = mnb_clf.predict(X_test_count_22)

In [30]:
print(accuracy_score(y_test, mnb_predictions))
print(confusion_matrix(y_test, mnb_predictions))
print(classification_report(y_test, mnb_predictions))

0.6026011560693642
[[ 83  91  31]
 [ 59 265  23]
 [ 40  31  69]]
              precision    recall  f1-score   support

         CAG       0.46      0.40      0.43       205
         NAG       0.68      0.76      0.72       347
         OAG       0.56      0.49      0.52       140

    accuracy                           0.60       692
   macro avg       0.57      0.55      0.56       692
weighted avg       0.59      0.60      0.60       692



# Applying Stocastic Gradient Descent

In [31]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=100)
sgd_clf.fit(X_train_count_22, y_train)
sgd_predictions_100_l2_mh = sgd_clf.predict(X_test_count_22)

In [32]:
print(accuracy_score(y_test, sgd_predictions_100_l2_mh))
print(confusion_matrix(y_test, sgd_predictions_100_l2_mh))
print(classification_report(y_test, sgd_predictions_100_l2_mh))

0.6026011560693642
[[  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0  70 107  28]
 [  3   0  48 286  10]
 [  0   2  34  43  61]]
              precision    recall  f1-score   support

         CAG       0.00      0.00      0.00         0
         NAG       0.00      0.00      0.00         0
         CAG       0.46      0.34      0.39       205
         NAG       0.66      0.82      0.73       347
         OAG       0.62      0.44      0.51       140

    accuracy                           0.60       692
   macro avg       0.35      0.32      0.33       692
weighted avg       0.59      0.60      0.59       692



  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=10)
sgd_clf.fit(X_train_count_22, y_train)
sgd_predictions_10_l2_mh = sgd_clf.predict(X_test_count_22)



In [34]:
print(accuracy_score(y_test, sgd_predictions_10_l2_mh))
print(confusion_matrix(y_test, sgd_predictions_10_l2_mh))
print(classification_report(y_test, sgd_predictions_10_l2_mh))

0.5924855491329479
[[  0   0   0   0]
 [  7  80  77  41]
 [  4  67 262  14]
 [  4  38  30  68]]
              precision    recall  f1-score   support

         CAG       0.00      0.00      0.00         0
         CAG       0.43      0.39      0.41       205
         NAG       0.71      0.76      0.73       347
         OAG       0.55      0.49      0.52       140

    accuracy                           0.59       692
   macro avg       0.42      0.41      0.41       692
weighted avg       0.60      0.59      0.59       692

