## Packages 

In [8]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score,KFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier

In [9]:
np.random.seed(500)

## Dataset Load

In [10]:
Corpus =pd.read_csv('/home/anjir29/Desktop/Covid_Vaccine_Sentiment/raw_data/data.csv')

In [11]:
Corpus=Corpus.drop(['Timestamp', 'Gender','Age','Did you take COVID 19 vaccine?',], axis = 1)

In [12]:
Corpus

Unnamed: 0,outcome,Sentiment
0,Yes,It will keep us safe from Covid-19. I think ev...
1,Yes,We should be Taking vaccines as it reduces the...
2,Yes,vaccines are helpful for us. We should take a ...
3,Yes,Because this vaccine has around 80 to 90% accu...
4,Yes,It will be good for humanity.
...,...,...
1403,No,Vaccines are not 100% safe.
1404,No,"After getting a vaccine, people are too weak."
1405,No,As the government prepares vaccination centers...
1406,No,Since commencing the COVID vaccination program...


## Data Pre-Processing

In [13]:
Corpus['Sentiment'].dropna(inplace=True)
Corpus['Sentiment'] = [entry.lower() for entry in Corpus['Sentiment']]
Corpus['Sentiment']= [word_tokenize(entry) for entry in Corpus['Sentiment']]

In [14]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [15]:
for index,entry in enumerate(Corpus['Sentiment']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

## Train-Test Split

In [16]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['outcome'],test_size=0.2)

In [17]:
Train_Y.size,Test_Y.size

(1126, 282)

In [18]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [19]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

# Naive Bayes

In [20]:
cv = KFold(n_splits=4, random_state=1, shuffle=True)
model = naive_bayes.MultinomialNB()

In [21]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores

array([0.77304965, 0.75531915, 0.79715302, 0.70818505])

In [22]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='precision', cv=cv, n_jobs=-1)
scores

array([0.75      , 0.70430108, 0.79190751, 0.63829787])

In [23]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='recall', cv=cv, n_jobs=-1)
scores

array([0.89240506, 0.90344828, 0.86708861, 0.89552239])

In [24]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='f1', cv=cv, n_jobs=-1)
scores

array([0.8150289 , 0.79154079, 0.82779456, 0.74534161])

In [25]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)# predict the labels on validation dataset

MultinomialNB()

In [26]:
predictions_NB = Naive.predict(Test_X_Tfidf)# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)
print("Naive Bayes Accuracy Score -> ",precision_score(predictions_NB, Test_Y)*100)
print("Naive Bayes Accuracy Score -> ",recall_score(predictions_NB, Test_Y)*100)
print("Naive Bayes Accuracy Score -> ",f1_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  80.85106382978722
Naive Bayes Accuracy Score ->  91.02564102564102
Naive Bayes Accuracy Score ->  78.02197802197803
Naive Bayes Accuracy Score ->  84.02366863905326


In [27]:
print("Classification report for - \n{}:\n{}\n".format(
    predictions_NB, metrics.classification_report(predictions_NB, Test_Y)))

Classification report for - 
[1 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 1 1 1
 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0
 1 1 1 1 0 1 1 0 1 0 0 1 1 1 1 0 1 0 0 0 0 1 1 1 1 0 1 1 0 1 1 1 0 1 1 0 1
 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 0 1 1 1 1 1 0 1 0 1 0 0 1 0 1 0 0 1 1 1 0 1
 1 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 1 1 0 1 1 1 1 1 1 0 1 0
 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 1
 0 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1]:
              precision    recall  f1-score   support

           0       0.68      0.86      0.76       100
           1       0.91      0.78      0.84       182

    accuracy                           0.81       282
   macro avg       0.80      0.82      0.80       282
weighted avg       0.83      0.81      0.81       282




# SVM

In [28]:
cv = KFold(n_splits=4, random_state=1, shuffle=True)
model = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')

In [29]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores

array([0.79078014, 0.76595745, 0.79359431, 0.76868327])

In [30]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='precision', cv=cv, n_jobs=-1)
scores

array([0.84137931, 0.79259259, 0.84722222, 0.74468085])

In [31]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='recall', cv=cv, n_jobs=-1)
scores

array([0.7721519 , 0.73793103, 0.7721519 , 0.78358209])

In [32]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='f1', cv=cv, n_jobs=-1)
scores

array([0.80528053, 0.76428571, 0.80794702, 0.76363636])

In [33]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)# predict the labels on validation dataset

SVC(gamma='auto', kernel='linear')

In [34]:
predictions_SVM = SVM.predict(Test_X_Tfidf)# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  79.43262411347519


In [35]:
print("Classification report for - \n{}:\n{}\n".format(
    predictions_SVM, metrics.classification_report(predictions_SVM, Test_Y)))

Classification report for - 
[1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 1 1 1
 1 1 1 0 1 1 0 1 1 0 0 1 0 0 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 0
 1 1 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 0 0 0 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 1
 1 0 0 1 0 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 0 1
 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1
 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 1 1 0 0 1 0 1 1 0 0 1 1 1 1 1 1 0 1 0
 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1
 0 1 0 1 0 0 1 0 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1]:
              precision    recall  f1-score   support

           0       0.84      0.74      0.79       144
           1       0.76      0.86      0.80       138

    accuracy                           0.79       282
   macro avg       0.80      0.80      0.79       282
weighted avg       0.80      0.79      0.79       282




# KNN

In [36]:
cv = KFold(n_splits=4, random_state=1, shuffle=True)
model = KNeighborsClassifier(n_neighbors=5)

In [37]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores

array([0.75886525, 0.71985816, 0.72953737, 0.70106762])

In [38]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='precision', cv=cv, n_jobs=-1)
scores

array([0.75862069, 0.69186047, 0.74404762, 0.65060241])

In [39]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='recall', cv=cv, n_jobs=-1)
scores

array([0.83544304, 0.82068966, 0.79113924, 0.80597015])

In [40]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='f1', cv=cv, n_jobs=-1)
scores

array([0.79518072, 0.75078864, 0.76687117, 0.72      ])

In [41]:
modelknn = KNeighborsClassifier(n_neighbors=5)
modelknn.fit(Train_X_Tfidf,Train_Y)

KNeighborsClassifier()

In [42]:
predictions_KNN = modelknn.predict(Test_X_Tfidf)# Use accuracy_score function to get the accuracy
print("KNN Accuracy Score -> ",accuracy_score(predictions_KNN, Test_Y)*100)

KNN Accuracy Score ->  74.822695035461


In [43]:
print("Classification report for - \n{}:\n{}\n".format(
    predictions_KNN, metrics.classification_report(predictions_KNN, Test_Y)))

Classification report for - 
[1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 0 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 1 0
 1 1 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 0 0 0 0 1 1 1 1 0 1 1 0 1 1 1 1 0 0 0 1
 1 0 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1
 0 0 0 1 0 0 1 1 0 0 1 0 1 1 0 1 1 1 1 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 1 0 1
 1 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 0 1 0
 0 1 1 1 0 0 1 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 1 1 1 0 1 0 1 0 0 0 1 1
 1 1 0 1 1 0 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1]:
              precision    recall  f1-score   support

           0       0.63      0.76      0.69       105
           1       0.84      0.74      0.79       177

    accuracy                           0.75       282
   macro avg       0.74      0.75      0.74       282
weighted avg       0.76      0.75      0.75       282




# Decision Tree

In [44]:
cv = KFold(n_splits=4, random_state=1, shuffle=True)
model = DecisionTreeClassifier()

In [45]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores

array([0.68085106, 0.63120567, 0.6975089 , 0.66192171])

In [46]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='precision', cv=cv, n_jobs=-1)
scores

array([0.77272727, 0.6744186 , 0.71428571, 0.63636364])

In [47]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='recall', cv=cv, n_jobs=-1)
scores

array([0.64556962, 0.62758621, 0.7278481 , 0.67164179])

In [48]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='f1', cv=cv, n_jobs=-1)
scores

array([0.71864407, 0.63768116, 0.70926518, 0.65201465])

In [49]:
dt = DecisionTreeClassifier()
dt.fit(Train_X_Tfidf,Train_Y)

DecisionTreeClassifier()

In [50]:
y_pred_dt = dt.predict(Test_X_Tfidf)
print("DT Accuracy Score -> ",accuracy_score(y_pred_dt, Test_Y)*100)

DT Accuracy Score ->  67.37588652482269


In [51]:
print("Classification report for - \n{}:\n{}\n".format(
    y_pred_dt, metrics.classification_report(y_pred_dt, Test_Y)))

Classification report for - 
[1 0 0 1 1 1 0 0 0 1 1 1 0 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 1 0 0
 1 1 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 0
 1 1 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1 0 0 0 0 1 1 1 1 0 0 1 0 1 1 1 0 0 0 0 1
 0 1 0 0 1 1 1 1 1 1 0 1 0 0 0 0 1 1 0 1 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 1 1
 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 1 1 0 0 1 1 1 1 1 1 0 0 1 0 1 1 0 0 0
 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1]:
              precision    recall  f1-score   support

           0       0.70      0.62      0.66       142
           1       0.65      0.73      0.69       140

    accuracy                           0.67       282
   macro avg       0.68      0.67      0.67       282
weighted avg       0.68      0.67      0.67       282




# Random Forest

In [52]:
cv = KFold(n_splits=4, random_state=1, shuffle=True)
model = RandomForestClassifier(n_estimators=100, random_state=0)

In [53]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores

array([0.74822695, 0.74468085, 0.74733096, 0.74733096])

In [54]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='precision', cv=cv, n_jobs=-1)
scores

array([0.80851064, 0.75886525, 0.83206107, 0.72992701])

In [55]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='recall', cv=cv, n_jobs=-1)
scores

array([0.72151899, 0.73793103, 0.68987342, 0.74626866])

In [56]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='f1', cv=cv, n_jobs=-1)
scores

array([0.76254181, 0.74825175, 0.75432526, 0.73800738])

In [57]:
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(Train_X_Tfidf,Train_Y) 

RandomForestClassifier(random_state=0)

In [58]:
y_pred_rf= classifier.predict(Test_X_Tfidf)
print("RF Accuracy Score -> ",accuracy_score(y_pred_rf, Test_Y)*100)

RF Accuracy Score ->  80.1418439716312


In [59]:
print("Classification report for - \n{}:\n{}\n".format(
    y_pred_rf, metrics.classification_report(y_pred_rf, Test_Y)))

Classification report for - 
[1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1
 1 1 1 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0
 1 1 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1
 1 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 0 1
 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 1 0 0 0 0 0 1 1 1 0 1
 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 0 1 1 1 1 1 1 0 0 0
 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0
 0 1 0 1 1 0 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1]:
              precision    recall  f1-score   support

           0       0.84      0.75      0.79       142
           1       0.77      0.86      0.81       140

    accuracy                           0.80       282
   macro avg       0.81      0.80      0.80       282
weighted avg       0.81      0.80      0.80       282




# Gradient Boosting

In [60]:
cv = KFold(n_splits=4, random_state=1, shuffle=True)
model = GradientBoostingClassifier()

In [61]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores

array([0.72695035, 0.70212766, 0.71886121, 0.72597865])

In [62]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='precision', cv=cv, n_jobs=-1)
scores

array([0.7826087 , 0.73529412, 0.76296296, 0.71223022])

In [63]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='recall', cv=cv, n_jobs=-1)
scores

array([0.68987342, 0.70344828, 0.65822785, 0.73134328])

In [64]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='f1', cv=cv, n_jobs=-1)
scores

array([0.73220339, 0.69818182, 0.70307167, 0.72592593])

In [65]:
gb_clf2 = GradientBoostingClassifier()
gb_clf2.fit(Train_X_Tfidf,Train_Y)

GradientBoostingClassifier()

In [66]:
y_pred_gb = gb_clf2.predict(Test_X_Tfidf)
print("RF Accuracy Score -> ",accuracy_score(y_pred_gb, Test_Y)*100)

RF Accuracy Score ->  76.24113475177306


In [67]:
print("Classification report for - \n{}:\n{}\n".format(
    y_pred_gb, metrics.classification_report(y_pred_gb, Test_Y)))

Classification report for - 
[1 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1
 1 1 1 0 1 1 0 1 1 0 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0 1 1 0 0
 1 1 1 0 0 0 1 0 1 1 0 1 1 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 1 1 0 0
 1 0 0 0 0 1 1 1 1 1 0 1 1 0 0 0 1 1 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1 0 1
 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0
 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0
 0 1 0 1 0 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1]:
              precision    recall  f1-score   support

           0       0.83      0.70      0.76       149
           1       0.71      0.83      0.77       133

    accuracy                           0.76       282
   macro avg       0.77      0.77      0.76       282
weighted avg       0.77      0.76      0.76       282




# Ada Boost

In [68]:
cv = KFold(n_splits=4, random_state=1, shuffle=True)
model = AdaBoostClassifier()

In [69]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores

array([0.71985816, 0.71276596, 0.71174377, 0.71886121])

In [70]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='precision', cv=cv, n_jobs=-1)
scores

array([0.79259259, 0.71917808, 0.77304965, 0.68707483])

In [71]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='recall', cv=cv, n_jobs=-1)
scores

array([0.67721519, 0.72413793, 0.68987342, 0.75373134])

In [72]:
scores = cross_val_score(model, Train_X_Tfidf, Train_Y, scoring='f1', cv=cv, n_jobs=-1)
scores

array([0.73037543, 0.72164948, 0.72909699, 0.71886121])

In [73]:
# Create adaboost classifer object
adaB = AdaBoostClassifier()
# Train Adaboost Classifer
model = adaB.fit(Train_X_Tfidf,Train_Y)

In [74]:
y_pred_AB = model.predict(Test_X_Tfidf)
print("AB Accuracy Score -> ",accuracy_score(y_pred_AB, Test_Y)*100)

AB Accuracy Score ->  78.01418439716312


In [75]:
print("Classification report for - \n{}:\n{}\n".format(
    y_pred_AB, metrics.classification_report(y_pred_AB, Test_Y)))

Classification report for - 
[1 0 1 1 1 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1
 1 1 1 0 1 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 1 0 1 0 1 1 0 1 0 1 1 1 1
 1 1 1 0 0 0 1 0 1 1 0 1 1 1 0 0 1 0 0 0 0 0 1 1 1 0 1 0 0 0 0 1 0 1 1 0 1
 1 0 0 1 0 1 1 0 1 1 0 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 0 1
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 0 0 1 1 1 1 1 0 0 1 1 1 1 1 0 0 0
 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1 0
 0 0 0 1 0 0 1 0 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1]:
              precision    recall  f1-score   support

           0       0.85      0.71      0.78       150
           1       0.72      0.86      0.78       132

    accuracy                           0.78       282
   macro avg       0.79      0.78      0.78       282
weighted avg       0.79      0.78      0.78       282




In [76]:
acc_NB=accuracy_score(predictions_NB, Test_Y)*100
acc_SVM=accuracy_score(predictions_SVM, Test_Y)*100
acc_KNN=accuracy_score(predictions_KNN, Test_Y)*100
acc_DT=accuracy_score(y_pred_dt, Test_Y)*100
acc_RF=accuracy_score(y_pred_rf, Test_Y)*100
acc_GB=accuracy_score(y_pred_gb, Test_Y)*100
acc_AB=accuracy_score(y_pred_AB, Test_Y)*100

pre_NB=metrics.precision_score(predictions_NB, Test_Y)*100
pre_SVM=metrics.precision_score(predictions_SVM, Test_Y)*100
pre_KNN=metrics.precision_score(predictions_KNN, Test_Y)*100
pre_DT=metrics.precision_score(y_pred_dt, Test_Y)*100
pre_RF=metrics.precision_score(y_pred_rf, Test_Y)*100
pre_GB=metrics.precision_score(y_pred_gb, Test_Y)*100
pre_AB=metrics.precision_score(y_pred_AB, Test_Y)*100

re_NB=metrics.recall_score(predictions_NB, Test_Y)*100
re_SVM=metrics.recall_score(predictions_SVM, Test_Y)*100
re_KNN=metrics.recall_score(predictions_KNN, Test_Y)*100
re_DT=metrics.recall_score(y_pred_dt, Test_Y)*100
re_RF=metrics.recall_score(y_pred_rf, Test_Y)*100
re_GB=metrics.recall_score(y_pred_gb, Test_Y)*100
re_AB=metrics.recall_score(y_pred_AB, Test_Y)*100

f1_NB=metrics.f1_score(predictions_NB, Test_Y)*100
f1_SVM=metrics.f1_score(predictions_SVM, Test_Y)*100
f1_KNN=metrics.f1_score(predictions_KNN, Test_Y)*100
f1_DT=metrics.f1_score(y_pred_dt, Test_Y)*100
f1_RF=metrics.f1_score(y_pred_rf, Test_Y)*100
f1_GB=metrics.f1_score(y_pred_gb, Test_Y)*100
f1_AB=metrics.f1_score(y_pred_AB, Test_Y)*100

In [77]:
results = pd.DataFrame({
    'Model': ['Naive Bayes', 'Support Vector Machine', 'K-nearest neighbors','Decision tree', 
              'Random Forest','Gradient Boosting','Ada Boost'],
    'Accuracy': [acc_NB,acc_SVM,acc_KNN, 
             acc_DT,acc_RF,acc_GB,acc_AB],
    'Precision': [pre_NB,pre_SVM,pre_KNN, 
             pre_DT,pre_RF,pre_GB,pre_AB],
    'Recall':[re_NB,re_SVM,re_KNN, 
             re_DT,re_RF,re_GB,re_AB],
    'F1-Score':[f1_NB,f1_SVM,f1_KNN, 
             f1_DT,f1_RF,f1_GB,f1_AB]})

result_df = results.sort_values(by='Accuracy', ascending=False)
#result_df = result_df.set_index('Score')
result_df = result_df.round(2)

In [78]:
result_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Naive Bayes,80.85,91.03,78.02,84.02
4,Random Forest,80.14,76.92,85.71,81.08
1,Support Vector Machine,79.43,75.64,85.51,80.27
6,Ada Boost,78.01,72.44,85.61,78.47
5,Gradient Boosting,76.24,71.15,83.46,76.82
2,K-nearest neighbors,74.82,83.97,74.01,78.68
3,Decision tree,67.38,65.38,72.86,68.92
