# Loading data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('sentiment_5_class.csv')
df.head()

Unnamed: 0,Phrase,Sentiment
0,injects just enough freshness into the proceed...,3
1,that,2
2,never plays as dramatic even when dramatic thi...,0
3,"None of this is very original , and it is n't ...",0
4,", Madonna gives her best performance since Abe...",3


In [3]:
X = df.Phrase.tolist()
y = df.Sentiment.tolist()

# Train-Test Split

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify = y)
print(len(X_train), len(X_test))

14711 3678


In [5]:
#from sklearn.feature_extraction.text import CountVectorizer

#c_vectorizer = CountVectorizer()
#c_vectorizer.fit(X_train)
#c_vectorizer.get_feature_names()

#c_vectorizer = CountVectorizer()
#c_vectorizer.fit(X_train)
#c_vectorizer.get_feature_names()

#c_X_train_v.toarray()

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

t_vectorizer = TfidfVectorizer()
t_vectorizer.fit(X_train)
#t_vectorizer.get_feature_names()

X_train_v = t_vectorizer.transform(X_train)
X_test_v = t_vectorizer.transform(X_test)
X_train_v.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Model(out of the box)

In [7]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Instantiate a classification-tree 'dt'
dt = DecisionTreeClassifier(max_depth= 15 ,
                            min_samples_leaf= 2 ,
                            random_state=1)
# Instantiate a BaggingClassifier 'bc'; 
bc = AdaBoostClassifier(base_estimator=dt, 
                       n_estimators= 300)
# Fit 'bc' to the traing set
bc.fit(X_train_v, y_train)


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=3,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=2,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [8]:
# Predict the test set labels
y_pred = bc.predict(X_test_v)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.41      0.51       247
           1       0.52      0.41      0.46       291
           2       0.52      0.21      0.30       469
           3       0.57      0.82      0.67      1759
           4       0.60      0.39      0.48       912

    accuracy                           0.58      3678
   macro avg       0.57      0.45      0.48      3678
weighted avg       0.58      0.58      0.55      3678



# Effect of max depth

In [9]:
from sklearn.metrics import accuracy_score
for max_depth in (1, 3, 5, 7, 9, 11, 15):
    model = DecisionTreeClassifier(random_state = 1, max_depth=max_depth)
    model.fit(X_train_v, y_train)
    train_preds = model.predict(X_train_v)
    test_preds = model.predict(X_test_v)
    #print(f'Max Depth: {max_depth}; Training Accuracy: {accuracy_score(y_train, train_preds)} Test Accuracy: {accuracy_score(y_test, test_preds)})
    print('Max Depth: ', max_depth)
    print('Training Accuracy: ', accuracy_score(y_train, train_preds))
    print('Test Accuracy: ', accuracy_score(y_test, test_preds))

Max Depth:  1
Training Accuracy:  0.4780776289851132
Test Accuracy:  0.4782490483958673
Max Depth:  3
Training Accuracy:  0.48881789137380194
Test Accuracy:  0.48450244698205547
Max Depth:  5
Training Accuracy:  0.5004418462375093
Test Accuracy:  0.49157150625339857
Max Depth:  7
Training Accuracy:  0.5116579430358236
Test Accuracy:  0.49429037520391517
Max Depth:  9
Training Accuracy:  0.5206987968187071
Test Accuracy:  0.49782490483958675
Max Depth:  11
Training Accuracy:  0.531167153830467
Test Accuracy:  0.5029907558455683
Max Depth:  15
Training Accuracy:  0.551492080755897
Test Accuracy:  0.5065252854812398


# Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV
grid_params = {
    'n_estimators': ( 5,15,100),
    
}

from sklearn.metrics import make_scorer, f1_score

scorer = make_scorer(f1_score, average = 'micro')
clf = GridSearchCV(AdaBoostClassifier(), grid_params, scoring = scorer)
clf.fit(X_train_v, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='deprecated', n_jobs=None,
             param_grid={'n_estimators': (5, 15, 100)}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False,
             scoring=make_scorer(f1_score, average=micro), verbose=0)

In [11]:
print(clf.best_score_, clf.best_params_)

0.5187274046447422 {'n_estimators': 100}


# Best Model

In [12]:
model = AdaBoostClassifier(random_state = 1, n_estimators = 100)
model.fit(X_train_v, y_train)
y_pred = model.predict(X_test_v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.23      0.34       247
           1       0.58      0.09      0.15       291
           2       0.52      0.09      0.15       469
           3       0.51      0.87      0.64      1759
           4       0.56      0.29      0.38       912

    accuracy                           0.52      3678
   macro avg       0.57      0.31      0.33      3678
weighted avg       0.54      0.52      0.46      3678

