# Loading data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('sentiment_5_class.csv')
df.head()

Unnamed: 0,Phrase,Sentiment
0,injects just enough freshness into the proceed...,3
1,that,2
2,never plays as dramatic even when dramatic thi...,0
3,"None of this is very original , and it is n't ...",0
4,", Madonna gives her best performance since Abe...",3


In [3]:
X = df.Phrase.tolist()
y = df.Sentiment.tolist()

# Train-Test Split

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify = y)
print(len(X_train), len(X_test))

14711 3678


In [5]:
X_train.shape

AttributeError: 'list' object has no attribute 'shape'

In [None]:
#from sklearn.feature_extraction.text import CountVectorizer

#c_vectorizer = CountVectorizer()
#c_vectorizer.fit(X_train)
#c_vectorizer.get_feature_names()

#c_vectorizer = CountVectorizer()
#c_vectorizer.fit(X_train)
#c_vectorizer.get_feature_names()

#c_X_train_v.toarray()

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

t_vectorizer = TfidfVectorizer()
t_vectorizer.fit(X_train)
#t_vectorizer.get_feature_names()

X_train_v = t_vectorizer.transform(X_train)
X_test_v = t_vectorizer.transform(X_test)
X_train_v.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Model(out of the box)

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Instantiate a GradientBoostingClassifier 'gb'; 
gb = GradientBoostingClassifier(n_estimators= 300)
# Fit 'gb' to the traing set
gb.fit(X_train_v, y_train)
# Predict the test set labels
y_pred = gb.predict(X_test_v)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.40      0.54       247
           1       0.69      0.24      0.36       291
           2       0.66      0.20      0.30       469
           3       0.57      0.91      0.70      1759
           4       0.72      0.39      0.51       912

    accuracy                           0.60      3678
   macro avg       0.70      0.43      0.48      3678
weighted avg       0.64      0.60      0.56      3678



In [8]:
# Evaluate test set accuracy
test_accuracy = accuracy_score(y_test, y_pred)
# Print test set accuracy
print( 'Test set accuracy: {:.3f}' .format(test_accuracy))

Test set accuracy: 0.604


# Grid Search

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
grid_params = {
    'max_depth': (3, 5, 7),
    'min_samples_split': (2, 4, 6)
}


scorer = make_scorer(f1_score, average = 'micro')
clf = GridSearchCV(GradientBoostingClassifier(), grid_params, scoring = scorer)
clf.fit(X_train_v, y_train)

KeyboardInterrupt: 

In [10]:
print(clf.best_score_, clf.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

# Best Model

In [30]:
model = GradientBoostingClassifier(random_state = 1, max_depth=13, min_samples_split = 6)
model.fit(X_train_v, y_train)
y_pred = model.predict(X_test_v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.05      0.09       247
           1       0.66      0.07      0.12       291
           2       0.45      0.06      0.11       469
           3       0.50      0.96      0.65      1759
           4       0.69      0.13      0.22       912

    accuracy                           0.51      3678
   macro avg       0.58      0.25      0.24      3678
weighted avg       0.56      0.51      0.40      3678



In [None]:
# Evaluate test set accuracy
test_accuracy = accuracy_score(y_test, y_pred)
# Print test set accuracy
print( 'Test set accuracy: {:.3f}' .format(test_accuracy))