# Loading data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
df = pd.read_csv('sentiment_5_class.csv')
df.head()

Unnamed: 0,Phrase,Sentiment
0,injects just enough freshness into the proceed...,3
1,that,2
2,never plays as dramatic even when dramatic thi...,0
3,"None of this is very original , and it is n't ...",0
4,", Madonna gives her best performance since Abe...",3


In [15]:
X = df.Phrase.tolist()
y = df.Sentiment.tolist()

# Train-Test Split

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify = y)
print(len(X_train), len(X_test))

14711 3678


In [4]:
X_train.shape

(242, 13)

In [17]:
#from sklearn.feature_extraction.text import CountVectorizer

#c_vectorizer = CountVectorizer()
#c_vectorizer.fit(X_train)
#c_vectorizer.get_feature_names()

#c_vectorizer = CountVectorizer()
#c_vectorizer.fit(X_train)
#c_vectorizer.get_feature_names()

#c_X_train_v.toarray()

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

t_vectorizer = TfidfVectorizer()
t_vectorizer.fit(X_train)
#t_vectorizer.get_feature_names()

X_train_v = t_vectorizer.transform(X_train)
X_test_v = t_vectorizer.transform(X_test)
X_train_v.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Model(out of the box)

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

model = DecisionTreeClassifier(random_state=1)
model.fit(X_train_v, y_train)
preds = model.predict(X_test_v)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.64      0.51      0.57       247
           1       0.57      0.44      0.49       291
           2       0.44      0.65      0.52       469
           3       0.65      0.65      0.65      1759
           4       0.59      0.51      0.55       912

    accuracy                           0.59      3678
   macro avg       0.58      0.55      0.56      3678
weighted avg       0.60      0.59      0.59      3678



# Effect of max depth

In [20]:
from sklearn.metrics import accuracy_score
for max_depth in (1, 3, 5, 7, 9):
    model = DecisionTreeClassifier(random_state = 1, max_depth=max_depth)
    model.fit(X_train_v, y_train)
    train_preds = model.predict(X_train_v)
    test_preds = model.predict(X_test_v)
    #print(f'Max Depth: {max_depth}; Training Accuracy: {accuracy_score(y_train, train_preds)} Test Accuracy: {accuracy_score(y_test, test_preds)})
    print('Max Depth: ', max_depth)
    print('Training Accuracy: ', accuracy_score(y_train, train_preds))
    print('Test Accuracy: ', accuracy_score(y_test, test_preds))

Max Depth:  1
Training Accuracy:  0.4780776289851132
Test Accuracy:  0.4782490483958673
Max Depth:  3
Training Accuracy:  0.48881789137380194
Test Accuracy:  0.48450244698205547
Max Depth:  5
Training Accuracy:  0.5004418462375093
Test Accuracy:  0.49157150625339857
Max Depth:  7
Training Accuracy:  0.5116579430358236
Test Accuracy:  0.49429037520391517
Max Depth:  9
Training Accuracy:  0.5206987968187071
Test Accuracy:  0.49782490483958675


# Grid Search

In [28]:
from sklearn.model_selection import GridSearchCV
grid_params = {
    'max_depth': (3, 5, 7, 9, 11, 13,15,19),
    'min_samples_split': (2, 4, 6, 8, 10)
}

from sklearn.metrics import make_scorer, f1_score

scorer = make_scorer(f1_score, average = 'micro')
clf = GridSearchCV(DecisionTreeClassifier(), grid_params, scoring = scorer)
clf.fit(X_train_v, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': (3, 5, 7, 9, 11, 13, 15, 19),
       

In [29]:
print(clf.best_score_, clf.best_params_)

0.5183873150244402 {'max_depth': 19, 'min_samples_split': 2}


# Best Model

In [30]:
model = DecisionTreeClassifier(random_state = 1, max_depth=13, min_samples_split = 6)
model.fit(X_train_v, y_train)
y_pred = model.predict(X_test_v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.05      0.09       247
           1       0.66      0.07      0.12       291
           2       0.45      0.06      0.11       469
           3       0.50      0.96      0.65      1759
           4       0.69      0.13      0.22       912

    accuracy                           0.51      3678
   macro avg       0.58      0.25      0.24      3678
weighted avg       0.56      0.51      0.40      3678



## Visualizing the tree

In [31]:
from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(model, out_file='tree.dot', 
                feature_names = X_train_v.columns,
                class_names = ['0', '1'],
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot','-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')

AttributeError: columns not found