# Loading data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
df = pd.read_csv('sentiment_5_class.csv')
df.head()

Unnamed: 0,Phrase,Sentiment
0,injects just enough freshness into the proceed...,3
1,that,2
2,never plays as dramatic even when dramatic thi...,0
3,"None of this is very original , and it is n't ...",0
4,", Madonna gives her best performance since Abe...",3


In [15]:
X = df.Phrase.tolist()
y = df.Sentiment.tolist()

# Train-Test Split

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify = y)
print(len(X_train), len(X_test))

14711 3678


In [4]:
X_train.shape

(242, 13)

In [17]:
#from sklearn.feature_extraction.text import CountVectorizer

#c_vectorizer = CountVectorizer()
#c_vectorizer.fit(X_train)
#c_vectorizer.get_feature_names()

#c_vectorizer = CountVectorizer()
#c_vectorizer.fit(X_train)
#c_vectorizer.get_feature_names()

#c_X_train_v.toarray()

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

t_vectorizer = TfidfVectorizer()
t_vectorizer.fit(X_train)
#t_vectorizer.get_feature_names()

X_train_v = t_vectorizer.transform(X_train)
X_test_v = t_vectorizer.transform(X_test)
X_train_v.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## One-Hot Encoding

In [6]:
cat_columns = ['cp', 'exang', 'slope', 'thal']
num_columns = [c for c in X_train.columns if c not in cat_columns]

from sklearn.preprocessing import OneHotEncoder

#Create an OneHotEncoder instance
encoder = OneHotEncoder(handle_unknown = 'ignore')

#Fit on categorical columns
encoder.fit(X_train[cat_columns])

#Transform on training data
X_train_cat_encoded = encoder.transform(X_train[cat_columns])

column_names = encoder.get_feature_names(input_features = cat_columns)
#print(X_train_cat_encoded.toarray())
print(X_train_cat_encoded.todense().shape)
print(column_names)

X_train_encoded_df = pd.DataFrame(X_train_cat_encoded.todense(),
                                  columns = column_names,
                                  index = X_train.index)

#X_train_encoded_df.head()

X_train_encoded = pd.concat([X_train[num_columns], X_train_encoded_df], axis = 1)
#X_train_encoded.head()

(242, 13)
['cp_0' 'cp_1' 'cp_2' 'cp_3' 'exang_0' 'exang_1' 'slope_0' 'slope_1'
 'slope_2' 'thal_0' 'thal_1' 'thal_2' 'thal_3']


In [7]:
#Fit on categorical columns
encoder.fit(X_test[cat_columns])

#Transform on training data
X_test_cat_encoded = encoder.transform(X_test[cat_columns])

column_names = encoder.get_feature_names(input_features = cat_columns)
#print(X_test_cat_encoded.toarray())
print(X_test_cat_encoded.todense().shape)
print(column_names)

X_test_encoded_df = pd.DataFrame(X_test_cat_encoded.todense(),
                                  columns = column_names,
                                  index = X_test.index)

#print(X_test_encoded_df.head())
X_test_encoded = pd.concat([X_test[num_columns], X_test_encoded_df], axis = 1)
#X_test_encoded.head()

(61, 13)
['cp_0' 'cp_1' 'cp_2' 'cp_3' 'exang_0' 'exang_1' 'slope_0' 'slope_1'
 'slope_2' 'thal_0' 'thal_1' 'thal_2' 'thal_3']


# Model(out of the box)

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

model = DecisionTreeClassifier(random_state=1)
model.fit(X_train_encoded, y_train)
preds = model.predict(X_test_encoded)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.58      0.54      0.56        28
           1       0.63      0.67      0.65        33

    accuracy                           0.61        61
   macro avg       0.60      0.60      0.60        61
weighted avg       0.60      0.61      0.61        61



# Effect of max depth

In [9]:
from sklearn.metrics import accuracy_score
for max_depth in (1, 3, 5, 7, 9):
    model = DecisionTreeClassifier(random_state = 1, max_depth=max_depth)
    model.fit(X_train_encoded, y_train)
    train_preds = model.predict(X_train_encoded)
    test_preds = model.predict(X_test_encoded)
    #print(f'Max Depth: {max_depth}; Training Accuracy: {accuracy_score(y_train, train_preds)} Test Accuracy: {accuracy_score(y_test, test_preds)})
    print('Max Depth: ', max_depth)
    print('Training Accuracy: ', accuracy_score(y_train, train_preds))
    print('Test Accuracy: ', accuracy_score(y_test, test_preds))

Max Depth:  1
Training Accuracy:  0.7768595041322314
Test Accuracy:  0.7213114754098361
Max Depth:  3
Training Accuracy:  0.8388429752066116
Test Accuracy:  0.7377049180327869
Max Depth:  5
Training Accuracy:  0.9256198347107438
Test Accuracy:  0.7049180327868853
Max Depth:  7
Training Accuracy:  0.9917355371900827
Test Accuracy:  0.6065573770491803
Max Depth:  9
Training Accuracy:  1.0
Test Accuracy:  0.6065573770491803


# Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV
grid_params = {
    'max_depth': (3, 5, 7, 9, 11, 13),
    'min_samples_split': (2, 4, 6, 8, 10)
}

from sklearn.metrics import make_scorer, f1_score

scorer = make_scorer(f1_score, average = 'micro')
clf = GridSearchCV(DecisionTreeClassifier(), grid_params, scoring = scorer)
clf.fit(X_train_encoded, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': (3, 5, 7, 9, 11, 13),
               

In [11]:
print(clf.best_score_, clf.best_params_)

0.7728741496598639 {'max_depth': 3, 'min_samples_split': 4}


# Best Model

In [None]:
model = DecisionTreeClassifier(random_state = 1, max_depth=3, min_samples_split = 2)
model.fit(X_train_encoded, y_train)
y_pred = model.predict(X_test_encoded)
print(classification_report(y_test, y_pred))

## Visualizing the tree

In [None]:
from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(model, out_file='tree.dot', 
                feature_names = X_train_encoded.columns,
                class_names = ['0', '1'],
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot','-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')