In [1]:
import pandas as pd

# Read every split of the dataset. Each file is ';'-delimited and the column
# names are supplied here rather than taken from the file.
_columns = ['text', 'target']


def _load_split(path):
    """Read one ';'-delimited split file into a (text, target) DataFrame."""
    return pd.read_csv(path, delimiter=';', names=_columns)


test = _load_split('../data/test.txt')
train = _load_split('../data/train.txt')
val = _load_split('../data/val.txt')

# Combined frames: the validation rows are appended after the train / test
# rows; the original row indices of each split are kept as-is.
trainval = pd.concat([train, val])
testval = pd.concat([test, val])

In [78]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features from the combined train + validation text.
# max_df=0.9 drops terms found in more than 90% of documents, min_df=5 drops
# terms found in fewer than 5 documents, and the built-in English stop-word
# list is removed as well.
vectorizer = TfidfVectorizer(max_df=0.9, min_df=5, stop_words='english')
BoW = vectorizer.fit_transform(trainval.text)

# One column per retained term, so the matrix width is the feature count.
# Reading it from the shape avoids vectorizer.get_feature_names(), which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
print('Number of Features: ', BoW.shape[1])

# NOTE(review): the training matrix is densified while the test matrix stays
# sparse -- confirm the dense copy still fits in memory on larger corpora.
X_train = BoW.toarray()
Y_train = trainval.target

# Only transform (never fit) the test text so it reuses the vocabulary and
# IDF weights learned above.
X_test = vectorizer.transform(test.text)
Y_test = test.target


Number of Features in BoW:  3398


## Bagging Classification with Decision Trees (DT)

In [79]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import sklearn.metrics as skm

# Fit a 10-member bagging ensemble; random_state=0 makes reruns repeatable.
# No base estimator is passed, so scikit-learn's default one is used.
bag = BaggingClassifier(n_estimators=10, random_state=0).fit(X_train, Y_train)

# Accuracy on the data the ensemble was fitted on.
train_acc = bag.score(X_train, Y_train)

# Predict the held-out set once and derive its accuracy from those same
# predictions (this is the quantity bag.score(X_test, Y_test) computes).
Y_test_pred = bag.predict(X_test)
test_acc = skm.accuracy_score(Y_test, Y_test_pred)


In [80]:
# get metrics

# Class names in sorted order, which is the order scikit-learn would infer by
# default. Passing them explicitly keeps the confusion matrix 6x6 and aligned
# with the row/column names even if a class is missing from both Y_test and
# Y_test_pred (otherwise the DataFrame constructor would raise on a shape
# mismatch).
emotion_labels = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('---------------- Bagging Statistics ----------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Rows are the true classes, columns are the predicted classes. Left as the
# last expression so the notebook renders it as the cell output.
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=emotion_labels),
             columns=emotion_labels,
             index=emotion_labels)


---------------- Bagging Statistics ----------------
Train Accuracy: 0.9864
Test Accuracy: 0.862
              precision    recall  f1-score   support

       anger       0.87      0.92      0.89       275
        fear       0.83      0.85      0.84       224
         joy       0.92      0.84      0.88       695
        love       0.71      0.75      0.73       159
     sadness       0.89      0.90      0.89       581
    surprise       0.60      0.76      0.67        66

    accuracy                           0.86      2000
   macro avg       0.80      0.84      0.82      2000
weighted avg       0.87      0.86      0.86      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,252,10,2,1,8,2
fear,7,190,0,2,10,15
joy,9,5,587,38,46,10
love,2,0,33,120,2,2
sadness,20,14,9,8,525,5
surprise,1,9,4,0,2,50


## Random Forest with DT

In [81]:
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as skm

# Fit a random forest of 10 trees; random_state=0 makes reruns repeatable.
rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_train, Y_train)

# Accuracy on the data the forest was fitted on.
train_acc = rf.score(X_train, Y_train)

# Predict the held-out set once and derive its accuracy from those same
# predictions (this is the quantity rf.score(X_test, Y_test) computes).
Y_test_pred = rf.predict(X_test)
test_acc = skm.accuracy_score(Y_test, Y_test_pred)


In [82]:
# Class names in sorted order, which is the order scikit-learn would infer by
# default. Passing them explicitly keeps the confusion matrix 6x6 and aligned
# with the row/column names even if a class is missing from both Y_test and
# Y_test_pred (otherwise the DataFrame constructor would raise on a shape
# mismatch).
emotion_labels = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('------------- Random Forest Statistics -------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Rows are the true classes, columns are the predicted classes. Left as the
# last expression so the notebook renders it as the cell output.
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=emotion_labels),
             columns=emotion_labels,
             index=emotion_labels)

------------- Random Forest Statistics -------------
Train Accuracy: 0.991
Test Accuracy: 0.877
              precision    recall  f1-score   support

       anger       0.89      0.90      0.89       275
        fear       0.84      0.85      0.85       224
         joy       0.91      0.88      0.90       695
        love       0.73      0.75      0.74       159
     sadness       0.92      0.92      0.92       581
    surprise       0.62      0.73      0.67        66

    accuracy                           0.88      2000
   macro avg       0.82      0.84      0.83      2000
weighted avg       0.88      0.88      0.88      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,247,11,5,1,10,1
fear,8,191,0,1,10,14
joy,5,6,614,35,24,11
love,2,0,35,119,1,2
sadness,15,10,13,6,535,2
surprise,1,9,6,0,2,48


## AdaBoost Classification with DT

In [83]:
from sklearn.ensemble import AdaBoostClassifier
import sklearn.metrics as skm

# Fit AdaBoost with 100 boosting rounds; random_state=0 makes reruns repeatable.
# NOTE(review): no base estimator is passed, so the library default weak
# learner is used (documented by scikit-learn as a depth-1 decision tree) --
# confirm this is intended if the resulting accuracy looks low.
ada = AdaBoostClassifier(n_estimators=100, random_state=0).fit(X_train, Y_train)

# Accuracy on the data the ensemble was fitted on.
train_acc = ada.score(X_train, Y_train)

# Predict the held-out set once and derive its accuracy from those same
# predictions (this is the quantity ada.score(X_test, Y_test) computes).
Y_test_pred = ada.predict(X_test)
test_acc = skm.accuracy_score(Y_test, Y_test_pred)


In [84]:

# Class names in sorted order, which is the order scikit-learn would infer by
# default. Passing them explicitly keeps the confusion matrix 6x6 and aligned
# with the row/column names even if a class is missing from both Y_test and
# Y_test_pred (otherwise the DataFrame constructor would raise on a shape
# mismatch).
emotion_labels = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('---------------- AdaBoost Statistics ----------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Rows are the true classes, columns are the predicted classes. Left as the
# last expression so the notebook renders it as the cell output.
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=emotion_labels),
             columns=emotion_labels,
             index=emotion_labels)


---------------- AdaBoost Statistics ----------------
Train Accuracy: 0.3651
Test Accuracy: 0.3745
              precision    recall  f1-score   support

       anger       0.20      0.06      0.09       275
        fear       0.57      0.22      0.32       224
         joy       0.36      0.90      0.52       695
        love       0.54      0.24      0.33       159
     sadness       0.00      0.00      0.00       581
    surprise       0.70      0.29      0.41        66

    accuracy                           0.37      2000
   macro avg       0.40      0.28      0.28      2000
weighted avg       0.28      0.37      0.27      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,16,6,252,0,0,1
fear,11,50,162,0,0,1
joy,32,1,626,31,1,4
love,8,2,111,38,0,0
sadness,15,5,557,2,0,2
surprise,0,23,15,0,9,19
