In [2]:
import pandas as pd

# Read the three dataset splits. Every file is ';'-delimited and column names
# are supplied via `names=`, so the first line of each file is read as data.
test = pd.read_csv('../data/test.txt', delimiter=';', names=['text', 'target'])
train = pd.read_csv('../data/train.txt', delimiter=';',
                    names=['text', 'target'])
val = pd.read_csv('../data/val.txt', delimiter=';', names=['text', 'target'])

# Combined splits. ignore_index=True gives each combined frame a fresh
# 0..n-1 index; without it the row labels of the individual files repeat,
# so a label lookup such as trainval.loc[0] would return more than one row.
trainval = pd.concat([train, val], ignore_index=True)
testval = pd.concat([test, val], ignore_index=True)

In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features fitted on the train+validation text only; the test split is
# merely transformed, so it does not influence the vocabulary or IDF weights.
# max_df=0.9 / min_df=5 drop terms found in more than 90% of the documents or
# in fewer than 5 documents; English stop words are removed as well.
vectorizer = TfidfVectorizer(max_df=0.9, min_df=5, stop_words='english')
BoW = vectorizer.fit_transform(trainval.text)
# vocabulary_ maps each retained term to its column, so its length is the
# feature count. It is used instead of get_feature_names(), which newer
# scikit-learn releases no longer provide.
print('Number of Features: ', len(vectorizer.vocabulary_))

# Keep the training matrix sparse, like X_test below: this avoids allocating a
# dense (documents x features) array. The estimators fitted in this notebook
# are documented to accept sparse input.
X_train = BoW
Y_train = trainval.target
X_test = vectorizer.transform(test.text)
Y_test = test.target


Number of Features:  3398




## Bagging Classification with Decision Trees

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import sklearn.metrics as skm

# Bagging ensemble of 10 estimators with a fixed seed for reproducibility.
# No base estimator is passed, so scikit-learn's documented default
# (a decision tree) is used.
bag = BaggingClassifier(n_estimators=10, random_state=0)
bag.fit(X_train, Y_train)
Y_test_pred = bag.predict(X_test)

train_acc = bag.score(X_train, Y_train)
# Score the predictions already computed above instead of calling
# bag.score(X_test, Y_test), which would predict the whole test set again.
test_acc = skm.accuracy_score(Y_test, Y_test_pred)


KeyboardInterrupt: 

In [None]:
# Report accuracy, per-class metrics and the confusion matrix for bagging.

# One list drives both the matrix ordering and the axis names, so the row and
# column captions cannot drift from the order confusion_matrix actually uses.
emotion_labels = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('---------------- Bagging Statistics ----------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))
# Rows are the true classes, columns the predicted classes.
pd.DataFrame(
    skm.confusion_matrix(Y_test, Y_test_pred, labels=emotion_labels),
    columns=emotion_labels,
    index=emotion_labels,
)


---------------- Bagging Statistics ----------------
Train Accuracy: 0.9849
Test Accuracy: 0.87
              precision    recall  f1-score   support

       anger       0.89      0.90      0.89       275
        fear       0.80      0.87      0.83       224
         joy       0.89      0.90      0.90       695
        love       0.75      0.72      0.73       159
     sadness       0.92      0.89      0.91       581
    surprise       0.61      0.58      0.59        66

    accuracy                           0.87      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.87      0.87      0.87      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,248,10,5,0,10,2
fear,9,194,2,2,10,7
joy,3,8,627,29,20,8
love,2,0,38,114,3,2
sadness,17,12,21,7,519,5
surprise,1,17,8,0,2,38


## Random Forest with Decision Trees

In [4]:
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as skm

# Random forest of 10 trees with a fixed seed for reproducibility.
rf = RandomForestClassifier(n_estimators=10, random_state=0)
rf.fit(X_train, Y_train)
Y_test_pred = rf.predict(X_test)

train_acc = rf.score(X_train, Y_train)
# Score the predictions already computed above instead of calling
# rf.score(X_test, Y_test), which would predict the whole test set again.
test_acc = skm.accuracy_score(Y_test, Y_test_pred)


In [5]:
# Report accuracy, per-class metrics and the confusion matrix for the forest.

# One list drives both the matrix ordering and the axis names, so the row and
# column captions cannot drift from the order confusion_matrix actually uses.
emotion_labels = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('------------- Random Forest Statistics -------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))
# Rows are the true classes, columns the predicted classes.
pd.DataFrame(
    skm.confusion_matrix(Y_test, Y_test_pred, labels=emotion_labels),
    columns=emotion_labels,
    index=emotion_labels,
)

------------- Random Forest Statistics -------------
Train Accuracy: 0.9912
Test Accuracy: 0.873
              precision    recall  f1-score   support

       anger       0.88      0.90      0.89       275
        fear       0.83      0.90      0.86       224
         joy       0.88      0.91      0.89       695
        love       0.75      0.74      0.74       159
     sadness       0.95      0.87      0.91       581
    surprise       0.61      0.65      0.63        66

    accuracy                           0.87      2000
   macro avg       0.82      0.83      0.82      2000
weighted avg       0.88      0.87      0.87      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,248,12,7,1,6,1
fear,5,201,2,1,5,10
joy,6,6,631,31,12,9
love,3,0,36,118,0,2
sadness,20,11,33,7,505,5
surprise,1,12,8,0,2,43


## AdaBoost Classification with Decision Trees

In [7]:
from sklearn.ensemble import AdaBoostClassifier
import sklearn.metrics as skm

# AdaBoost with 100 boosting rounds and a fixed seed for reproducibility.
# NOTE(review): no base estimator is passed, so scikit-learn's default weak
# learner is used (documented as a depth-1 decision tree). The low accuracies
# recorded for this model may stem from that -- confirm before drawing
# conclusions from the comparison with the other ensembles.
ada = AdaBoostClassifier(n_estimators=100, random_state=0)
ada.fit(X_train, Y_train)
Y_test_pred = ada.predict(X_test)

train_acc = ada.score(X_train, Y_train)
# Score the predictions already computed above instead of calling
# ada.score(X_test, Y_test), which would predict the whole test set again.
test_acc = skm.accuracy_score(Y_test, Y_test_pred)


In [8]:

# Report accuracy, per-class metrics and the confusion matrix for AdaBoost.

# One list drives both the matrix ordering and the axis names, so the row and
# column captions cannot drift from the order confusion_matrix actually uses.
emotion_labels = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('---------------- AdaBoost Statistics ----------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))
# Rows are the true classes, columns the predicted classes.
pd.DataFrame(
    skm.confusion_matrix(Y_test, Y_test_pred, labels=emotion_labels),
    columns=emotion_labels,
    index=emotion_labels,
)


---------------- AdaBoost Statistics ----------------
Train Accuracy: 0.3683
Test Accuracy: 0.379
              precision    recall  f1-score   support

       anger       0.55      0.02      0.04       275
        fear       0.63      0.20      0.30       224
         joy       0.36      0.95      0.52       695
        love       0.46      0.12      0.19       159
     sadness       0.49      0.03      0.06       581
    surprise       0.67      0.12      0.21        66

    accuracy                           0.38      2000
   macro avg       0.53      0.24      0.22      2000
weighted avg       0.47      0.38      0.26      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,6,1,266,0,2,0
fear,0,44,179,0,1,0
joy,1,1,662,20,7,4
love,1,1,138,19,0,0
sadness,0,4,556,2,19,0
surprise,3,19,26,0,10,8
