In [95]:
import pandas as pd

# Load the three data splits. Each file is ';'-delimited and is read without a
# header row, so the two column names are supplied explicitly.
read_kwargs = dict(delimiter=';', names=['text', 'target'])
train = pd.read_csv('../data/train.txt', **read_kwargs)
val = pd.read_csv('../data/val.txt', **read_kwargs)
test = pd.read_csv('../data/test.txt', **read_kwargs)

# Stacked frames: train followed by val, and test followed by val.
# The original row indices are kept as-is (no re-indexing).
trainval = pd.concat([train, val])
testval = pd.concat([test, val])

In [96]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Create the vectorizer for the bag-of-words features: bigrams only, English
# stop words removed, ignoring terms that appear in more than 90% of the
# documents or in fewer than 5 documents.
vectorizer = CountVectorizer(max_df=0.9,min_df=5,stop_words='english',ngram_range=(2,2))
# The vocabulary is learned from the combined train+val frame.
BoW = vectorizer.fit_transform(trainval.text)
# vocabulary_ maps each kept term to its column index, so its length is the
# number of features. It replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
print('Number of Features in BoW: ',len(vectorizer.vocabulary_))

# NOTE(review): the training matrix is densified here while X_test below stays
# in the form returned by transform(); both are accepted by the estimators
# used later, but the dense copy costs memory -- confirm it is needed.
X_train = BoW.toarray()
Y_train = trainval.target
# Encode the test split with the already-fitted vocabulary (no refit).
X_test = vectorizer.transform(test.text)
Y_test = test.target


Number of Features in BoW:  1838


## Bagging Classification with Decision Trees

In [97]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import sklearn.metrics as skm

# Bagging ensemble of 10 estimators with a fixed seed. No base estimator is
# passed, so scikit-learn's default is used.
bag = BaggingClassifier(random_state=0, n_estimators=10).fit(X_train, Y_train)

# Mean accuracy on the data the model was fitted on and on the test split.
train_acc, test_acc = (
    bag.score(features, labels)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)

# Predicted test labels, kept for the report in the next cell.
Y_test_pred = bag.predict(X_test)


In [98]:
# Report the bagging model's metrics: train/test accuracy, the per-class
# precision/recall/F1 table, and the confusion matrix.

print('---------------- Bagging Statistics ----------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Take the label order from the fitted model and pass it to confusion_matrix
# explicitly, so the row/column captions always match the matrix instead of
# relying on a hard-coded list. Rows are true labels, columns are predictions.
class_labels = bag.classes_
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=class_labels),
             columns=class_labels,
             index=class_labels)


---------------- Bagging Statistics ----------------
Train Accuracy: 0.8211
Test Accuracy: 0.6855
              precision    recall  f1-score   support

       anger       0.74      0.55      0.63       275
        fear       0.78      0.62      0.69       224
         joy       0.65      0.83      0.73       695
        love       0.52      0.41      0.46       159
     sadness       0.74      0.71      0.73       581
    surprise       0.52      0.38      0.44        66

    accuracy                           0.69      2000
   macro avg       0.66      0.58      0.61      2000
weighted avg       0.69      0.69      0.68      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,150,6,62,10,45,2
fear,9,140,35,4,26,10
joy,11,7,576,40,52,9
love,6,4,69,65,15,0
sadness,26,11,122,5,415,2
surprise,1,11,22,2,5,25


## Random Forest with Decision Trees

In [99]:
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as skm

# Random forest of 10 trees with a fixed seed.
rf = RandomForestClassifier(random_state=0, n_estimators=10).fit(X_train, Y_train)

# Mean accuracy on the data the model was fitted on and on the test split.
train_acc, test_acc = (
    rf.score(features, labels)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)

# Predicted test labels, kept for the report in the next cell.
Y_test_pred = rf.predict(X_test)


In [100]:
# Report the random forest's metrics: train/test accuracy, the per-class
# precision/recall/F1 table, and the confusion matrix.
print('------------- Random Forest Statistics -------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Take the label order from the fitted model and pass it to confusion_matrix
# explicitly, so the row/column captions always match the matrix instead of
# relying on a hard-coded list. Rows are true labels, columns are predictions.
class_labels = rf.classes_
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=class_labels),
             columns=class_labels,
             index=class_labels)

------------- Random Forest Statistics -------------
Train Accuracy: 0.8211
Test Accuracy: 0.688
              precision    recall  f1-score   support

       anger       0.70      0.55      0.61       275
        fear       0.79      0.63      0.70       224
         joy       0.65      0.84      0.73       695
        love       0.52      0.41      0.46       159
     sadness       0.77      0.71      0.74       581
    surprise       0.51      0.36      0.42        66

    accuracy                           0.69      2000
   macro avg       0.66      0.58      0.61      2000
weighted avg       0.69      0.69      0.68      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,150,8,64,10,42,1
fear,12,141,38,3,19,11
joy,16,4,582,39,45,9
love,8,3,70,65,12,1
sadness,27,10,123,6,414,1
surprise,1,12,22,1,6,24


## AdaBoost Classification with Decision Trees

In [101]:
from sklearn.ensemble import AdaBoostClassifier
import sklearn.metrics as skm

# AdaBoost ensemble of 100 boosting rounds with a fixed seed. No base
# estimator is passed, so scikit-learn's default is used.
ada = AdaBoostClassifier(random_state=0, n_estimators=100).fit(X_train, Y_train)

# Mean accuracy on the data the model was fitted on and on the test split.
train_acc, test_acc = (
    ada.score(features, labels)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)

# Predicted test labels, kept for the report in the next cell.
Y_test_pred = ada.predict(X_test)


In [102]:

# Report the AdaBoost model's metrics: train/test accuracy, the per-class
# precision/recall/F1 table, and the confusion matrix.
print('---------------- AdaBoost Statistics ----------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Take the label order from the fitted model and pass it to confusion_matrix
# explicitly, so the row/column captions always match the matrix instead of
# relying on a hard-coded list. Rows are true labels, columns are predictions.
class_labels = ada.classes_
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=class_labels),
             columns=class_labels,
             index=class_labels)


---------------- AdaBoost Statistics ----------------
Train Accuracy: 0.4364
Test Accuracy: 0.4255
              precision    recall  f1-score   support

       anger       0.97      0.13      0.23       275
        fear       0.87      0.33      0.47       224
         joy       0.38      1.00      0.55       695
        love       0.96      0.17      0.29       159
     sadness       0.50      0.00      0.01       581
    surprise       0.76      0.29      0.42        66

    accuracy                           0.43      2000
   macro avg       0.74      0.32      0.33      2000
weighted avg       0.61      0.43      0.32      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,36,0,238,0,0,1
fear,0,73,145,0,2,4
joy,0,0,694,1,0,0
love,1,0,131,27,0,0
sadness,0,1,577,0,2,1
surprise,0,10,37,0,0,19
