In [23]:
import pandas as pd

# Load the three ';'-separated splits. Because `names` is given without a
# `header` argument, pandas treats the first line of each file as data.
train = pd.read_csv('../data/train.txt', sep=';', names=['text', 'target'])
val = pd.read_csv('../data/val.txt', sep=';', names=['text', 'target'])
test = pd.read_csv('../data/test.txt', sep=';', names=['text', 'target'])

# Stack the validation rows beneath the train / test rows. The original row
# labels are kept, so index values repeat in the combined frames.
trainval = pd.concat([train, val], axis=0)
testval = pd.concat([test, val], axis=0)

In [55]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Create the bag-of-words vectorizer: drop English stop words, terms that
# appear in more than 90% of documents, and terms seen in fewer than 10.
vectorizer = CountVectorizer(max_df=0.9, min_df=10, stop_words='english')

# The vocabulary is learned from train + validation text only; the test text
# is merely transformed with it further down.
BoW = vectorizer.fit_transform(trainval.text)

# `get_feature_names()` was removed in scikit-learn 1.2. The fitted
# `vocabulary_` mapping has one entry per feature and exists in every
# version, so its length is a portable feature count.
print('Number of Features in BoW: ', len(vectorizer.vocabulary_))

# NOTE(review): the training matrix is densified while the test matrix stays
# sparse. Kept as in the original; consider leaving both sparse to save memory.
X_train = BoW.toarray()
Y_train = trainval.target
X_test = vectorizer.transform(test.text)
Y_test = test.target


Number of Features in BoW:  1999


## Bagging Classification with DT

In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import sklearn.metrics as skm

# Bagging ensemble of 10 estimators. No base estimator is passed, so
# scikit-learn's default is used; random_state fixes the ensemble's seed.
bag = BaggingClassifier(random_state=0, n_estimators=10).fit(X_train, Y_train)

# Mean accuracy on the data the ensemble was fitted on and on the test split.
train_acc, test_acc = bag.score(X_train, Y_train), bag.score(X_test, Y_test)

# Hard label predictions for the test split.
Y_test_pred = bag.predict(X_test)


In [54]:
# get metrics

# Class names, in the order used for the confusion-matrix rows and columns.
# Passing them to `confusion_matrix` via `labels=` guarantees the matrix has
# exactly this shape and ordering, instead of relying on whichever classes
# happen to occur in Y_test / Y_test_pred.
emotion_labels = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('---------------- Bagging Statistics ----------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Rows are the true classes, columns the predicted classes. Left as the last
# expression so the notebook renders it as a table.
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=emotion_labels),
             columns=emotion_labels,
             index=emotion_labels)


---------------- Bagging Statistics ----------------
Train Accuracy: 0.9851
Test Accuracy: 0.843
              precision    recall  f1-score   support

       anger       0.84      0.91      0.87       275
        fear       0.83      0.84      0.84       224
         joy       0.90      0.81      0.85       695
        love       0.69      0.73      0.71       159
     sadness       0.87      0.90      0.88       581
    surprise       0.58      0.73      0.64        66

    accuracy                           0.84      2000
   macro avg       0.78      0.82      0.80      2000
weighted avg       0.85      0.84      0.84      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,249,10,2,1,11,2
fear,9,189,1,1,9,15
joy,18,9,562,43,52,11
love,2,0,37,116,2,2
sadness,19,11,16,8,522,5
surprise,1,9,5,0,3,48


## AdaBoost Classification with DT

In [None]:
from sklearn.ensemble import AdaBoostClassifier
import sklearn.metrics as skm

# AdaBoost ensemble of 10 boosting rounds. No base estimator is passed, so
# scikit-learn's default is used; random_state fixes the ensemble's seed.
ada = AdaBoostClassifier(random_state=0, n_estimators=10).fit(X_train, Y_train)

# Mean accuracy on the data the ensemble was fitted on and on the test split.
train_acc, test_acc = ada.score(X_train, Y_train), ada.score(X_test, Y_test)

# Hard label predictions for the test split.
Y_test_pred = ada.predict(X_test)


In [None]:

# Class names, in the order used for the confusion-matrix rows and columns.
# Passing them to `confusion_matrix` via `labels=` guarantees the matrix has
# exactly this shape and ordering, instead of relying on whichever classes
# happen to occur in Y_test / Y_test_pred.
emotion_labels = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('---------------- AdaBoost Statistics ----------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Rows are the true classes, columns the predicted classes. Left as the last
# expression so the notebook renders it as a table.
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=emotion_labels),
             columns=emotion_labels,
             index=emotion_labels)
