## XGBoost & AdaBoost

In [1]:
import os
import glob

import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter

from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

%matplotlib inline

In [4]:
# Root of the project data directory, relative to this notebook.
data_path = '../data'

# Match every entry located exactly two levels below <data_path>/raw.
raw_pattern = os.path.join(data_path, 'raw', '*', '*')
files = glob.glob(raw_pattern)

files

['../data\\raw\\data-a\\data_dev_A.csv',
 '../data\\raw\\data-a\\data_train_A.csv',
 '../data\\raw\\data-a\\stimulus dan coding guidelines data A.txt',
 '../data\\raw\\data-b\\data_dev_B.csv',
 '../data\\raw\\data-b\\data_train_B.csv',
 '../data\\raw\\data-b\\stimulus dan coding guidelines data B.txt']

In [30]:
# Load the data-a and data-b training CSVs.
# Paths are assembled with os.path.join so they resolve on every OS; the
# previous hard-coded Windows backslashes are literal characters on POSIX
# systems and made read_csv fail there.
d_train_a = pd.read_csv(os.path.join(data_path, 'raw', 'data-a', 'data_train_A.csv'))

d_train_b = pd.read_csv(os.path.join(data_path, 'raw', 'data-b', 'data_train_B.csv'))

In [31]:
# Load the data-a and data-b development CSVs.
# Paths are assembled with os.path.join so they resolve on every OS; the
# previous hard-coded Windows backslashes are literal characters on POSIX
# systems and made read_csv fail there.
d_dev_a = pd.read_csv(os.path.join(data_path, 'raw', 'data-a', 'data_dev_A.csv'))

d_dev_b = pd.read_csv(os.path.join(data_path, 'raw', 'data-b', 'data_dev_B.csv'))

In [7]:
# Stack both training sets into the single frame that the following cells
# reference as `d_train` (it was otherwise never defined on a fresh run).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0, and
# ignore_index=True yields the same fresh 0..n-1 index as reset_index(drop=True).
# NOTE(review): confirm that training on A+B combined is the intended setup.
d_train = pd.concat([d_train_a, d_train_b], ignore_index=True)

In [32]:
def cleansing(sentence):
    """Normalize a sentence for bag-of-words features.

    The sentence is lower-cased and split with ``word_tokenize``; tokens
    whose length is not greater than one are dropped, and the remaining
    tokens are joined back together with single spaces.

    Args:
        sentence: The raw text to normalize (must support ``.lower()``).

    Returns:
        The normalized, space-separated string.
    """
    tokens = word_tokenize(sentence.lower())
    return " ".join(token for token in tokens if len(token) > 1)

In [12]:
# Store a normalized copy of every raw response in a new column.
cleaned_responses = d_train['RESPONSE'].apply(cleansing)
d_train['response_cleansing'] = cleaned_responses

In [13]:
# Join all cleaned responses into one space-separated string.
text = " ".join(d_train['response_cleansing'])

In [14]:
# Tokenize the combined text into one flat list of tokens.
word_list = word_tokenize(text)

In [15]:
# Count how many times each token occurs in the combined text.
word_freq = Counter(word_list)

In [16]:
# Convert the Counter into a plain dict with the same keys and counts.
word_freq = dict(word_freq)

In [17]:
# Tabulate the vocabulary: one row per token with its occurrence count.
freq_columns = {
    'word': list(word_freq),
    'freq': list(word_freq.values()),
}
d_word_freq = pd.DataFrame(freq_columns)

In [18]:
d_word_freq.shape

(675, 2)

In [19]:
# Unigram + bigram count features; `tfidf_t` re-weights those counts later.
cv = CountVectorizer(ngram_range=(1,2))
# NOTE(review): `tfidf` is not used by any cell visible here — confirm before removing.
tfidf = TfidfVectorizer()
tfidf_t = TfidfTransformer()

In [20]:
# Build the document-term count matrix from the cleaned responses.
# NOTE(review): the vocabulary is fitted on all rows before cross-validation,
# so held-out folds influence the features — confirm this is acceptable.
X = cv.fit_transform(d_train.response_cleansing)

In [21]:
# Replace the raw counts in X with their TF-IDF weighted version.
X  = tfidf_t.fit_transform(X)

In [22]:
X.shape

(268, 2433)

In [23]:
def evaluation(y_true, y_pred):
    """Score a set of predictions with F1, precision and recall.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A dict with the keys ``'f1score'``, ``'precision'`` and ``'recall'``,
        each holding the value of the corresponding scikit-learn metric
        called with its default settings.
    """
    return {
        'f1score': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
    }

## XGBoost

In [24]:
# 5-fold splitter with a fixed seed for reproducible folds.
# shuffle=True is required for random_state to take effect; recent
# scikit-learn versions raise ValueError when a seed is given without it.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

In [25]:
# Candidate learning rates: 0.1, 0.2, ..., 1.5.
lr_list = np.arange(1, 16) / 10

In [None]:
# Cross-validate an XGBoost binary classifier and collect one score dict per
# (param, fold) pair in `score_list`.
score_list = []
# KFold yields positional row indices, so index a plain array of labels
# instead of using label-based `.loc`, which is only correct when the frame
# happens to carry a default 0..n-1 index.
labels = d_train['LABEL'].to_numpy()
for param in [1]:
    params = {'booster':'gbtree', 'max_depth': 100, 'eta':param, 'objective':'binary:logistic'}
    for train_idx, test_idx in kf.split(X, labels):

        X_train, y_train = X[train_idx], labels[train_idx]
        X_test, y_test = X[test_idx], labels[test_idx]

        # Separate names keep the fold indices from being overwritten by the
        # DMatrix objects built from them.
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        model = xgb.train(params, dtrain, num_boost_round=2)

        # binary:logistic returns probabilities; threshold at 0.5 for labels.
        y_pred = model.predict(dtest)
        y_pred = np.where(y_pred > 0.5, 1, 0)

        score = evaluation(y_test, y_pred)
        score['param'] = param
        score_list.append(score)

In [None]:
# One row per score dict collected in score_list.
d_score = pd.DataFrame(score_list)

In [None]:
d_score

In [None]:
# Average each metric over the rows that share the same 'param' value.
d_score.groupby('param')[["f1score", "precision", "recall"]].mean()

In [None]:
# Column-wise mean over every row of d_score.
d_score.mean(axis = 0)

## AdaBoost

In [26]:
# 5-fold splitter with default settings; rebinds the `kf` created for the
# XGBoost section.
kf = KFold(n_splits=5)

In [27]:
# Cross-validate AdaBoost and collect one score dict per fold in `score_list`.
score_list = []
# KFold yields positional row indices, so index a plain array of labels
# instead of using label-based `.loc`, which is only correct when the frame
# happens to carry a default 0..n-1 index.
labels = d_train['LABEL'].to_numpy()
for train_idx, test_idx in kf.split(X, labels):
    X_train, y_train = X[train_idx], labels[train_idx]
    X_test, y_test = X[test_idx], labels[test_idx]

    # A fresh classifier per fold so no state carries over between folds.
    abc = AdaBoostClassifier(n_estimators=1000)
    abc.fit(X_train, y_train)
    y_pred = abc.predict(X_test)

    score = evaluation(y_test, y_pred)
    score_list.append(score)

In [28]:
pd.DataFrame(score_list)

Unnamed: 0,f1score,precision,recall
0,0.761905,0.75,0.774194
1,0.825,0.868421,0.785714
2,0.810127,0.888889,0.744186
3,0.839506,0.85,0.829268
4,0.83871,0.928571,0.764706
