## XGBoost & Adaboost

In [183]:
import os
import glob

import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter

from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

%matplotlib inline

In [3]:
data_path = '../data'

files = glob.glob(os.path.join(data_path, 'raw', '*', '*'))

files

['../data/raw/data-a/data_dev_A.csv',
 '../data/raw/data-a/stimulus dan coding guidelines data A.txt',
 '../data/raw/data-a/data_train_A.csv',
 '../data/raw/data-b/data_train_B.csv',
 '../data/raw/data-b/data_dev_B.csv',
 '../data/raw/data-b/stimulus dan coding guidelines data B.txt']

In [4]:
d_train_a = pd.read_csv(files[2])

d_train_b = pd.read_csv(files[3])

In [7]:
d_train = d_train_a.append(d_train_b)

d_train.reset_index(drop = True, inplace = True)

In [14]:
d_train.head()

Unnamed: 0,RES_ID,RESPONSE,LABEL,response_cleansing
0,TRA1,intetraksi/beradaptasi terhadap lingkungan yan...,1,intetraksi/beradaptasi terhadap lingkungan yan...
1,TRA2,seperti jatuhnya meteor tsunami gempa bumi,0,seperti jatuhnya meteor tsunami gempa bumi
2,TRA3,hanya tuhan yang tahu tantangan nya itu apaan,0,hanya tuhan yang tahu tantangan nya itu apaan
3,TRA4,mereka akan sulit beradaptasi,1,mereka akan sulit beradaptasi
4,TRA5,"Tempat tinggal, ekonomi, dan pekerjaan",1,tempat tinggal ekonomi dan pekerjaan


In [8]:
def cleansing(sentence):
    sentence = sentence.lower()
    word_list = word_tokenize(sentence)
    word_list = [word for word in word_list if len(word) > 1]
    sentence = " ".join(word_list)
    
    return sentence

In [9]:
d_train['response_cleansing'] = d_train.RESPONSE.apply(cleansing)

In [179]:
text = " ".join(d_train['response_cleansing'])

In [182]:
word_list = word_tokenize(text)

In [186]:
word_freq = Counter(word_list)

In [188]:
word_freq = dict(word_freq)

In [191]:
d_word_freq = pd.DataFrame(data = {'word': list(word_freq.keys()), 'freq': list(word_freq.values())})

In [196]:
d_word_freq.shape

(1151, 2)

In [170]:
cv = CountVectorizer(ngram_range=(1,2))
tfidf = TfidfVectorizer()
tfidf_t = TfidfTransformer()

In [171]:
X = cv.fit_transform(d_train.response_cleansing)

In [172]:
X  = tfidf_t.fit_transform(X)

In [173]:
X.shape

(573, 4841)

In [17]:
def evaluation(y_true, y_pred):
    f1score = f1_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    
    return {'f1score': f1score, 'precision': prec, 'recall': recall}

## XGBoost

In [140]:
kf = KFold(n_splits=5, random_state=123)

In [141]:
lr_list = np.array(list(range(1, 16, 1))) / 10

In [142]:
score_list = []
for param in [1]:
    params = {'booster':'gbtree', 'max_depth': 100, 'eta':param, 'objective':'binary:logistic'}
    for train, test in kf.split(X, d_train.LABEL):

        X_train, y_train = X[train], np.array(d_train.loc[train, 'LABEL'])
        X_test, y_test = X[test], np.array(d_train.loc[test, 'LABEL'])

        train = xgb.DMatrix(X_train, label=y_train)
        test = xgb.DMatrix(X_test)

        model = xgb.train(params, train, num_boost_round=2)

        y_pred = model.predict(test)
        y_pred = np.where(y_pred > 0.5, 1, 0)

        score = evaluation(y_test, y_pred)
        score['param'] = param
        score_list.append(score)

In [143]:
d_score = pd.DataFrame(score_list)

In [144]:
d_score

Unnamed: 0,f1score,param,precision,recall
0,0.819876,1,0.814815,0.825
1,0.837209,1,0.837209,0.837209
2,0.691176,1,0.712121,0.671429
3,0.585366,1,0.537313,0.642857
4,0.671756,1,0.6875,0.656716


In [131]:
d_score.groupby('param')[["f1score", "precision", "recall"]].mean()

Unnamed: 0_level_0,f1score,precision,recall
param,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.753911,0.739064,0.77328


In [128]:
d_score.mean(axis = 0)

f1score      0.738320
param        0.800000
precision    0.735167
recall       0.746332
dtype: float64

## Adaboost

In [174]:
kf = KFold(n_splits=5)

In [177]:
score_list = []
for train, test in kf.split(X, d_train.LABEL):
    X_train, y_train = X[train], d_train.loc[train, 'LABEL']
    X_test, y_test = X[test], d_train.loc[test, 'LABEL']
    
    abc = AdaBoostClassifier(n_estimators=1000)
    abc.fit(X_train, y_train)
    y_pred = abc.predict(X_test)
    
    score = evaluation(y_test, y_pred)
    score_list.append(score)

In [176]:
pd.DataFrame(score_list)

Unnamed: 0,f1score,precision,recall
0,0.792208,0.824324,0.7625
1,0.837209,0.837209,0.837209
2,0.73913,0.75,0.728571
3,0.633333,0.59375,0.678571
4,0.645161,0.701754,0.597015
