In [2]:
import os
import glob

import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter

from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

%matplotlib inline

In [3]:
# Root of the project data directory, relative to the notebook location.
data_path = '../data'

# Every entry two levels below <data_path>/raw, i.e. raw/<dataset>/<file>.
raw_pattern = os.path.join(data_path, 'raw', '*', '*')
files = glob.glob(raw_pattern)

files

['../data\\raw\\data-a\\data_dev_A.csv',
 '../data\\raw\\data-a\\data_train_A.csv',
 '../data\\raw\\data-a\\stimulus dan coding guidelines data A.txt',
 '../data\\raw\\data-b\\data_dev_B.csv',
 '../data\\raw\\data-b\\data_train_B.csv',
 '../data\\raw\\data-b\\stimulus dan coding guidelines data B.txt']

In [4]:
# Build every path from `data_path` with os.path.join instead of string
# literals containing backslashes: the hard-coded "..\\raw\\.." form only
# resolves on Windows and silently ignores the configured data root.
raw_dir = os.path.join(data_path, 'raw')

d_train_a = pd.read_csv(os.path.join(raw_dir, 'data-a', 'data_train_A.csv'))
d_train_b = pd.read_csv(os.path.join(raw_dir, 'data-b', 'data_train_B.csv'))

d_dev_a = pd.read_csv(os.path.join(raw_dir, 'data-a', 'data_dev_A.csv'))
d_dev_b = pd.read_csv(os.path.join(raw_dir, 'data-b', 'data_dev_B.csv'))

In [6]:
# Build one Sastrawi stemmer up front; `cleansing` reuses this module-level
# instance for every token instead of constructing a new one per call.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [8]:
def cleansing(sentence):
    """Normalise one response string for vectorisation.

    The text is lower-cased and tokenised with `word_tokenize`; tokens of
    length 1 or less are discarded, each remaining token is stemmed with the
    module-level `stemmer`, and the stems are joined back with single spaces.

    Parameters
    ----------
    sentence : str
        Raw response text.

    Returns
    -------
    str
        Space-separated stemmed tokens.
    """
    stems = [
        stemmer.stem(token)
        for token in word_tokenize(sentence.lower())
        if len(token) > 1
    ]
    return " ".join(stems)

In [9]:
# Add the cleaned text as a new column on each of the four frames.
for frame in (d_train_a, d_train_b, d_dev_a, d_dev_b):
    frame['response_cleansing'] = frame['RESPONSE'].apply(cleansing)

## feature extraction

In [11]:
# One independent TF-IDF vectorizer per dataset (A and B).
tfidf_a, tfidf_b = TfidfVectorizer(), TfidfVectorizer()

In [12]:
X_a = tfidf_a.fit_transform(d_train_a.response_cleansing)
X_b = tfidf_a.fit_transform(d_train_a.response_cleansing)

In [13]:
X_dev_a = tfidf_a.transform(d_dev_a.response_cleansing)
X_dev_b = tfidf_a.transform(d_dev_b.response_cleansing)

## modeling

In [15]:
def evaluation(y_true, y_pred):
    f1score = f1_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    
    return {'f1score': f1score, 'precision': prec, 'recall': recall}

In [16]:
kf = KFold(n_splits=5)

data a

In [17]:
score_list = []
params = {'booster':'gbtree', 'max_depth': 300, 'eta':1, 'objective':'binary:logistic'}
for train, test in kf.split(X_a, d_train_a.LABEL):

    X_train, y_train = X_a[train], np.array(d_train_a.loc[train, 'LABEL'])
    X_test, y_test = X_a[test], np.array(d_train_a.loc[test, 'LABEL'])

    train = xgb.DMatrix(X_train, label=y_train)
    test = xgb.DMatrix(X_test)

    model_a = xgb.train(params, train, num_boost_round=2)

    y_pred = model_a.predict(test)
    y_pred = np.where(y_pred > 0.5, 1, 0)

    score = evaluation(y_test, y_pred)
    score_list.append(score)

In [18]:
pd.DataFrame(score_list)

Unnamed: 0,f1score,precision,recall
0,0.861538,0.823529,0.903226
1,0.839506,0.871795,0.809524
2,0.843373,0.875,0.813953
3,0.902439,0.902439,0.902439
4,0.8,0.83871,0.764706


data b

In [21]:
score_list = []
params = {'booster':'gbtree', 'max_depth': 300, 'eta':1, 'objective':'binary:logistic'}
for train, test in kf.split(X_b, d_train_a.LABEL):

    X_train, y_train = X_b[train], np.array(d_train_a.loc[train, 'LABEL'])
    X_test, y_test = X_b[test], np.array(d_train_a.loc[test, 'LABEL'])

    train = xgb.DMatrix(X_train, label=y_train)
    test = xgb.DMatrix(X_test)

    model_b = xgb.train(params, train, num_boost_round=2)

    y_pred = model_b.predict(test)
    y_pred = np.where(y_pred > 0.5, 1, 0)

    score = evaluation(y_test, y_pred)
    score_list.append(score)

In [22]:
pd.DataFrame(score_list)

Unnamed: 0,f1score,precision,recall
0,0.861538,0.823529,0.903226
1,0.839506,0.871795,0.809524
2,0.843373,0.875,0.813953
3,0.902439,0.902439,0.902439
4,0.8,0.83871,0.764706


## prediction

In [23]:
# Wrap the dev features under a NEW name. Rebinding X_dev_a to the DMatrix
# made this cell fail on a second run, because X_dev_a was then no longer the
# sparse feature matrix.
dmat_dev_a = xgb.DMatrix(X_dev_a)
score_dev_a = model_a.predict(dmat_dev_a)

# Outputs strictly above 0.5 become label 1, everything else label 0.
y_pred_a = np.where(score_dev_a > 0.5, 1, 0)

In [24]:
# Wrap the dev features under a NEW name. Rebinding X_dev_b to the DMatrix
# made this cell fail on a second run, because X_dev_b was then no longer the
# sparse feature matrix.
dmat_dev_b = xgb.DMatrix(X_dev_b)
score_dev_b = model_b.predict(dmat_dev_b)

# Outputs strictly above 0.5 become label 1, everything else label 0.
y_pred_b = np.where(score_dev_b > 0.5, 1, 0)

In [25]:
# Attach each model's predictions to its dev frame as the LABEL column.
for frame, predictions in ((d_dev_a, y_pred_a), (d_dev_b, y_pred_b)):
    frame['LABEL'] = predictions

In [26]:
# Stack dev A on top of dev B. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0; ignore_index=True yields the same fresh 0..n-1 index
# that the separate in-place reset_index call used to produce.
d_dev = pd.concat([d_dev_a, d_dev_b], ignore_index=True)

In [27]:
# Predictions are written to <data_path>/output/predictions_dev.json.
output_dir = os.path.join(data_path, "output")
output_path = os.path.join(output_dir, "predictions_dev.json")

In [28]:
# Create the target directory first: on a fresh checkout it may not exist yet
# and to_json does not create missing parent directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# One JSON record per dev response, holding only its id and predicted label.
d_dev[["RES_ID", "LABEL"]].to_json(output_path, orient='records')