### Manual Cleansing, Data Augmentation Word Similarity, TF-IDF, Support Vector Machine

In [1]:
import os
import glob
import pickle

import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter

from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, mutual_info_classif, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

%matplotlib inline

## data-prep

In [2]:
# Base data directory; later cells read from its 'cleaned' and 'support'
# subfolders and write predictions to 'output'.
data_path = '../data'

In [3]:
# Load the prepared train/dev data. The context manager closes the file handle
# as soon as unpickling finishes instead of leaving it open.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(data_path, 'cleaned', 'all_data_cleaned_augmented_false.pkl'), 'rb') as pkl_file:
    data_dict = pickle.load(pkl_file)

In [4]:
# Load the stopword collection. The context manager closes the file handle
# instead of leaving it open.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(data_path, 'support', 'stopwords.pkl'), 'rb') as pkl_file:
    stopwords = pickle.load(pkl_file)

In [5]:
# Show which splits the pickle contains.
data_dict.keys()

dict_keys(['train', 'dev'])

In [6]:
# Unpack the two splits into their own variables.
d_train, d_dev = data_dict['train'], data_dict['dev']

In [7]:
# (rows, columns) of the training frame.
d_train.shape

(7243, 6)

In [8]:
# (rows, columns) of the dev frame.
d_dev.shape

(459, 4)

In [9]:
# Class balance of the training labels.
d_train.LABEL.value_counts()

1    5474
0    1769
Name: LABEL, dtype: int64

In [10]:
# Give the training frame a clean 0..n-1 index. Later cells select rows with
# the positional indices produced by the fold splitter, so index labels and
# positions must coincide.
d_train.reset_index(drop = True, inplace=True)

## cleansing

In [11]:
# Build the Sastrawi stemmer once; cleansing() reuses this module-level
# instance for every token.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [12]:
def cleansing(sentence):
    """Normalise one response into a space-separated string of stemmed tokens.

    Steps: lower-case the text, tokenize it with ``word_tokenize``, drop
    single-character tokens, stem each remaining token with the module-level
    ``stemmer`` and re-join the stems with single spaces. Stopwords are not
    removed.

    Parameters
    ----------
    sentence : str
        Raw response text. Missing values (``None``/``NaN``) are treated as an
        empty response and other non-string values are converted with
        ``str``, instead of failing with ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The cleaned sentence; an empty string when no token survives.
    """
    if not isinstance(sentence, str):
        # A pandas column may carry NaN/None for unanswered rows.
        sentence = "" if pd.isna(sentence) else str(sentence)

    tokens = word_tokenize(sentence.lower())
    stemmed = [stemmer.stem(token) for token in tokens if len(token) > 1]
    return " ".join(stemmed)

In [13]:
# Add the cleaned text column to both the train and dev frames.
for frame in (d_train, d_dev):
    frame['response_cleansing'] = frame['response_2'].apply(cleansing)

## feature extraction

In [14]:
# Unigram TF-IDF features. The vocabulary and IDF weights are fitted on the
# training text only; the dev text is projected onto that same vocabulary.
tfidf = TfidfVectorizer(ngram_range=(1, 1))
X = tfidf.fit_transform(d_train['response_cleansing'])
X_dev = tfidf.transform(d_dev['response_cleansing'])

# (n_training_rows, vocabulary_size)
X.shape

(7243, 596)

## modeling

In [15]:
def evaluation(y_true, y_pred):
    """Score predicted labels against the true labels.

    Returns a dict with the keys 'f1score', 'precision' and 'recall', each
    produced by the matching scikit-learn metric called with its defaults.
    """
    scorers = (
        ('f1score', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    return {name: scorer(y_true, y_pred) for name, scorer in scorers}

In [16]:
# Deterministic 5-fold stratified splitter. `random_state` only has an effect
# together with shuffle=True; recent scikit-learn releases raise a ValueError
# when it is passed with the default shuffle=False, so it is omitted here.
# Without shuffling the folds are already reproducible.
kf = StratifiedKFold(n_splits=5)

### SVM

In [17]:
# Grid-search the linear-SVM regularisation strength C with stratified CV.
score_list = []
C = np.arange(1, 15) / 10  # candidate values 0.1, 0.2, ..., 1.4

# Labels as a plain array so folds are indexed by position, independent of the
# DataFrame index.
y_all = np.asarray(d_train.LABEL)

# Compute the folds once so every C is scored on identical splits, and record
# each [train, test] pair a single time instead of once per candidate.
spliter_index = [[train, test] for train, test in kf.split(X, y_all)]

for c_param in C:
    for train, test in spliter_index:
        X_train, y_train = X[train], y_all[train]
        X_test, y_test = X[test], y_all[test]

        model_b = SVC(kernel='linear', C=c_param)
        model_b.fit(X_train, y_train)

        # SVC.predict already returns class labels, so no thresholding is needed.
        y_pred = model_b.predict(X_test)

        score = evaluation(y_test, y_pred)
        score['param'] = c_param
        score_list.append(score)

In [18]:
# One row per (C value, fold) with its f1score / precision / recall / param.
d_score = pd.DataFrame(score_list)

In [19]:
# Average the fold scores for each C value.
# NOTE(review): 'param' is both the grouping key and a selected column, so it
# shows up twice in the result (as the index and as a column of identical values).
d_score.groupby('param')[["f1score", "param", "precision", "recall"]].mean()

Unnamed: 0_level_0,f1score,param,precision,recall
param,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.1,0.934633,0.1,0.907139,0.964379
0.2,0.945454,0.2,0.930863,0.960726
0.3,0.948119,0.3,0.939913,0.956706
0.4,0.951079,0.4,0.943787,0.958715
0.5,0.957933,0.5,0.953238,0.962918
0.6,0.959754,0.6,0.95685,0.962918
0.7,0.962372,0.7,0.957984,0.96712
0.8,0.965317,0.8,0.960188,0.970774
0.9,0.966668,0.9,0.962057,0.971504
1.0,0.967196,1.0,0.963114,0.971504


### close train

#### SVC

In [20]:
# Fit on the full training matrix and score on the very same rows, so these
# metrics measure fit to the training data rather than generalisation.
# NOTE(review): probability=True enables predict_proba at extra fitting cost,
# but only predict() is called on this model here; confirm it is needed.
model_close = SVC(kernel='linear', C = 0.4, probability=True)
model_close.fit(X, d_train.LABEL)
y_pred = model_close.predict(X)
metrics = evaluation(d_train.LABEL, y_pred)

In [21]:
# Training-set scores of the SVC fitted above.
metrics

{'f1score': 0.97927129942471,
 'precision': 0.9790031038889904,
 'recall': 0.979539641943734}

#### XGBoost

In [None]:
# Two boosting rounds of gradient-boosted trees, fitted on the full training
# matrix and then scored on those same rows.
params = dict(booster='gbtree', max_depth=500, eta=1, objective='binary:logistic')
train = xgb.DMatrix(X, label=d_train.LABEL)
model_xgb = xgb.train(params, train, num_boost_round=2)

# Turn the model outputs into hard 0/1 labels with a 0.5 cut-off.
y_pred = (model_xgb.predict(train) > 0.5).astype(int)

In [None]:
# Score the current y_pred against the training labels (same rows the model
# was fitted on).
metrics = evaluation(d_train.LABEL, y_pred)

In [None]:
# Display the scores computed in the previous cell.
metrics

## predictions

XGBoost

In [None]:
# Score the dev features with the boosted model; keep the raw scores and mark
# a response as positive only when its score exceeds 0.7.
dev = xgb.DMatrix(X_dev)
y_pred_proba = model_xgb.predict(dev)
y_pred = (y_pred_proba > 0.7).astype(int)

In [None]:
# Number of dev rows predicted as class 1.
y_pred.sum()

SVM

In [22]:
# Final linear SVM fitted on every training row, then applied to the dev
# features. `model` is reused further down for the test-set predictions.
model = SVC(C=0.4, kernel='linear').fit(X, d_train['LABEL'])
y_pred = model.predict(X_dev)

In [23]:
# Attach the current predictions to the dev frame as its LABEL column.
d_dev['LABEL'] = y_pred

In [24]:
# Keep only the response id and predicted label.
submission = d_dev[["RES_ID", "LABEL"]]

In [25]:
# Distribution of predicted labels on the dev set.
submission.LABEL.value_counts()

1    335
0    124
Name: LABEL, dtype: int64

In [None]:
# Export the dev responses with their predicted labels for manual inspection.
# The path is derived from data_path, like the other outputs, instead of being
# hard-coded separately.
d_dev[["RES_ID", "response_2", "LABEL"]].to_excel(
    os.path.join(data_path, 'support', 'inspection.xlsx'), index=False
)

In [None]:
# Write the dev predictions as a JSON array of {RES_ID, LABEL} records.
output_path = os.path.join(data_path, "output", "predictions_dev.json")

d_dev[["RES_ID", "LABEL"]].to_json(output_path, orient='records')

## Test

In [26]:
# Load the test data. The path is built from data_path like the other inputs,
# and the context manager closes the file handle after unpickling.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(data_path, 'cleaned', 'all_test.pkl'), 'rb') as pkl_file:
    d_test = pickle.load(pkl_file)

### cleansing

In [37]:
# Replace missing responses with the placeholder text 'tidak tahu'
# (Indonesian for "don't know") so every row holds a string before cleansing.
d_test['RESPONSE'] = d_test['RESPONSE'].fillna('tidak tahu')

In [38]:
# Apply the same text cleansing used for the train/dev responses.
d_test['response_cleansing'] = d_test.RESPONSE.apply(cleansing)

### transform

In [40]:
# Project the test text onto the TF-IDF vocabulary fitted on the training data
# (transform only, no refit).
X_val = tfidf.transform(d_test.response_cleansing)

### modeling

In [42]:
# Predict test labels with `model`, the linear SVM fitted in the
# predictions section above.
y_pred_val = model.predict(X_val)

### submit

In [43]:
# Attach the predictions to the test frame as its LABEL column.
d_test['LABEL'] = y_pred_val

In [47]:
# Keep only the response id and predicted label; this rebinds the
# `submission` name used earlier for the dev predictions.
submission = d_test[["RES_ID", "LABEL"]]

In [48]:
# Write the test predictions as a JSON array of {RES_ID, LABEL} records.
output_path = os.path.join(data_path, "output", "predictions_test.json")

# `submission` already holds exactly these two columns; the re-selection is a no-op.
submission[["RES_ID", "LABEL"]].to_json(output_path, orient='records')