### Spell corrector

In [6]:
import os
import glob

import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter

from nltk.tokenize import word_tokenize
from symspellpy.symspellpy import SymSpell, Verbosity

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

%matplotlib inline

In [2]:
# Relative path to the project's data directory; the raw CSVs are looked up under <data_path>/raw
data_path = '../data'

In [3]:
# List every entry exactly two levels below <data_path>/raw (i.e. the files inside each dataset folder)
glob.glob(os.path.join(data_path, 'raw', '*', '*'))

['../data\\raw\\data-a\\data_dev_A.csv',
 '../data\\raw\\data-a\\data_train_A.csv',
 '../data\\raw\\data-a\\stimulus dan coding guidelines data A.txt',
 '../data\\raw\\data-b\\data_dev_B.csv',
 '../data\\raw\\data-b\\data_train_B.csv',
 '../data\\raw\\data-b\\stimulus dan coding guidelines data B.txt']

In [4]:
# Training sets for task A and task B.
# Paths are assembled with os.path.join from data_path so the notebook also runs
# on non-Windows systems (a literal "\\" separator only resolves on Windows).
d_train_a = pd.read_csv(os.path.join(data_path, 'raw', 'data-a', 'data_train_A.csv'))

d_train_b = pd.read_csv(os.path.join(data_path, 'raw', 'data-b', 'data_train_B.csv'))

In [5]:
# Development (hold-out) sets for task A and task B.
# Paths are assembled with os.path.join from data_path so the notebook also runs
# on non-Windows systems (a literal "\\" separator only resolves on Windows).
d_dev_a = pd.read_csv(os.path.join(data_path, 'raw', 'data-a', 'data_dev_A.csv'))

d_dev_b = pd.read_csv(os.path.join(data_path, 'raw', 'data-b', 'data_dev_B.csv'))

In [35]:
# Loaded SymSpell instances, keyed by (dictionary path, max edit distance, prefix length).
# Building the index means parsing the whole dictionary file, so it is done once
# and reused instead of being repeated for every sentence that gets corrected.
_SYM_SPELL_CACHE = {}


def _get_sym_spell(dictionary_path, max_edit_distance_dictionary=2, prefix_length=7):
    """Return a cached SymSpell loaded from ``dictionary_path``, or None if loading fails."""
    cache_key = (dictionary_path, max_edit_distance_dictionary, prefix_length)
    sym_spell = _SYM_SPELL_CACHE.get(cache_key)
    if sym_spell is None:
        sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
            # Failed loads are not cached, so a later call retries the load.
            return None
        _SYM_SPELL_CACHE[cache_key] = sym_spell
    return sym_spell


def spell_corrector(input_term):
    """Spell-correct a (possibly multi-word) string with SymSpell compound lookup.

    The frequency dictionary is read from ``../data/support/corpus-ref.csv`` the
    first time the function is called and reused on every later call.

    Parameters
    ----------
    input_term : str
        Text to correct. ``lookup_compound`` works on the whole string and
        supports compound splitting and merging.

    Returns
    -------
    str or None
        The term of the first suggestion returned by ``lookup_compound``.
        ``None`` if the dictionary could not be loaded (a message is printed)
        or if no suggestion is returned.
    """
    dictionary_path = "../data/support/corpus-ref.csv"
    sym_spell = _get_sym_spell(dictionary_path)
    if sym_spell is None:
        print("Dictionary file not found")
        return None

    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(input_term, max_edit_distance_lookup)

    # Only the first suggestion is used.
    best = next(iter(suggestions), None)
    return None if best is None else best.term
        
    

In [36]:
def cleansing(sentence):
    """Normalize one raw response and run it through the spell corrector.

    Steps: lowercase, tokenize with ``word_tokenize``, drop tokens of length 1,
    re-join with single spaces, then pass the result to ``spell_corrector``.

    Parameters
    ----------
    sentence : str
        Raw response text. Missing values (``None``/NaN) are treated as an
        empty response; other non-string values are converted with ``str``.

    Returns
    -------
    str
        The cleaned text. If ``spell_corrector`` returns ``None``, the
        lowercased/filtered text is returned uncorrected so that the result
        is always a string.
    """
    if not isinstance(sentence, str):
        # Empty cells read from a CSV arrive as NaN, which has no .lower().
        if sentence is None or pd.isna(sentence):
            return ""
        sentence = str(sentence)

    tokens = [word for word in word_tokenize(sentence.lower()) if len(word) > 1]
    sentence = " ".join(tokens)

    corrected = spell_corrector(sentence)
    # Fall back to the uncorrected text rather than propagating None downstream.
    return sentence if corrected is None else corrected

In [37]:
# Add a cleaned copy of RESPONSE to every train/dev frame (in the same order as before)
for frame in (d_train_a, d_train_b, d_dev_a, d_dev_b):
    frame['response_cleansing'] = frame.RESPONSE.apply(cleansing)

## feature extraction

In [45]:
# One independent TF-IDF vectorizer (default settings) per dataset, so data A and
# data B each get their own vocabulary
tfidf_a = TfidfVectorizer()
tfidf_b = TfidfVectorizer()

In [46]:
# Fit each vectorizer on its cleaned training responses and vectorize them.
# NOTE(review): the vectorizers see the full training set before it is split
# into CV folds, so validation-fold text contributes to vocabulary/IDF weights.
tfidf_X_a = tfidf_a.fit_transform(d_train_a['response_cleansing'])
tfidf_X_b = tfidf_b.fit_transform(d_train_b['response_cleansing'])

In [47]:
# Vectorize the dev responses with the already-fitted vectorizers (transform only, no refit)
tfidf_X_dev_a = tfidf_a.transform(d_dev_a['response_cleansing'])
tfidf_X_dev_b = tfidf_b.transform(d_dev_b['response_cleansing'])

### modeling

In [42]:
def evaluation(y_true, y_pred):
    """Score predictions against the ground truth.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys ``'f1score'``, ``'precision'`` and ``'recall'`` holding the
        results of ``f1_score``, ``precision_score`` and ``recall_score``
        called with ``(y_true, y_pred)``.
    """
    return {
        'f1score': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
    }

In [43]:
# Plain 5-fold split over consecutive rows (no shuffling).
# random_state is deliberately not passed: it only takes effect together with
# shuffle=True, and scikit-learn >= 0.24 raises a ValueError when it is set while
# shuffle is False. The resulting folds are the same deterministic ones either way.
kf = KFold(n_splits=5)

In [48]:
# 5-fold cross-validation of an XGBoost classifier on the data-A TF-IDF features.
# score_list collects one metrics dict per fold.
score_list = []
params = {'booster':'gbtree', 'max_depth': 300, 'eta':1, 'objective':'binary:logistic'}

# kf.split yields *positional* row indices, so the labels are taken as a plain
# array and indexed positionally (label-based .loc is only equivalent while the
# frame keeps its default RangeIndex).
labels_a = d_train_a['LABEL'].values

for train_idx, test_idx in kf.split(tfidf_X_a, labels_a):

    X_train, y_train = tfidf_X_a[train_idx], labels_a[train_idx]
    X_test, y_test = tfidf_X_a[test_idx], labels_a[test_idx]

    # Separate names for the DMatrix objects so the fold indices are not overwritten
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test)

    model_a = xgb.train(params, dtrain, num_boost_round=2)

    # binary:logistic outputs probabilities; cut at 0.5 to get class labels
    y_pred = np.where(model_a.predict(dtest) > 0.5, 1, 0)

    score_list.append(evaluation(y_test, y_pred))

In [50]:
# Per-fold metrics for data A: one row per cross-validation fold
pd.DataFrame(score_list)

Unnamed: 0,f1score,precision,recall
0,0.811594,0.736842,0.903226
1,0.870588,0.860465,0.880952
2,0.833333,0.853659,0.813953
3,0.891566,0.880952,0.902439
4,0.8,0.83871,0.764706


### Modeling: data B

In [51]:
# 5-fold cross-validation of an XGBoost classifier on the data-B TF-IDF features,
# scored at every decision threshold listed in `thresholds`.
thresholds = [0.5]
params = {'booster':'gbtree', 'max_depth': 3000, 'eta':0.7, 'objective':'binary:logistic'}

# kf.split yields *positional* row indices, so the labels are taken as a plain
# array and indexed positionally (label-based .loc is only equivalent while the
# frame keeps its default RangeIndex).
labels_b = d_train_b['LABEL'].values

# Training does not depend on the threshold, so each fold is trained once and its
# predicted probabilities are kept for scoring at every threshold afterwards.
fold_outputs = []
for train_idx, test_idx in kf.split(tfidf_X_b, labels_b):

    X_train, y_train = tfidf_X_b[train_idx], labels_b[train_idx]
    X_test, y_test = tfidf_X_b[test_idx], labels_b[test_idx]

    # Separate names for the DMatrix objects so the fold indices are not overwritten
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test)

    model_b = xgb.train(params, dtrain, num_boost_round=2)

    fold_outputs.append((y_test, model_b.predict(dtest)))

# One metrics dict per (threshold, fold), ordered threshold-major then by fold.
score_list = []
for thres in thresholds:
    for y_test, y_prob in fold_outputs:
        y_pred = np.where(y_prob > thres, 1, 0)

        score = evaluation(y_test, y_pred)
        score['thres'] = thres
        score_list.append(score)

In [52]:
# Per-fold metrics for data B: one row per (threshold, fold) combination
pd.DataFrame(score_list)

Unnamed: 0,f1score,precision,recall,thres
0,0.617647,0.65625,0.583333,0.5
1,0.657534,0.571429,0.774194,0.5
2,0.628571,0.578947,0.6875,0.5
3,0.676056,0.666667,0.685714,0.5
4,0.702703,0.65,0.764706,0.5
