In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [467]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

def prepare_data(filename):
    """Load a response CSV and return its columns as arrays.

    Parameters
    ----------
    filename : str
        Location of the CSV file, handed to ``pd.read_csv``. The file must
        have a ``RESPONSE`` column and may have a ``LABEL`` column.

    Returns
    -------
    tuple of numpy.ndarray, or numpy.ndarray
        ``(responses, labels)`` when a ``LABEL`` column is present,
        otherwise just the array of ``RESPONSE`` values.
    """
    frame = pd.read_csv(filename)
    if 'LABEL' not in frame.columns:
        # Unlabelled file: only the responses are available.
        return frame['RESPONSE'].values
    # Transposing the two-column array gives one row per column, which
    # unpacks directly into the (responses, labels) pair.
    responses, labels = frame[['RESPONSE', 'LABEL']].values.T
    return responses, labels

In [626]:
from spacy.lang.id import Indonesian
from spell import correction

# Shared spaCy Indonesian language object; `tokenizer` calls it to split text into tokens.
nlp = Indonesian()

def tokenizer(text, with_correction=False):
    """Split ``text`` into lemmas with the module-level ``nlp`` object.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    with_correction : bool, default False
        When true, each space-separated word is first passed through
        ``correction`` and the results are re-joined with single spaces
        before tokenizing.

    Returns
    -------
    list of str
        One entry per token: its lemma, or the token's own text when no
        lemma is set.
    """
    if with_correction:
        text = ' '.join(correction(word) for word in text.split(' '))
    # `lemma_` can be an empty string when the pipeline carries no lemmatizer
    # (NOTE(review): this is the spaCy v3 behaviour for blank pipelines —
    # confirm against the installed version). Falling back to the surface
    # text keeps distinct words from collapsing into a single '' feature.
    return [token.lemma_ or token.text for token in nlp(text)]

In [4]:
# Stop-word list from the ID-Stopwords repository; the file is read without a
# header row and its first column becomes a plain Python list.
stopwords_url = 'https://raw.githubusercontent.com/masdevid/ID-Stopwords/master/id.stopwords.02.01.2016.txt'
stopwords = pd.read_csv(stopwords_url, header=None)[0].tolist()

In [627]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support, make_scorer
from sklearn.model_selection import cross_validate

# Token counts -> 100-component truncated SVD -> random forest.
# The steps form one flat pipeline, the same layout as model_A / model_B in
# the prediction section. GradientBoostingClassifier(n_estimators=200,
# random_state=42) is a drop-in alternative for the final step.
clf = make_pipeline(
    CountVectorizer(tokenizer=tokenizer),
    TruncatedSVD(100, random_state=42),
    RandomForestClassifier(n_estimators=200, random_state=42),
)

# Evaluate on task B; pass 'Data A/data_train_A.csv' instead to evaluate task A.
X, y = prepare_data('Data B/data_train_B.csv')
y = y.astype(int)  # labels as integers for the precision / recall / f1 scorers

# 5-fold cross-validation, keeping only the held-out-fold scores.
cv_scores = pd.DataFrame(
    cross_validate(clf, X, y, scoring=['precision', 'recall', 'f1'], cv=5, return_train_score=False)
)[['test_precision', 'test_recall', 'test_f1']]
cv_summary = cv_scores.describe()

# Only the score statistics are fractions. 'count' is the number of folds and
# would render as "500.00%" if it were formatted as a percentage as well.
score_rows = cv_summary.index.drop('count')
(
    cv_summary.style
    .format('{:.2%}', subset=pd.IndexSlice[score_rows, :])
    .format('{:.0f}', subset=pd.IndexSlice[['count'], :])
)

Unnamed: 0,test_precision,test_recall,test_f1
count,500.00%,500.00%,500.00%
mean,68.58%,76.20%,72.12%
std,3.55%,4.62%,3.19%
min,64.86%,70.59%,67.61%
25%,65.00%,72.73%,70.27%
50%,69.23%,76.47%,72.73%
75%,71.05%,79.41%,75.00%
max,72.73%,81.82%,75.00%


# Prediction

In [586]:
# Training sets: response texts plus integer labels, one pair per task.
X_A, y_A = prepare_data('Data A/data_train_A.csv')
y_A = y_A.astype(int)
X_B, y_B = prepare_data('Data B/data_train_B.csv')
y_B = y_B.astype(int)

# Dev sets are loaded as full DataFrames (not only the RESPONSE values) so
# that the other columns stay available next to the texts fed to the models.
dev_A = pd.read_csv('Data A/data_dev_A.csv')
X_dev_A = dev_A['RESPONSE'].values
dev_B = pd.read_csv('Data B/data_dev_B.csv')
X_dev_B = dev_B['RESPONSE'].values

In [610]:
def _build_model(n_estimators):
    """Return an unfitted token-count -> SVD(100) -> random-forest pipeline.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the random forest; every other setting is shared
        by the two tasks.
    """
    return make_pipeline(
        CountVectorizer(tokenizer=tokenizer),
        TruncatedSVD(100, random_state=42),
        RandomForestClassifier(n_estimators=n_estimators, random_state=42),
    )


model_A = _build_model(200)
model_B = _build_model(250)

# Fit each task's model on its training set and store the predictions in the
# matching dev frame as a 'LABEL' column. The loop names are specific to this
# cell so they do not overwrite the X / y / clf globals assigned by the
# cross-validation cell.
for X_train, y_train, X_dev, dev_frame, model in zip(
    [X_A, X_B], [y_A, y_B], [X_dev_A, X_dev_B], [dev_A, dev_B], [model_A, model_B]
):
    model.fit(X_train, y_train)
    dev_frame['LABEL'] = model.predict(X_dev)

In [611]:
# Stack the (RES_ID, LABEL) pairs of both dev sets and write them as a JSON
# list of records to a file named after today's date (dev_YYYYMMDD.json).
submission = pd.concat([frame[['RES_ID','LABEL']] for frame in (dev_A, dev_B)])
date_stamp = pd.Timestamp.today().strftime('%Y%m%d')
submission.to_json('dev_{}.json'.format(date_stamp), orient='records')