# Character-Count-Based SVM, MNB, RF

In [40]:
import os
import glob

import pandas as pd
import numpy as np
import seaborn as sns

from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score

%matplotlib inline

In [2]:
# Root folder; the pattern below lists every entry one sub-folder deep.
data_path = 'data'

# glob.glob returns matches in arbitrary, OS-dependent order. Sort them so
# the listing -- and any positional lookup into it -- is reproducible.
files = sorted(glob.glob(os.path.join(data_path, '*', '*')))

files

['data\\Data A\\data_dev_A.csv',
 'data\\Data A\\data_train_A.csv',
 'data\\Data A\\stimulus dan coding guidelines data A.txt',
 'data\\Data B\\data_dev_B.csv',
 'data\\Data B\\data_train_B.csv',
 'data\\Data B\\stimulus dan coding guidelines data B.txt']

In [3]:
# Load the two training splits by explicit path rather than by position in
# the glob listing, so an added or reordered file in the data folders cannot
# silently change which CSV is read.
d_train_a = pd.read_csv(os.path.join(data_path, 'Data A', 'data_train_A.csv'))

d_train_b = pd.read_csv(os.path.join(data_path, 'Data B', 'data_train_B.csv'))

In [4]:
# Stack both training sets into one frame. DataFrame.append is deprecated
# (removed in pandas 2.0); pd.concat with ignore_index=True also renumbers
# the rows 0..n-1, which replaces the separate reset_index call.
d_train = pd.concat([d_train_a, d_train_b], ignore_index=True)

In [6]:
# Join every training response into one space-separated string.
text = " ".join(d_train['RESPONSE'])

In [26]:
# Distinct characters found in the joined text, in sorted order.
char_unique = sorted(set(text))

In [28]:
# Two-way lookup between characters and integer ids; ids start at 1.
i2c = {idx: char for idx, char in enumerate(char_unique, start=1)}
c2i = {char: idx for idx, char in enumerate(char_unique, start=1)}

In [29]:
# Register id 0 as the 'UNK' placeholder in both lookup tables.
i2c.update({0: 'UNK'})
c2i.update({'UNK': 0})

In [30]:
# Number of entries in each lookup table, as (len(i2c), len(c2i)).
tuple(len(table) for table in (i2c, c2i))

(70, 70)

In [31]:
def get_encoding(sentence, encoder):
    """Count how often each encoded character occurs in *sentence*.

    Args:
        sentence: Iterable of characters (typically a string).
        encoder: Dict mapping a character to its slot in the output vector.

    Returns:
        A float NumPy array of length ``len(encoder)``. Element ``i`` holds
        the number of characters in *sentence* that map to slot ``i``.
        Characters missing from *encoder* are counted in slot 0.
    """
    data = np.zeros(len(encoder))
    for c in sentence:
        # Only the "unknown character" case falls back to slot 0; any other
        # error (e.g. an out-of-range slot) now surfaces instead of being
        # swallowed by a bare ``except``.
        data[encoder.get(c, 0)] += 1

    return data

In [32]:
# Feature matrix: one row per training row, one column per c2i entry.
X = np.zeros(shape=(d_train.shape[0], len(c2i)))

In [34]:
# Fill each row of X with the character counts of the matching response.
# The frame's index labels are used directly as row positions in X.
for index in d_train.index:
    X[index, :] = get_encoding(d_train.at[index, 'RESPONSE'], c2i)

## Modeling

In [37]:
def evaluation(y_true, y_pred):
    """Score predictions with F1, precision and recall.

    Each metric is computed by the corresponding scikit-learn function
    called with its default settings.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with *y_true*.

    Returns:
        Dict with the keys ``'f1score'``, ``'precision'`` and ``'recall'``.
    """
    return {
        'f1score': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
    }

In [36]:
# 5-fold splitter shared by all models below. shuffle=False is KFold's
# default, spelled out here: folds are taken in row order.
kf = KFold(n_splits=5, shuffle=False)

## Multinomial Naive Bayes

In [38]:
# Cross-validate Multinomial Naive Bayes on the character-count features,
# collecting one score dict per fold.
score_list = []
for train, test in kf.split(X, d_train.LABEL):
    X_train, X_test = X[train], X[test]
    y_train = d_train.loc[train, 'LABEL']
    y_test = d_train.loc[test, 'LABEL']

    # fit() returns the estimator itself, so construct and train in one step.
    mnb = MultinomialNB().fit(X_train, y_train)
    y_pred = mnb.predict(X_test)
    score_list.append(evaluation(y_test, y_pred))

In [39]:
# Tabulate the collected score dicts, one row per fold.
pd.DataFrame(data=score_list)

Unnamed: 0,f1score,precision,recall
0,0.759494,0.769231,0.75
1,0.826087,0.77551,0.883721
2,0.769231,0.656566,0.928571
3,0.630872,0.505376,0.839286
4,0.721893,0.598039,0.910448


## Random Forest

In [41]:
# Cross-validate a random forest on the character-count features,
# collecting one score dict per fold.
score_list = []
for train, test in kf.split(X, d_train.LABEL):
    X_train, y_train = X[train], d_train.loc[train, 'LABEL']
    X_test, y_test = X[test], d_train.loc[test, 'LABEL']

    # Named for what it holds (previously `mnb`, which suggested Naive Bayes).
    # A fixed random_state makes the per-fold scores repeatable across runs.
    rf = RandomForestClassifier(random_state=0)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    score_list.append(evaluation(y_test, y_pred))



In [42]:
# Tabulate the collected score dicts, one row per fold.
pd.DataFrame(data=score_list)

Unnamed: 0,f1score,precision,recall
0,0.77027,0.838235,0.7125
1,0.789809,0.873239,0.72093
2,0.690141,0.680556,0.7
3,0.594203,0.5,0.732143
4,0.676056,0.64,0.716418


## Support Vector Machine

In [43]:
# Cross-validate a support vector classifier (default settings) on the
# character-count features, collecting one score dict per fold.
score_list = []
for train, test in kf.split(X, d_train.LABEL):
    X_train, y_train = X[train], d_train.loc[train, 'LABEL']
    X_test, y_test = X[test], d_train.loc[test, 'LABEL']

    # Named for what it holds (previously `mnb`, which suggested Naive Bayes).
    svc = SVC()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    score_list.append(evaluation(y_test, y_pred))



In [44]:
# Tabulate the collected score dicts, one row per fold.
pd.DataFrame(data=score_list)

Unnamed: 0,f1score,precision,recall
0,0.825,0.825,0.825
1,0.860465,0.860465,0.860465
2,0.76129,0.694118,0.842857
3,0.666667,0.525773,0.910714
4,0.769231,0.674157,0.895522
