In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv('training_set/pos_tagged.csv')

In [3]:
dataset.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat,pos
0,336691,cerah,4801,cuaca cerah adalah lazim sepanjang tahun,JJ
1,336270,cerah,4801,gambar yang dihasilkan oleh layarnya cukup cer...,JJ
2,336555,cerah,4803,masa depan yang cerah bagi pemuda berumur 20 d...,JJ
3,336618,cerah,4801,cor caroli alpha canum venaticorum nama lengka...,JJ
4,336613,cerah,4801,sanders lebih menyukai cat air untuk lilo deng...,JJ


In [4]:
dataset.filter(['sense'])

Unnamed: 0,sense
0,4801
1,4801
2,4803
3,4801
4,4801
5,4801
6,4801
7,4801
8,4801
9,4703


In [5]:
unique_katas = dataset['kata'].unique()
separated = {kata: dataset[dataset['kata'] == kata] for kata in unique_katas}

## Not separated by word

In [6]:
tfidf_svc_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SVC(C=0.0001))
])

In [7]:
x = dataset['kalimat']
y = dataset['sense']

x_train, x_test, y_train, y_test = train_test_split(x, y)

In [8]:
tfidf_svc_pipeline = tfidf_svc_pipeline.fit(x_train, y_train)



In [9]:
tfidf_svc_pipeline.score(x_test, y_test)

0.016918967052537846

## Separated by word

In [10]:
acceptable_models = {}
score_sum = 0

for kata, df in separated.items():
    tfidf_rf_pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(ngram_range=(1,2))),
        ('classifier', RandomForestClassifier(n_estimators=200))
    ])
    
    x = df['kalimat']
    y = df['sense']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    
    try:
        tfidf_rf_pipeline = tfidf_svc_pipeline.fit(x_train, y_train)
        score = tfidf_rf_pipeline.score(x_test, y_test)
        print(kata, score)
        score_sum += score
        
        if score > 0.9:
            acceptable_models[kata] = tfidf_svc_pipeline
    except Exception as e:
        print(kata, e)

print('==========')
print('avg score', score_sum / len(separated))

cerah 0.9024390243902439
coklat 0.7391304347826086
jalan 0.6190476190476191
sarung 0.5918367346938775
mengeluarkan 0.41304347826086957
dasar 0.4634146341463415
dunia 0.4473684210526316
harapan 0.6363636363636364
layar 0.34146341463414637
menangkap 0.6666666666666666
menyusun 0.4772727272727273
tinggi 0.21428571428571427
besar 0.45454545454545453
jaringan 0.40425531914893614
membawa 0.13953488372093023
mengikat 0.23255813953488372
nilai 0.35
jam 0.4888888888888889
ketat 0.41304347826086957
mata 0.36585365853658536
mengandung 0.9387755102040817
menjaga 0.4444444444444444
badan 0.42105263157894735
dalam 0.3
kunci 0.32558139534883723
lebat 0.723404255319149
panas 0.45
bunga 0.7317073170731707
halaman 0.5128205128205128
kepala 0.8780487804878049
pembagian 0.5405405405405406
bintang 0.358974358974359
cabang 0.9069767441860465
mengejar 0.6341463414634146
mengisi 0.3488372093023256
atas 0.2558139534883721
mendorong 0.8723404255319149
menerima 0.34146341463414637
rapat 0.6428571428571429
berat 

In [55]:
score_sum = 0

for kata, df in separated.items():
    tfidf = TfidfVectorizer(ngram_range=(1,2))
    rf = RandomForestClassifier(n_estimators=200)
    
    pos_dummies = pd.get_dummies(df['pos'])
    x = df.filter(['kalimat']).join(pos_dummies)
    y = df['sense']
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    
    try:
        tfidf = tfidf.fit(x_train['kalimat'])
        
        tfidf_train_matrix = tfidf.transform(x_train['kalimat'])
        tfidf_train = pd.DataFrame(tfidf_train_matrix.toarray(), columns=tfidf.get_feature_names())
#         train_pos_dummies = pd.get_dummies(x_train['pos']).reset_index()
#         train_df = train_pos_dummies.join(tfidf_train)
        train_df = x_train.drop(['kalimat'], axis=1).reset_index().join(tfidf_train)

        rf = rf.fit(train_df, y_train)
        
        tfidf_test_matrix = tfidf.transform(x_test['kalimat'])
        tfidf_test = pd.DataFrame(tfidf_test_matrix.toarray(), columns=tfidf.get_feature_names())
#         test_pos_dummies = pd.get_dummies(x_test['pos']).reset_index()
#         test_df = test_pos_dummies.join(tfidf_test)
        test_df = x_test.drop(['kalimat'], axis=1).reset_index().join(tfidf_test)

        score = rf.score(test_df, y_test)        
        
        if score < 0.7:
            print(kata, score)
        score_sum += score
    except Exception as e:
        print('error :', kata, e)

print('==========')
print('avg score', score_sum / len(separated))

jalan 0.5476190476190477
mengeluarkan 0.5869565217391305
dasar 0.36585365853658536
dunia 0.47368421052631576
menyusun 0.5227272727272727
tinggi 0.5
jaringan 0.5319148936170213
membawa 0.2558139534883721
mengikat 0.3488372093023256
nilai 0.65
jam 0.5555555555555556
ketat 0.45652173913043476
mata 0.5365853658536586
menjaga 0.5333333333333333
badan 0.42105263157894735
dalam 0.4
kunci 0.37209302325581395
halaman 0.6410256410256411
pembagian 0.5135135135135135
bintang 0.5384615384615384
mengisi 0.32558139534883723
atas 0.3023255813953488
menerima 0.4146341463414634
berat 0.2222222222222222
kulit 0.6097560975609756
lingkungan 0.631578947368421
baru 0.6170212765957447
tengah 0.5217391304347826
kaki 0.42857142857142855
menurunkan 0.6190476190476191
avg score 0.6353431050007894
