In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

In [3]:
dataset = pd.read_csv('training_set/pos_tagged.csv')

In [4]:
dataset.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat,pos
0,336691,cerah,4801,cuaca cerah adalah lazim sepanjang tahun,JJ
1,336270,cerah,4801,gambar yang dihasilkan oleh layarnya cukup cer...,JJ
2,336555,cerah,4803,masa depan yang cerah bagi pemuda berumur 20 d...,JJ
3,336618,cerah,4801,cor caroli alpha canum venaticorum nama lengka...,JJ
4,336613,cerah,4801,sanders lebih menyukai cat air untuk lilo deng...,JJ


In [6]:
unique_katas = dataset['kata'].unique()
separated = {kata: dataset[dataset['kata'] == kata] for kata in unique_katas}

## Not separated by word

In [7]:
tfidf_svc_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SVC(C=0.0001))
])

In [8]:
x = dataset['kalimat']
y = dataset['sense']

x_train, x_test, y_train, y_test = train_test_split(x, y)

In [9]:
tfidf_svc_pipeline = tfidf_svc_pipeline.fit(x_train, y_train)



In [10]:
tfidf_svc_pipeline.score(x_test, y_test)

0.027604630454140695

## Separated by word

In [11]:
acceptable_models = {}
score_sum = 0

for kata, df in separated.items():
    tfidf_rf_pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(ngram_range=(1,5))),
        ('classifier', RandomForestClassifier(n_estimators=200))
    ])
    
    x = df['kalimat']
    y = df['sense']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    
    try:
        tfidf_rf_pipeline = tfidf_svc_pipeline.fit(x_train, y_train)
        score = tfidf_rf_pipeline.score(x_test, y_test)
        print(kata, score)
        score_sum += score
        
        if score > 0.9:
            acceptable_models[kata] = tfidf_svc_pipeline
    except Exception as e:
        print(kata, e)

print('==========')
print('avg score', score_sum / len(separated))

cerah 0.9024390243902439
coklat 0.7391304347826086
jalan 0.6190476190476191
sarung 0.5918367346938775
mengeluarkan 0.41304347826086957
dasar 0.4634146341463415
dunia 0.4473684210526316
harapan 0.6363636363636364
layar 0.34146341463414637
menangkap 0.6666666666666666
menyusun 0.4772727272727273
tinggi 0.21428571428571427
besar 0.45454545454545453
jaringan 0.40425531914893614
membawa 0.13953488372093023
mengikat 0.23255813953488372
nilai 0.35
jam 0.4888888888888889
ketat 0.41304347826086957
mata 0.36585365853658536
mengandung 0.9387755102040817
menjaga 0.4444444444444444
badan 0.42105263157894735
dalam 0.3
kunci 0.32558139534883723
lebat 0.723404255319149
panas 0.45
bunga 0.7317073170731707
halaman 0.5128205128205128
kepala 0.8780487804878049
pembagian 0.5405405405405406
bintang 0.358974358974359
cabang 0.9069767441860465
mengejar 0.6341463414634146
mengisi 0.3488372093023256
atas 0.2558139534883721
mendorong 0.8723404255319149
menerima 0.34146341463414637
rapat 0.6428571428571429
berat 

In [12]:
score_sum = 0
good_models = {}
bad_models = {}

for kata, df in separated.items():
    tfidf = TfidfVectorizer(ngram_range=(1,1))
    rf = RandomForestClassifier(n_estimators=200)
    
    pos_dummies = pd.get_dummies(df['pos'])
    x = df.filter(['kalimat']).join(pos_dummies)
    y = df['sense']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    
    try:
        tfidf = tfidf.fit(x_train['kalimat'])
        
        tfidf_train_matrix = tfidf.transform(x_train['kalimat'])
        tfidf_train = pd.DataFrame(tfidf_train_matrix.toarray(), columns=tfidf.get_feature_names())
        train_df = x_train.drop(['kalimat'], axis=1).reset_index().join(tfidf_train)

        rf = rf.fit(train_df, y_train)
        
        tfidf_test_matrix = tfidf.transform(x_test['kalimat'])
        tfidf_test = pd.DataFrame(tfidf_test_matrix.toarray(), columns=tfidf.get_feature_names())
        test_df = x_test.drop(['kalimat'], axis=1).reset_index().join(tfidf_test)

        score = rf.score(test_df, y_test)        
        print(kata, score)
        
        if score < 0.7:
            bad_models[kata] = (tfidf, rf)
        else:
            good_models[kata] = (tfidf, rf)
        score_sum += score
    except Exception as e:
        print('error :', kata, e)

print('==========')
print('avg score', score_sum / len(separated))
print('good', g)
print('bad', b)

cerah 0.926829268292683
coklat 0.7391304347826086
jalan 0.6190476190476191
sarung 0.9591836734693877
mengeluarkan 0.391304347826087
dasar 0.4878048780487805
dunia 0.47368421052631576
harapan 0.6363636363636364
layar 0.8536585365853658
menangkap 0.6666666666666666
menyusun 0.4772727272727273
tinggi 0.47619047619047616
besar 0.7045454545454546
jaringan 0.5531914893617021
membawa 0.23255813953488372
mengikat 0.3488372093023256
nilai 0.4
jam 0.5555555555555556
ketat 0.5
mata 0.5365853658536586
mengandung 0.9387755102040817
menjaga 0.4444444444444444
badan 0.4473684210526316
dalam 0.375
kunci 0.3953488372093023
lebat 0.9361702127659575
panas 0.775
bunga 0.7317073170731707
halaman 0.5641025641025641
kepala 0.8780487804878049
pembagian 0.5135135135135135
bintang 0.5384615384615384
cabang 0.9069767441860465
mengejar 0.6829268292682927
mengisi 0.3488372093023256
atas 0.5348837209302325
mendorong 0.8723404255319149
menerima 0.5365853658536586
rapat 0.6428571428571429
berat 0.4074074074074074
kab

NameError: name 'g' is not defined

In [16]:
separated['mengeluarkan']

Unnamed: 0,kalimat_id,kata,sense,kalimat,pos
35,1006405,mengeluarkan,2905,isyana mengeluarkan semua pikiran pikirannya y...,VB
36,1004587,mengeluarkan,2905,majelis umum perserikatan bangsa bangsa telah ...,VB
37,1008067,mengeluarkan,2902,selanjutnya nokia mengeluarkan versi beta dari...,VB
38,1004006,mengeluarkan,2905,pada detik detik terakhir menjelang pelaksanaa...,VB
39,1005813,mengeluarkan,2905,para cendekiawan mengeluarkan berbagai alasan ...,VB
40,1004009,mengeluarkan,2901,dewan keamanan pbb gagal mengeluarkan resolusi...,VB
775,1007627,mengeluarkan,2905,fathurrahman menyadari hal tersebut dan mengel...,VB
776,1007562,mengeluarkan,2902,group musik naif mengeluarkan album terbaru de...,VB
777,1007006,mengeluarkan,2906,pepagan turi mengeluarkan getah bening yang ak...,VB
778,1005994,mengeluarkan,2901,bukti tersebut menyisakan keyakinan seperti me...,VB
