In [1]:
import pandas as pd
import numpy as np

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv('training_set/pos_tagged.csv')

In [3]:
dataset.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat,pos
0,336691,cerah,4801,cuaca |cerah| lazim sepanjang tahun,JJ
1,336270,cerah,4801,gambar dihasilkan layarnya cukup |cerah| memil...,JJ
2,336555,cerah,4803,masa depan |cerah| pemuda berumur 20 prancis a...,JJ
3,336618,cerah,4801,cor caroli alpha canum venaticorum nama lengka...,JJ
4,336613,cerah,4801,sanders lebih menyukai cat air lilo maksud men...,JJ


In [4]:
unique_katas = dataset['kata'].unique()
separated = {kata: dataset[dataset['kata'] == kata] for kata in unique_katas}

## Separated by word

In [6]:
score_sum = 0
good_models = {}
bad_models = {}

for kata, df in separated.items():
    tfidf = TfidfVectorizer(ngram_range=(1,2))
    rf = RandomForestClassifier(n_estimators=200)
    
    pos_dummies = pd.get_dummies(df['pos'])
    x = df.filter(['kalimat']).join(pos_dummies)
    y = df['sense']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    
    try:
        tfidf = tfidf.fit(x_train['kalimat'])
        
        tfidf_train_matrix = tfidf.transform(x_train['kalimat'])
        tfidf_train = pd.DataFrame(tfidf_train_matrix.toarray(), columns=tfidf.get_feature_names())
        train_df = x_train.drop(['kalimat'], axis=1).reset_index().join(tfidf_train)
        
        rf = rf.fit(train_df, y_train)
        
        tfidf_test_matrix = tfidf.transform(x_test['kalimat'])
        tfidf_test = pd.DataFrame(tfidf_test_matrix.toarray(), columns=tfidf.get_feature_names())
        test_df = x_test.drop(['kalimat'], axis=1).reset_index().join(tfidf_test)

        score = rf.score(test_df, y_test)        
        print(kata, score)
        
        if score < 0.7:
            bad_models[kata] = (tfidf, rf)
        else:
            good_models[kata] = (tfidf, rf)
        score_sum += score
    except Exception as e:
        print('error :', kata, e)
    

print('==========')
print('avg score', score_sum / len(separated))

cerah 0.926829268292683
coklat 0.7608695652173914
jalan 0.7619047619047619
sarung 0.8775510204081632
mengeluarkan 0.41304347826086957
dasar 0.7560975609756098
dunia 0.631578947368421
harapan 0.6818181818181818
layar 0.8292682926829268
menangkap 0.6666666666666666
menyusun 0.4772727272727273
tinggi 0.5476190476190477
besar 0.7045454545454546
jaringan 0.44680851063829785
membawa 0.16279069767441862
mengikat 0.3953488372093023
nilai 0.525
jam 0.4888888888888889
ketat 0.5217391304347826
mata 0.7560975609756098
mengandung 0.9387755102040817
menjaga 0.4444444444444444
badan 0.7105263157894737
dalam 0.35
kunci 0.6511627906976745
lebat 0.9787234042553191
panas 0.925
bunga 0.7804878048780488
halaman 0.5641025641025641
kepala 0.8780487804878049
pembagian 0.5405405405405406
bintang 0.46153846153846156
cabang 0.9069767441860465
mengejar 0.6585365853658537
mengisi 0.3953488372093023
atas 0.5581395348837209
mendorong 0.8723404255319149
menerima 0.5365853658536586
rapat 0.7619047619047619
berat 0.555