In [14]:
import pandas as pd
import numpy as np

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv('training_set/pos_tagged.csv')

In [3]:
dataset.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat,pos
0,336691,cerah,4801,cuaca cerah adalah lazim sepanjang tahun,JJ
1,336270,cerah,4801,gambar yang dihasilkan oleh layarnya cukup cer...,JJ
2,336555,cerah,4803,masa depan yang cerah bagi pemuda berumur 20 d...,JJ
3,336618,cerah,4801,cor caroli alpha canum venaticorum nama lengka...,JJ
4,336613,cerah,4801,sanders lebih menyukai cat air untuk lilo deng...,JJ


In [4]:
dataset['kalimat'] = dataset.apply(lambda r: r['kalimat'].replace(r['kata'], f'|{r["kata"]}|'), axis=1)

In [5]:
sw_remover = StopWordRemoverFactory().create_stop_word_remover()
dataset['kalimat'] = dataset['kalimat'].apply(lambda s: sw_remover.remove(s))

In [15]:
stemmer = StemmerFactory().create_stemmer()
dataset['kalimat'] = dataset['kalimat'].apply(lambda s: stemmer.stem(s))

In [16]:
dataset.to_csv('training_set/stemmed.csv', index=False)

In [17]:
unique_katas = dataset['kata'].unique()
separated = {kata: dataset[dataset['kata'] == kata] for kata in unique_katas}

In [19]:
dataset.head(100)

Unnamed: 0,kalimat_id,kata,sense,kalimat,pos
0,336691,cerah,4801,cuaca cerah lazim panjang tahun,JJ
1,336270,cerah,4801,gambar hasil layar cukup cerah milik speaker m...,JJ
2,336555,cerah,4803,masa depan cerah pemuda umur 20 prancis abad 17,JJ
3,336618,cerah,4801,cor caroli alpha canum venaticorum nama lengka...,JJ
4,336613,cerah,4801,sanders lebih suka cat air lilo maksud tampil ...,JJ
5,336406,cerah,4801,ulleungdo milik iklim subtropis basah klasifik...,JJ
6,336324,cerah,4801,ikan hias mungkin besar sulit tahan hidup alam...,JJ
7,336401,cerah,4801,buah entremet tanda akhir saji suatu set menu ...,JJ
8,336426,cerah,4801,alangkah lega hati ia mulai lihat binatang bin...,JJ
9,337760,coklat,4703,sisi atas tubuh warna coklat tembaga emas,NN


## Not separated by word

In [7]:
# tfidf_svc_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer()),
#     ('classifier', SVC(C=0.0001))
# ])

In [8]:
# x = dataset['kalimat']
# y = dataset['sense']

# x_train, x_test, y_train, y_test = train_test_split(x, y)

In [9]:
# tfidf_svc_pipeline = tfidf_svc_pipeline.fit(x_train, y_train)

In [10]:
# tfidf_svc_pipeline.score(x_test, y_test)

## Separated by word

In [22]:
score_sum = 0
good_models = {}
bad_models = {}

for kata, df in separated.items():
    tfidf = TfidfVectorizer(ngram_range=(1,2))
    rf = RandomForestClassifier(n_estimators=200)
    
    pos_dummies = pd.get_dummies(df['pos'])
    x = df.filter(['kalimat']).join(pos_dummies)
    y = df['sense']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    
    try:
        tfidf = tfidf.fit(x_train['kalimat'])
        
        tfidf_train_matrix = tfidf.transform(x_train['kalimat'])
        tfidf_train = pd.DataFrame(tfidf_train_matrix.toarray(), columns=tfidf.get_feature_names())
        train_df = x_train.drop(['kalimat'], axis=1).reset_index().join(tfidf_train)
        
        rf = rf.fit(train_df, y_train)
        
        tfidf_test_matrix = tfidf.transform(x_test['kalimat'])
        tfidf_test = pd.DataFrame(tfidf_test_matrix.toarray(), columns=tfidf.get_feature_names())
        test_df = x_test.drop(['kalimat'], axis=1).reset_index().join(tfidf_test)

        score = rf.score(test_df, y_test)        
        print(kata, score)
        
        if score < 0.7:
            bad_models[kata] = (tfidf, rf)
        else:
            good_models[kata] = (tfidf, rf)
        score_sum += score
    except Exception as e:
        print('error :', kata, e)
    

print('==========')
print('avg score', score_sum / len(separated))

cerah 0.926829268292683
coklat 0.7391304347826086
jalan 0.6666666666666666
sarung 0.9591836734693877
mengeluarkan 0.41304347826086957
dasar 0.5121951219512195
dunia 0.47368421052631576
harapan 0.6363636363636364
layar 0.8780487804878049
menangkap 0.6666666666666666
menyusun 0.4772727272727273
tinggi 0.47619047619047616
besar 0.7045454545454546
jaringan 0.48936170212765956
membawa 0.20930232558139536
mengikat 0.3023255813953488
nilai 0.525
jam 0.5333333333333333
ketat 0.5869565217391305
mata 0.6097560975609756
mengandung 0.9387755102040817
menjaga 0.4444444444444444
badan 0.4473684210526316
dalam 0.275
kunci 0.32558139534883723
lebat 0.9574468085106383
panas 0.8
bunga 0.7317073170731707
halaman 0.5641025641025641
kepala 0.8780487804878049
pembagian 0.5405405405405406
bintang 0.48717948717948717
cabang 0.9069767441860465
mengejar 0.6341463414634146
mengisi 0.3953488372093023
atas 0.5581395348837209
mendorong 0.8723404255319149
menerima 0.43902439024390244
rapat 0.6428571428571429
berat 0