In [1]:
import pandas as pd
import numpy as np

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

In [2]:
# Load the training data from 'training_set/pos_tagged.csv' into a DataFrame.
dataset = pd.read_csv('training_set/pos_tagged.csv')

In [3]:
# Preview the first rows of the loaded data.
dataset.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat,pos
0,336691,cerah,4801,cuaca cerah adalah lazim sepanjang tahun,JJ
1,336270,cerah,4801,gambar yang dihasilkan oleh layarnya cukup cer...,JJ
2,336555,cerah,4803,masa depan yang cerah bagi pemuda berumur 20 d...,JJ
3,336618,cerah,4801,cor caroli alpha canum venaticorum nama lengka...,JJ
4,336613,cerah,4801,sanders lebih menyukai cat air untuk lilo deng...,JJ


In [4]:
def _mark_target(row):
    """Return the row's sentence with every occurrence of its target word wrapped in '|'."""
    target = row['kata']
    return row['kalimat'].replace(target, f'|{target}|')


# Row-wise pass: each sentence ('kalimat') is rewritten using its own target word ('kata').
dataset['kalimat'] = dataset.apply(_mark_target, axis=1)

In [5]:
# Create the Sastrawi stop-word remover, then run it over every sentence.
sw_remover = StopWordRemoverFactory().create_stop_word_remover()
dataset['kalimat'] = dataset['kalimat'].map(sw_remover.remove)

In [6]:
# Create the Sastrawi stemmer, then stem every sentence.
stemmer = StemmerFactory().create_stemmer()
dataset['kalimat'] = dataset['kalimat'].map(stemmer.stem)

KeyboardInterrupt: 

In [None]:
# Write the current dataset to 'training_set/stemmed.csv' (row index omitted);
# a later cell reads this same file back.
dataset.to_csv('training_set/stemmed.csv', index=False)

In [7]:
# Reload the dataset from 'training_set/stemmed.csv'.
dataset = pd.read_csv('training_set/stemmed.csv')

# Distinct target words, in order of first appearance.
unique_katas = dataset['kata'].unique()

# Map each target word to the sub-frame of rows whose 'kata' equals it.
separated = dict(
    (kata, dataset.loc[dataset['kata'].eq(kata)])
    for kata in unique_katas
)

In [8]:
# Inspect the rows collected for the target word 'mengikat'.
separated['mengikat']

Unnamed: 0,kalimat_id,kata,sense,kalimat,pos
80,1010775,mengikat,2202,kaisar asyur sibuk perang saudara revolusi tah...,VB
81,1010515,mengikat,2203,bagi dua saran yang ikat siapa hanya makna sek...,VB
82,1010477,mengikat,2202,patangatus patang puluh ikang cakakala 440 s 5...,VB
83,1010291,mengikat,2203,literatur fiqih ahl al hall wa al aqd orang or...,VB
84,1010239,mengikat,2204,pusat hg ikat ikat rangkap tiga cc kemudian se...,VB
85,1010165,mengikat,2204,asam ikat logam racun per alumunium danau,VB
671,1010919,mengikat,2204,semen ikat butir butir sama biasa rupa kalsit ...,VB
672,1009970,mengikat,2202,raden panji margono putra tejakusuma v jabat a...,VB
673,1010405,mengikat,2203,meski dapat tidak ikat bawah hukum internasion...,VB
674,1010575,mengikat,2202,janji ada allah situ umat nya sangat penting i...,VB


In [None]:
# Show the first 100 rows of the dataset.
dataset.head(100)

## Not separated by word

In [None]:
# tfidf_svc_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer()),
#     ('classifier', SVC(C=0.0001))
# ])

In [None]:
# x = dataset['kalimat']
# y = dataset['sense']

# x_train, x_test, y_train, y_test = train_test_split(x, y)

In [None]:
# tfidf_svc_pipeline = tfidf_svc_pipeline.fit(x_train, y_train)

In [None]:
# tfidf_svc_pipeline.score(x_test, y_test)

## Separated by word

In [None]:
# Train and evaluate one model per target word.
#
# For every word in `separated`, the features are the one-hot encoded 'pos'
# column plus TF-IDF weights (unigrams and bigrams) of the sentence, and the
# label is 'sense'. Each fitted (vectorizer, classifier) pair is stored in
# `good_models` (test accuracy >= 0.7) or `bad_models` (below 0.7).
score_sum = 0
scored_count = 0  # words whose model was trained and scored successfully
good_models = {}
bad_models = {}


def _tfidf_frame(vectorizer, texts):
    """Return the dense TF-IDF matrix of `texts` as a DataFrame, one column per term.

    The result has a fresh 0..n-1 index so it can be joined positionally.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    return pd.DataFrame(vectorizer.transform(texts).toarray(), columns=terms)


for kata, df in separated.items():
    try:
        # One-hot encode POS on the full per-word frame so train and test
        # share exactly the same dummy columns.
        pos_dummies = pd.get_dummies(df['pos'])
        x = df.filter(['kalimat']).join(pos_dummies)
        y = df['sense']
        # Inside the try block: a word with too few rows makes the split
        # raise, and that should skip the word rather than abort the run.
        x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

        # Fit the vocabulary on the training sentences only.
        tfidf = TfidfVectorizer(ngram_range=(1,2)).fit(x_train['kalimat'])

        # reset_index(drop=True) realigns rows with the TF-IDF frame WITHOUT
        # turning the old row index into an 'index' feature column (which
        # leaked row position into the model and clashed with the TF-IDF
        # term "index" when a sentence contained that word).
        train_df = (
            x_train.drop(['kalimat'], axis=1)
            .reset_index(drop=True)
            .join(_tfidf_frame(tfidf, x_train['kalimat']))
        )

        # Seeded so the reported scores are reproducible across runs.
        rf = RandomForestClassifier(n_estimators=200, random_state=1)
        rf = rf.fit(train_df, y_train)

        test_df = (
            x_test.drop(['kalimat'], axis=1)
            .reset_index(drop=True)
            .join(_tfidf_frame(tfidf, x_test['kalimat']))
        )

        score = rf.score(test_df, y_test)
        print(kata, score)

        if score < 0.7:
            bad_models[kata] = (tfidf, rf)
        else:
            good_models[kata] = (tfidf, rf)
        score_sum += score
        scored_count += 1
    except Exception as e:
        # Best effort: report the failing word and continue with the rest.
        print('error :', kata, e)


print('==========')
# Average over the words that were actually scored; failed words must not
# count as zero, and an empty run must not divide by zero.
if scored_count:
    print('avg score', score_sum / scored_count)
else:
    print('avg score', 'n/a (no model could be trained)')