In [7]:
#Data Analysis
import pandas as pd
import numpy as np

#Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font_scale=1)
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

#Modeling
import fasttext
from sklearn.model_selection import cross_val_predict, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn_crfsuite import CRF, scorers, metrics
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import classification_report, make_scorer
import scipy.stats
import eli5

In [8]:
path = "C:\\Users\\farih\\Downloads\\"
indoNLUmodel = "C:\\Users\\farih\\Downloads\\fasttext.4B.id.300.epoch5.uncased.bin"
model = fasttext.load_model(indoNLUmodel)



In [5]:
train_df = pd.read_csv("D:\Drive\KULEEAH\Semester 8\TA Farihin\Dataset\\bert\\trainval.csv")
test_df = pd.read_csv("D:\Drive\KULEEAH\Semester 8\TA Farihin\Dataset\\bert\\test.csv")

In [6]:
train_df

Unnamed: 0,sentence_id,words,labels
0,1382601382042103808,Hidup,O
1,1382601382042103808,sesedih,O
2,1382601382042103808,dan,O
3,1382601382042103808,secaper,O
4,1382601382042103808,apa,O
...,...,...,...
136051,1386004972207153156,muntah,O
136052,1386301391086309382,Ngabuburit,O
136053,1386301391086309382,tadi,O
136054,1386301391086309382,sore,O


In [6]:
# A class to retrieve the sentences from the dataset
class getsentence(object):
    
    def __init__(self, data):
        self.n_sent = 1.0
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["words"].values.tolist(),
                                                     s["labels"].values.tolist())]
        self.grouped = self.data.groupby("sentence_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [7]:
getter_train = getsentence(train_df)
getter_test = getsentence(test_df)

sentences_train = getter_train.sentences
sentences_test = getter_test.sentences
#This is how a sentence will look like. 
print(sentences_train[:1])
print(sentences_test[:1])


[[('@ridwankamil', 'O'), ('@KickAndyShow', 'O'), ('@Metro_TV', 'O'), ('Upaya', 'O'), ('untuk', 'O'), ('mnaikan', 'O'), ('nilai', 'O'), ('jual', 'O'), ('yg', 'O'), ('bgus,dngn', 'O'), ('cara', 'O'), ('d', 'O'), ('desain', 'O'), ('pak', 'O'), ('gubernur', 'B-PER'), ('akan', 'O'), ('menjadikn', 'O'), ('produk', 'O'), ('umkm', 'O'), ('d', 'O'), ('mintai', 'O'), ('krena', 'O'), ('yg', 'O'), ('desain', 'O'), ('org', 'O'), ('nomer', 'O'), ('satu', 'O'), ('di', 'O'), ('jabar', 'B-LOC')]]
[[('@GyuuPotter', 'O'), ('iyhh', 'O'), (',', 'O'), ('soalnya', 'O'), ('mwu', 'O'), ('kerjain', 'O'), ('tugas', 'O'), ('jg��', 'O')]]


In [8]:
words = list(set(train_df["words"].values))
n_words = len(words)
print(n_words)

30205


In [9]:
# Feature set
def word2features(sent, i):
    word = sent[i][0]
    # we=get_features(word) 
    # postag = sent[i][1]

    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower() if type(word) == str else word,
        'word[-3:]': word[-3:] if type(word) == str else word,
        'word[-2:]': word[-2:] if type(word) == str else word,
        'word[-1:]': word[-2:] if type(word) == str else word,
        'word[:3]': word[:3] if type(word) == str else word,
        'word[:2]': word[:2] if type(word) == str else word,
        'word[:1]': word[:2] if type(word) == str else word,
        'word.isupper()': word.isupper() if type(word) == str else word,
        'word.istitle()': word.istitle() if type(word) == str else word,
        'word.isdigit()': word.isdigit() if type(word) == str else word,
        # 'postag': postag,
        # 'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        # postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower() if type(word1) == str else word1,
            '-1:word.istitle()': word1.istitle() if type(word1) == str else word1,
            '-1:word.isupper()': word1.isupper() if type(word1) == str else word1,
            # '-1:postag': postag1,
            # '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        # postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower() if type(word1) == str else word1,
            '+1:word.istitle()': word1.istitle() if type(word1) == str else word1,
            '+1:word.isupper()': word1.isupper() if type(word1) == str else word1,
            # '+1:postag': postag1,
            # '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    #src: https://stackoverflow.com/questions/58736548/how-to-use-word-embedding-as-features-for-crf-sklearn-crfsuite-model-training
    # we=model.get_word_vector(features['word.lower()'])
    # for iv,value in enumerate(we):
    #     features['v{}'.format(iv)]=value

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

In [10]:
%%time
#Creating the train and test set
X_train = [sent2features(s) for s in sentences_train]
X_test = [sent2features(s) for s in sentences_test]
y_train = [sent2labels(s) for s in sentences_train]
y_test = [sent2labels(s) for s in sentences_test]

Wall time: 1min 37s


In [11]:
# group B and I results
# #First we select all the tags that are relevant for us i.e. remove the 'O' tag from the list. 

# labels = list(crf2.classes_)
labels = list(train_df['labels'].unique())
labels = list(filter(lambda a: a != 'O', labels))
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(sorted_labels)

['B-EV', 'I-EV', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-PROD', 'I-PROD', 'B-WA', 'I-WA']


In [12]:
#Creating the CRF model
crf = CRF(algorithm='lbfgs',
          c1= 0.05912981852829489, 
          c2= 0.09113455878833218,
          max_iterations=100,
          all_possible_transitions=False)

In [13]:
%%time
pred = cross_val_predict(estimator=crf, X=X_train, y=y_train, cv=5)
print(flat_classification_report(y_pred=pred, y_true=y_train, labels=sorted_labels, digits=3))   

              precision    recall  f1-score   support

        B-EV      0.571     0.241     0.339       652
        I-EV      0.587     0.322     0.416       789
       B-LOC      0.755     0.729     0.741      1805
       I-LOC      0.569     0.541     0.554       790
       B-ORG      0.603     0.413     0.490      1674
       I-ORG      0.502     0.390     0.439       877
       B-PER      0.858     0.715     0.780      4491
       I-PER      0.844     0.818     0.831      2504
      B-PROD      0.860     0.786     0.821      2788
      I-PROD      0.710     0.505     0.590       863
        B-WA      0.400     0.089     0.145       135
        I-WA      0.213     0.043     0.072       231

   micro avg      0.768     0.630     0.693     17599
   macro avg      0.623     0.466     0.518     17599
weighted avg      0.749     0.630     0.679     17599

Wall time: 13min 25s


In [14]:
#Tuning the parameters manually, setting c1 = 10
crf2 = CRF(algorithm='lbfgs',
          c1=1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [15]:
%%time
pred = cross_val_predict(estimator=crf2, X=X_train, y=y_train, cv=5)
print(flat_classification_report(y_pred=pred, y_true=y_train, labels=sorted_labels, digits=3))  

              precision    recall  f1-score   support

        B-EV      0.523     0.224     0.314       652
        I-EV      0.533     0.337     0.413       789
       B-LOC      0.757     0.721     0.739      1805
       I-LOC      0.578     0.563     0.571       790
       B-ORG      0.623     0.400     0.487      1674
       I-ORG      0.510     0.425     0.464       877
       B-PER      0.859     0.714     0.780      4491
       I-PER      0.831     0.826     0.828      2504
      B-PROD      0.851     0.782     0.815      2788
      I-PROD      0.671     0.512     0.581       863
        B-WA      0.174     0.030     0.051       135
        I-WA      0.121     0.048     0.068       231

   micro avg      0.759     0.632     0.690     17599
   macro avg      0.586     0.465     0.509     17599
weighted avg      0.739     0.632     0.677     17599

Wall time: 12min 52s


In [18]:
%%time
#Now we will create the Randomized CV search model wherein we will use a modified F1 scorer model considering only the relevant labels
# define fixed parameters and parameters to search
crf3 = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)   
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=8, #-1
                        n_iter=30,#50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


PicklingError: Could not pickle the task to send it to the workers.

In [28]:
#Lets check the best estimated parameters and CV score
print('Best parameters:', rs.best_params_)
print('Best CV score:', r   s.best_score_)
print('Model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

Best parameters: {'c1': 0.44278845810612677, 'c2': 0.0336690727591398}
Best CV score: 0.6335985928603546
Model size: 1.55M


In [36]:
#Now we create the model again using the best estimators
crf3 = rs.best_estimator_
y_pred = crf3.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

        B-EV      0.804     0.287     0.423       157
        I-EV      0.829     0.360     0.502       189
       B-LOC      0.789     0.719     0.752       420
       I-LOC      0.698     0.581     0.634       167
       B-ORG      0.710     0.448     0.549       411
       I-ORG      0.606     0.393     0.477       247
       B-PER      0.912     0.657     0.764      1073
       I-PER      0.873     0.794     0.832       554
      B-PROD      0.875     0.757     0.812       766
      I-PROD      0.727     0.500     0.592       314
        B-WA      0.600     0.103     0.176        29
        I-WA      0.556     0.250     0.345        40

   micro avg      0.825     0.616     0.705      4367
   macro avg      0.748     0.487     0.571      4367
weighted avg      0.818     0.616     0.695      4367



In [38]:
# %%time
# pred = cross_val_predict(estimator=crf3, X=X_test, y=y_test, cv=5)
# print(flat_classification_report(y_pred=pred, y_true=y_test, labels=sorted_labels, digits=3))

In [33]:
crf3.fit(X_train,y_train)



CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.44278845810612677,
    c2=0.0336690727591398, keep_tempfiles=None, max_iterations=100)

In [34]:
eli5.show_weights(crf3, top=30)

From \ To,O,B-EV,I-EV,B-LOC,I-LOC,B-ORG,I-ORG,B-PER,I-PER,B-PROD,I-PROD,B-WA,I-WA
O,1.96,0.451,-2.71,0.52,-3.08,1.16,-3.274,1.633,-4.03,1.038,-2.679,0.395,-1.264
B-EV,-0.619,2.037,6.31,0.0,0.0,0.254,0.0,0.0,0.0,0.523,0.0,0.0,0.0
I-EV,-0.489,0.488,6.045,-1.726,0.0,-1.197,0.0,-0.603,0.0,0.0,0.0,0.0,0.0
B-LOC,0.704,0.0,0.0,1.909,6.269,0.0,0.0,-1.114,0.0,0.0,0.0,0.0,0.0
I-LOC,-0.326,0.0,0.0,0.351,5.886,0.0,0.0,-0.774,0.0,0.0,0.0,0.0,0.0
B-ORG,0.512,0.888,0.0,-0.409,0.0,0.719,5.632,-0.689,0.0,-0.241,0.0,0.777,0.0
I-ORG,-0.126,0.0,0.0,-1.266,0.0,-0.003,5.481,-0.798,0.0,-0.117,0.0,0.0,0.0
B-PER,0.78,-0.088,0.0,0.0,0.0,0.253,0.0,1.794,5.429,0.011,0.0,0.0,0.0
I-PER,0.365,0.0,0.0,0.0,0.0,0.0,0.0,-0.102,4.459,0.0,0.0,0.0,0.0
B-PROD,0.606,-0.164,0.0,-0.689,0.0,0.0,0.0,0.0,0.0,2.041,5.365,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12
+4.982,+1:word.lower():presisi,,,,,,,,,,,
+4.808,+1:word.lower():menekankan,,,,,,,,,,,
+4.653,+1:word.lower():ppat,,,,,,,,,,,
+4.544,+1:word.lower():khofifah,,,,,,,,,,,
+4.461,bias,,,,,,,,,,,
+4.346,-1:word.lower():parawansa,,,,,,,,,,,
+4.097,-1:word.lower():pembayaran,,,,,,,,,,,
+4.036,+1:word.lower():subian,,,,,,,,,,,
+4.013,word.lower():anda,,,,,,,,,,,
+3.787,-1:word.lower():5-10%,,,,,,,,,,,

Weight?,Feature
+4.982,+1:word.lower():presisi
+4.808,+1:word.lower():menekankan
+4.653,+1:word.lower():ppat
+4.544,+1:word.lower():khofifah
+4.461,bias
+4.346,-1:word.lower():parawansa
+4.097,-1:word.lower():pembayaran
+4.036,+1:word.lower():subian
+4.013,word.lower():anda
+3.787,-1:word.lower():5-10%

Weight?,Feature
+3.879,-1:word.lower():acara
+3.723,+1:word.lower():klean����
+3.574,word.lower():musrenbang
+3.451,word.lower():tragedi
+3.211,+1:word.lower():��wish
+2.988,+1:word.lower():dilaksanakan
+2.928,word[:3]:#Ra
+2.697,+1:word.lower():yakali
+2.507,+1:word.lower():jaga
+2.472,+1:word.lower():pengen

Weight?,Feature
+4.696,-1:word.lower():pandemi
+4.080,word.lower():sale
+3.872,-1:word.lower():piala
+3.769,-1:word.lower():(
+3.109,-1:word.lower():hari
+3.036,word.lower():milenial
+2.428,-1:word.lower():antigen
+2.365,word[:3]:Tah
+2.304,-1:word.lower():pilkada
+2.259,-1:word.lower():tragedi

Weight?,Feature
+6.623,word.lower():medan
+4.772,word.lower():sragen
+4.752,word.lower():karawang
+4.608,word[:3]:se-
+4.425,word.lower():seturan
+4.390,word.lower():tmii
+4.248,word.lower():banyuwangi
+4.227,-1:word.lower():daerah
+4.189,word[-3:]:Kab
+4.044,word.lower():singapore

Weight?,Feature
+4.290,-1:word.lower():.
+3.703,-1:word.lower():(
+3.614,-1:word.lower():selat
+3.568,+1:word.lower():pasifik
+3.509,word.lower():selatan
+3.373,-1:word.lower():gunung
+3.231,-1:word.lower():stasiun
+3.091,-1:word.lower():pulau
+3.041,-1:word.lower():desa
+2.919,-1:word.lower():kota

Weight?,Feature
+5.365,word.lower():persebaya
+4.740,word.lower():persib
+4.437,word.lower():bmkg
+4.094,word.lower():sabhara
+4.076,word[:3]:#EN
+4.001,word.lower():@mitra_kpr_syariah
+3.917,word.lower():paspammenhan
+3.853,word[-3:]:fel
+3.806,word.lower():dreamies
+3.753,word.lower():barcelona

Weight?,Feature
+5.043,-1:word.lower():(
+4.501,+1:word.lower():pakar
+3.588,-1:word.lower():.
+3.282,+1:word.lower():https://t.co/ywplyvh1jp
+3.235,-1:word.lower():partai
+3.215,+1:word.lower():bertemu
+3.082,+1:word.lower():per
+3.031,-1:word.lower():rezim
+2.999,word[:3]:Sul
+2.801,-1:word.lower():harian

Weight?,Feature
+5.760,word[:3]:kiw
+5.008,word.lower():jkw
+4.953,word.lower():hong
+4.701,-1:word.lower():aiptu
+4.571,word.lower():gibran
+4.476,word.lower():bayu
+4.466,word.lower():@santi_nya
+4.456,word.lower():jake
+4.450,word.lower():meng
+4.289,word.lower():nawir

Weight?,Feature
+5.151,word.lower():baswedan
+4.965,-1:word.lower():.
+4.820,-1:word.lower():kepala
+4.442,+1:word.lower():grobogan
+4.399,-1:word.lower():ketua
+4.234,word.lower():soekarnoputri
+3.992,-1:word.lower():wakil
+3.928,-1:word.lower():gub
+3.743,-1:word.lower():cak
+3.645,-1:word.lower():ridwan

Weight?,Feature
+6.920,word.lower():iphone
+6.465,word.lower():tiktok
+6.094,word.lower():canva
+5.836,word.lower():marjan
+5.643,word.lower():ig
+5.393,word.lower():indihome
+5.322,word.lower():facebook
+5.135,word.lower():����ｏｐｐａｂｅｔ����ｏｐｐａｂｅｔ����
+4.893,word.lower():telkomsel
+4.880,+1:word.lower():situs

Weight?,Feature
+6.005,-1:word.lower():+
+3.684,-1:word.lower():vaksin
+3.433,+1:word.lower():sbgai
+3.404,-1:word.lower():redmi
+3.282,word.lower():ratu
+3.160,+1:word.lower():edisi
+3.054,word[:3]:Pre
+3.021,+1:word.lower():prem
+3.001,-1:word.lower():(
+2.989,-1:word.lower():mobile

Weight?,Feature
+3.518,-1:word.lower():nonton
+3.442,-1:word.lower():album
+3.300,word[-3:]:say
+3.029,+1:word.lower():https://t.co/bs28bqaiah
+3.029,word.lower():masjid-agung-purwokerto-hasil-rancangan-ridwan-kamil-mulai-dibangun
+2.705,word.lower():moonwalk
+2.599,word.lower():ic
+2.581,word[:2]:My
+2.581,word[:1]:My
+2.447,-1:word.lower():situs

Weight?,Feature
+3.128,+1:word.lower():tentang
+2.417,+1:word.lower():https://t.co/gh2dkrlajt
+2.322,word.lower():kompas
+2.201,+1:word.lower():mantep
+2.085,word[-3:]:bat
+1.944,-1:word.lower():the
+1.926,+1:word.lower():boa
+1.885,-1:word.lower():kompas
+1.773,+1:word.lower():butter
+1.615,-1:word.lower():bareng
