In [1]:
#Data Analysis
import pandas as pd
import numpy as np

#Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font_scale=1)
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

#Modeling
import fasttext
from sklearn.model_selection import cross_val_predict, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn_crfsuite import CRF, scorers, metrics
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import classification_report, make_scorer
import scipy.stats
import eli5

In [2]:
path = "C:\\Users\\farih\\Downloads\\"
indoNLUmodel = "C:\\Users\\farih\\Downloads\\fasttext.4B.id.300.epoch5.uncased.bin"
model = fasttext.load_model(indoNLUmodel)



In [124]:
data = pd.read_csv("../Dataset/annotated_ner_data_new.csv")

In [125]:
data

Unnamed: 0,token,ner,tweet_id,index
0,Hidup,O,1382601382042103808,0
1,sesedih,O,1382601382042103808,1
2,dan,O,1382601382042103808,2
3,secaper,O,1382601382042103808,3
4,apa,O,1382601382042103808,4
...,...,...,...,...
170065,muntah,O,1386004972207153156,5
170066,Ngabuburit,O,1386301391086309382,0
170067,tadi,O,1386301391086309382,1
170068,sore,O,1386301391086309382,2


In [8]:
# A class to retrieve the sentences from the dataset
class getsentence(object):
    
    def __init__(self, data):
        self.n_sent = 1.0
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["token"].values.tolist(),
                                                     s["ner"].values.tolist())]
        self.grouped = self.data.groupby("tweet_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [9]:
getter = getsentence(data)
sentences = getter.sentences
#This is how a sentence will look like. 
print(sentences[:1])

[[('@ridwankamil', 'O'), ('@KickAndyShow', 'O'), ('@Metro_TV', 'O'), ('Upaya', 'O'), ('untuk', 'O'), ('mnaikan', 'O'), ('nilai', 'O'), ('jual', 'O'), ('yg', 'O'), ('bgus,dngn', 'O'), ('cara', 'O'), ('d', 'O'), ('desain', 'O'), ('pak', 'O'), ('gubernur', 'B-PER'), ('akan', 'O'), ('menjadikn', 'O'), ('produk', 'O'), ('umkm', 'O'), ('d', 'O'), ('mintai', 'O'), ('krena', 'O'), ('yg', 'O'), ('desain', 'O'), ('org', 'O'), ('nomer', 'O'), ('satu', 'O'), ('di', 'O'), ('jabar', 'B-LOC')]]


In [10]:
words = list(set(data["token"].values))
n_words = len(words)
print(n_words)

35418


In [13]:
# Feature set
def word2features(sent, i):
    word = sent[i][0]
    # we=get_features(word) 
    # postag = sent[i][1]

    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower() if type(word) == str else word,
        'word[-3:]': word[-3:] if type(word) == str else word,
        'word[-2:]': word[-2:] if type(word) == str else word,
        'word[-1:]': word[-2:] if type(word) == str else word,
        'word[:3]': word[:3] if type(word) == str else word,
        'word[:2]': word[:2] if type(word) == str else word,
        'word[:1]': word[:2] if type(word) == str else word,
        'word.isupper()': word.isupper() if type(word) == str else word,
        'word.istitle()': word.istitle() if type(word) == str else word,
        'word.isdigit()': word.isdigit() if type(word) == str else word,
        # 'postag': postag,
        # 'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        # postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower() if type(word1) == str else word1,
            '-1:word.istitle()': word1.istitle() if type(word1) == str else word1,
            '-1:word.isupper()': word1.isupper() if type(word1) == str else word1,
            # '-1:postag': postag1,
            # '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        # postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower() if type(word1) == str else word1,
            '+1:word.istitle()': word1.istitle() if type(word1) == str else word1,
            '+1:word.isupper()': word1.isupper() if type(word1) == str else word1,
            # '+1:postag': postag1,
            # '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    #src: https://stackoverflow.com/questions/58736548/how-to-use-word-embedding-as-features-for-crf-sklearn-crfsuite-model-training
    we=model.get_word_vector(features['word.lower()'])
    for iv,value in enumerate(we):
        features['v{}'.format(iv)]=value

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

In [109]:
#Creating the train and test set
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

KeyboardInterrupt: 

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=0.8,
        test_size=0.2,
        # random but same for all run, also accurancy depends on the
        # selection of data e.g. if we put 10 then accuracy will be 1.0
        # in this example
        random_state=23,
        # keep same proportion of 'target' in test and target data
    )

In [20]:
y_train[2]

['O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'B-PER', 'O', 'O', 'O', 'O']

In [1]:
(X[0],y[0])

NameError: name 'X' is not defined

In [27]:
# group B and I results
# #First we select all the tags that are relevant for us i.e. remove the 'O' tag from the list. 

# labels = list(crf2.classes_)
labels = list(data['ner'].value_counts().keys())
labels = list(filter(lambda a: a != 'O', labels))
print(labels)
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

['B-PER', 'B-PROD', 'I-PER', 'B-LOC', 'B-ORG', 'I-PROD', 'I-ORG', 'I-EV', 'I-LOC', 'B-EV', 'I-WA', 'B-WA']


In [31]:
#Creating the CRF model
crf = CRF(algorithm='lbfgs',
          c1= 0.05912981852829489, 
          c2= 0.09113455878833218,
          max_iterations=100,
          all_possible_transitions=False)

In [32]:
%%time
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
print(flat_classification_report(y_pred=pred, y_true=y, labels=sorted_labels, digits=3))   

              precision    recall  f1-score   support

        B-EV      0.604     0.262     0.366       809
        I-EV      0.579     0.340     0.429       978
       B-LOC      0.765     0.736     0.750      2225
       I-LOC      0.601     0.541     0.570       957
       B-ORG      0.630     0.447     0.523      2085
       I-ORG      0.547     0.449     0.493      1124
       B-PER      0.858     0.718     0.782      5564
       I-PER      0.852     0.826     0.838      3058
      B-PROD      0.861     0.788     0.823      3554
      I-PROD      0.689     0.506     0.583      1177
        B-WA      0.382     0.079     0.131       164
        I-WA      0.170     0.033     0.056       271

   micro avg      0.775     0.641     0.702     21966
   macro avg      0.628     0.477     0.529     21966
weighted avg      0.757     0.641     0.689     21966

Wall time: 18min 5s


In [18]:
#Tuning the parameters manually, setting c1 = 10
crf2 = CRF(algorithm='lbfgs',
          c1=1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [19]:
%%time
pred = cross_val_predict(estimator=crf2, X=X, y=y, cv=5)
print(flat_classification_report(y_pred=pred, y_true=y, labels=sorted_labels, digits=3))  

              precision    recall  f1-score   support

        B-EV      0.677     0.208     0.318       809
        I-EV      0.594     0.272     0.373       978
       B-LOC      0.769     0.633     0.694      2225
       I-LOC      0.585     0.471     0.522       957
       B-ORG      0.663     0.356     0.464      2085
       I-ORG      0.559     0.391     0.460      1124
       B-PER      0.899     0.634     0.744      5564
       I-PER      0.858     0.750     0.801      3058
      B-PROD      0.881     0.730     0.798      3554
      I-PROD      0.716     0.449     0.552      1177
        B-WA      0.286     0.012     0.023       164
        I-WA      0.190     0.015     0.027       271

   micro avg      0.801     0.566     0.663     21966
   macro avg      0.640     0.410     0.481     21966
weighted avg      0.779     0.566     0.648     21966

Wall time: 2min 39s


In [20]:
%%time
#Now we will create the Randomized CV search model wherein we will use a modified F1 scorer model considering only the relevant labels
# define fixed parameters and parameters to search
crf3 = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)   
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X, y)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 43.4min finished
Wall time: 44min 3s


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=False, c1=0.1, c2=0.1,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B9F30DC48>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B9F30D888>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-PER', 'B-PROD', 'I-PER', 'B-LOC', 'B-ORG', 'I-PROD', 'I-ORG', 'I-EV', 'I-LOC', 'B-EV', 'I-WA', 'B-WA']),
                   verbose=1)

In [21]:
#Lets check the best estimated parameters and CV score
print('Best parameters:', rs.best_params_)
print('Best CV score:', rs.best_score_)
print('Model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

Best parameters: {'c1': 0.05912981852829489, 'c2': 0.09113455878833218}
Best CV score: 0.6479905267319664
Model size: 3.65M


In [22]:
#Now we create the model again using the best estimators
crf3 = rs.best_estimator_
y_pred = crf3.predict(X)
print(metrics.flat_classification_report(y, y_pred, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

        B-EV      0.916     0.637     0.751       809
        I-EV      0.914     0.878     0.896       978
       B-LOC      0.954     0.967     0.961      2225
       I-LOC      0.942     0.964     0.953       957
       B-ORG      0.957     0.888     0.921      2085
       I-ORG      0.958     0.944     0.951      1124
       B-PER      0.980     0.967     0.974      5564
       I-PER      0.968     0.986     0.977      3058
      B-PROD      0.957     0.970     0.964      3554
      I-PROD      0.952     0.934     0.943      1177
        B-WA      0.949     0.902     0.925       164
        I-WA      0.911     0.908     0.909       271

   micro avg      0.960     0.942     0.951     21966
   macro avg      0.946     0.912     0.927     21966
weighted avg      0.959     0.942     0.950     21966



In [23]:
%%time
pred = cross_val_predict(estimator=crf3, X=X, y=y, cv=5)
print(flat_classification_report(y_pred=pred, y_true=y, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

        B-EV      0.608     0.250     0.354       809
        I-EV      0.580     0.324     0.416       978
       B-LOC      0.768     0.649     0.703      2225
       I-LOC      0.631     0.460     0.532       957
       B-ORG      0.646     0.398     0.493      2085
       I-ORG      0.564     0.382     0.455      1124
       B-PER      0.884     0.652     0.750      5564
       I-PER      0.868     0.751     0.806      3058
      B-PROD      0.880     0.750     0.810      3554
      I-PROD      0.721     0.455     0.558      1177
        B-WA      0.562     0.055     0.100       164
        I-WA      0.292     0.026     0.047       271

   micro avg      0.797     0.583     0.673     21966
   macro avg      0.667     0.429     0.502     21966
weighted avg      0.777     0.583     0.660     21966

Wall time: 2min 29s


In [24]:
crf3.fit(X,y)

CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.05912981852829489,
    c2=0.09113455878833218, keep_tempfiles=None, max_iterations=100)

In [25]:
eli5.show_weights(crf3, top=30)

From \ To,O,B-EV,I-EV,B-LOC,I-LOC,B-ORG,I-ORG,B-PER,I-PER,B-PROD,I-PROD,B-WA,I-WA
O,2.474,0.339,-2.795,0.677,-2.823,1.335,-2.491,1.924,-3.902,1.097,-2.649,0.854,-2.339
B-EV,-1.061,1.556,5.686,-1.514,0.0,0.0,0.0,-0.484,0.0,0.302,0.0,0.0,0.0
I-EV,-0.562,0.251,5.676,-1.48,0.0,-1.479,0.0,-0.695,0.0,0.0,0.0,0.0,0.0
B-LOC,0.722,0.0,0.0,1.543,6.256,-0.071,0.0,-1.351,0.0,0.0,0.0,0.104,0.0
I-LOC,-0.249,0.0,0.0,0.329,6.094,0.0,0.0,-0.69,0.0,-0.176,0.0,0.0,0.0
B-ORG,0.445,0.639,0.0,-0.676,0.0,0.655,6.153,-0.531,0.0,-0.342,0.0,1.316,0.0
I-ORG,0.387,0.0,0.0,-1.941,0.0,-0.174,6.541,-0.67,0.0,0.268,0.0,0.0,0.0
B-PER,0.575,-0.758,0.0,-0.263,0.0,0.007,0.0,1.606,5.129,-0.338,0.0,0.0,0.0
I-PER,0.437,0.0,0.0,-0.464,0.0,-0.005,0.0,-0.174,4.28,0.0,0.0,0.0,0.0
B-PROD,0.564,-0.317,0.0,-0.963,0.0,-0.003,0.0,-0.328,0.0,1.774,5.33,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12
+5.108,+1:word.lower():khofifah,,,,,,,,,,,
+4.228,bias,,,,,,,,,,,
+4.004,-1:word.lower():parawansa,,,,,,,,,,,
+3.904,+1:word.lower():ppat,,,,,,,,,,,
+3.787,-1:word.lower():sigit,,,,,,,,,,,
+3.645,-1:word.lower():5-10%,,,,,,,,,,,
+3.611,+1:word.lower():@ganjarpranowo,,,,,,,,,,,
+3.444,-1:word.lower():#bukabersamaonthescreen,,,,,,,,,,,
+3.435,-1:word.lower():pengasih,,,,,,,,,,,
+3.368,+1:word.lower():subian,,,,,,,,,,,

Weight?,Feature
+5.108,+1:word.lower():khofifah
+4.228,bias
+4.004,-1:word.lower():parawansa
+3.904,+1:word.lower():ppat
+3.787,-1:word.lower():sigit
+3.645,-1:word.lower():5-10%
+3.611,+1:word.lower():@ganjarpranowo
+3.444,-1:word.lower():#bukabersamaonthescreen
+3.435,-1:word.lower():pengasih
+3.368,+1:word.lower():subian

Weight?,Feature
+3.666,+1:word.lower():klean����
+3.362,-1:word.lower():acara
+2.913,-1:word.lower():ibadah
+2.753,+1:word.lower():bismillah
+2.659,-1:word.lower():peringati
+2.571,+1:word.lower():kali
+2.482,-1:word.lower():memperingati
+2.330,word.lower():musrenbang
+2.311,+1:word.lower():yakali
+2.271,word.lower():#panggungkahanan

Weight?,Feature
+3.476,+1:word.lower():kesiapan
+3.257,-1:word.lower():piala
+3.232,-1:word.lower():pandemi
+3.109,-1:word.lower():(
+2.908,-1:word.lower():gempa
+2.310,+1:word.lower():sbg
+2.271,-1:word.lower():hari
+2.241,word.lower():sale
+2.109,+1:word.lower():bagi
+2.106,"-1:word.lower():,"

Weight?,Feature
+4.793,word.lower():medan
+3.690,word.lower():seturan
+3.601,-1:word.lower():desa
+3.595,word.lower():banyuwangi
+3.419,word.lower():kediri
+3.269,"-1:word.lower():banyumas,jawa"
+3.192,-1:word.lower():daerah
+3.132,word.lower():surabaya
+3.052,word.lower():bekasi
+3.031,-1:word.lower():di

Weight?,Feature
+4.275,-1:word.lower():.
+3.207,-1:word.lower():desa
+3.136,-1:word.lower():pulau
+3.076,word.lower():selatan
+2.946,+1:word.lower():pasifik
+2.776,-1:word.lower():(
+2.766,-1:word.lower():kecamatan
+2.729,-1:word.lower():kantor
+2.683,-1:word.lower():pasar
+2.501,-1:word.lower():kota

Weight?,Feature
+4.083,word.lower():persebaya
+3.691,word.lower():persib
+3.574,+1:word.lower():giddy
+3.347,word.lower():bangtan
+3.278,word.lower():@farmaku
+3.268,+1:word.lower():jinie
+3.228,word[:3]:Kem
+3.193,+1:word.lower():egp
+3.139,-1:word.lower():demokrasi.dikibulin
+3.137,+1:word.lower():lirik

Weight?,Feature
+4.535,-1:word.lower():(
+4.004,+1:word.lower():pakar
+3.727,-1:word.lower():.
+3.362,word.lower():informatika
+2.957,+1:word.lower():statistik
+2.892,+1:word.lower():bertemu
+2.845,+1:word.lower():per
+2.691,-1:word.lower():partai
+2.669,+1:word.lower():https://t.co/ywplyvh1jp
+2.474,+1:word.lower():hormat

Weight?,Feature
+3.938,-1:word.lower():kamil-menkes
+3.909,word[:3]:hae
+3.802,-1:word.lower():madiun-gubernur
+3.801,word.lower():ganjar
+3.775,-1:word.lower():malang
+3.722,-1:word.lower():pks
+3.713,-1:word.lower():timur
+3.661,word.lower():gibran
+3.623,word.lower():puan
+3.503,word.lower():@santi_nya

Weight?,Feature
+4.865,-1:word.lower():.
+4.274,-1:word.lower():wakil
+4.056,word.lower():baswedan
+4.030,+1:word.lower():grobogan
+3.328,-1:word.lower():ketua
+3.282,word.lower():pandjaitan
+3.204,-1:word.lower():(
+2.960,-1:word.lower():gub
+2.942,-1:word.lower():sekjen
+2.927,-1:word.lower():kepala

Weight?,Feature
+4.528,-1:word.lower():wetv
+4.084,word.lower():samsung
+3.991,+1:word.lower():pilihanku
+3.968,word.lower():tiktok
+3.873,word.lower():����ｏｐｐａｂｅｔ����ｏｐｐａｂｅｔ����
+3.792,word.lower():linkaja
+3.636,word.lower():iphone
+3.524,word.lower():lazada
+3.442,word.lower():dana
+3.426,word.lower():canva

Weight?,Feature
+5.212,-1:word.lower():+
+3.296,+1:word.lower():1oo
+3.061,-1:word.lower():)
+2.965,+1:word.lower():✨1
+2.958,-1:word.lower():(
+2.905,-1:word.lower():vaksin
+2.811,"-1:word.lower():,"
+2.574,-1:word.lower():kri
+2.553,+1:word.lower():sbgai
+2.487,-1:word.lower():kartu

Weight?,Feature
+3.126,-1:word.lower():nonton
+2.622,word.lower():masjid-agung-purwokerto-hasil-rancangan-ridwan-kamil-mulai-dibangun
+2.622,+1:word.lower():https://t.co/bs28bqaiah
+2.234,word[-3:]:say
+1.933,word[-3:]:gun
+1.919,+1:word.lower():dari
+1.899,word.lower():daechwita
+1.878,-1:word.lower():awak
+1.853,+1:word.lower():mencapai
+1.782,-1:word.lower():bismillahhirrahmanirrahim

Weight?,Feature
+2.968,+1:word.lower():https://t.co/gh2dkrlajt
+1.852,+1:word.lower():podcast
+1.817,+1:word.lower():mantep
+1.602,word.lower():kompas
+1.506,-1:word.lower():?
+1.480,+1:word.lower():tentang
+1.456,-1:word.lower():the
+1.441,+1:word.lower():boa
+1.421,-1:word.lower():jalan
+1.338,-1:word.lower():kompas
