In [1]:
%load_ext autoreload
%autoreload 2

In this simple notebook I want to explore about simple sentence classification using tf-idf and RandomForest. 

In [111]:
from sklearn.model_selection import train_test_split
from lib.reader import *
from lib.preprocess import *
import nltk
from sklearn.pipeline import *
from sklearn.feature_extraction.text import *
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import *
from functools import partial

In [5]:
data_loc = "../data/id-syntax-sentence-classification.xlsx"
data: pd.DataFrame = load_as_dataframe(data_loc)

In [6]:
data.head()

Unnamed: 0,Kalimat,simple sentence,compound sentence,complex sentence,compound-complex sentence,incomplete sentence,transitive sentence,intransitive sentence,sentence in active voice,sentence in passive voice,Noun-predicate sentence,Adjective-predicate sentence,Prepositional phrase-predicate sentence,Numeral-predicate sentence,declarative sentence,interrogative sentence,imperative sentence,exclamative sentence,inverted sentence,sentences with dislocation
0,Saat ini ia menjabat sebagai Wakil Bupati Band...,True,False,False,False,False,False,True,True,False,False,False,False,False,True,False,False,False,False,False
1,Sahrul mengawali kariernya dari dunia model.,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False
2,Kemudian merambah ke dunia tarik suara dan akt...,False,False,False,False,True,False,True,True,False,False,False,False,False,True,False,False,False,False,False
3,Sinetron yang melambungkan namanya adalah Jin ...,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True
4,Setelah cukup lama membintangi sinetron terseb...,False,False,False,True,False,True,True,True,False,False,False,False,False,True,False,False,False,False,False


At the previous notebook, I have found that there are some classes combination that could have been mislabeled. So in this exploration I should exclude those sentence, especially because the proportion is low.

In [13]:
filtered_data = data[~(
    (data["declarative sentence"] & data["interrogative sentence"]) |
    (data["simple sentence"] & data["complex sentence"]) |
    (data["declarative sentence"] & data["imperative sentence"]) |
    (data["simple sentence"] & data["compound-complex sentence"]) |
    (data["simple sentence"] & data["compound sentence"])
)].copy()

In [92]:
filtered_data.loc[:, "sentences with dislocation"] = filtered_data.loc[:, "sentences with dislocation"].astype(bool)

In [131]:
y_columns = [col for col in data.columns if col != "Kalimat"]

In [93]:
X = filtered_data[["Kalimat"]]
y = filtered_data[y_columns]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=32)

In [157]:
train_count_vectorizer = CountVectorizer()
train_count_vectorizer.fit(X_train["Kalimat"])

test_count_vectorizer = CountVectorizer()
test_count_vectorizer.fit(X_test["Kalimat"])

CountVectorizer()

In [162]:
train_term_set = set(train_count_vectorizer.vocabulary_.keys())
test_term_set = set(test_count_vectorizer.vocabulary_.keys())

In [166]:
all_vocab_len = len(train_term_set.union(test_term_set))
intersected_vocab_len = len(train_term_set.intersection(test_term_set))
difference_vocab_len = len(test_term_set.difference(train_term_set))

In [168]:
pct_intersected = intersected_vocab_len / all_vocab_len
print(f"pct intersected terms between train and test={pct_intersected:.1%}")

pct intersected terms between train and test=38.5%


In [184]:
a = len(test_term_set.difference(train_term_set)) / len(test_term_set)
print(f"pct vocab in test not existed in train={a:.1%}")

pct vocab in test not existed in train=27.9%


In [96]:
p = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2)),
    RandomForestClassifier()
)

In [98]:
p.fit(X_train["Kalimat"], y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(ngram_range=(1, 2))),
                ('randomforestclassifier', RandomForestClassifier())])

In [99]:
train_prediction = p.predict(X_train["Kalimat"])

In [121]:
def to_metrics(y_true, y_pred):
    evaluation_metrics = {
        "Accuracy": accuracy_score,
        "F1 Macro": partial(f1_score, average="macro"),
        "Recall Macro": partial(f1_score, average="macro"),
        "Precission Macro": partial(f1_score, average="macro"),
        "F1 Micro": partial(f1_score, average="micro"),
        "Recall Micro": partial(f1_score, average="micro"),
        "Precission Micro": partial(f1_score, average="micro"),
    }
    
    return {
        metric_name: evaluation_metrics[metric_name](y_true, y_pred)
        for metric_name 
        in evaluation_metrics
    }
    

In [125]:
to_metrics(y_train, train_prediction)

{'Accuracy': 0.9997133027522935,
 'F1 Macro': 0.9997055577181029,
 'Recall Macro': 0.9997055577181029,
 'Precission Macro': 0.9997055577181029,
 'F1 Micro': 0.9999501636294168,
 'Recall Micro': 0.9999501636294168,
 'Precission Micro': 0.9999501636294168}

In [126]:
to_metrics(y_test, test_prediction)

{'Accuracy': 0.19531772575250836,
 'F1 Macro': 0.25362837361609725,
 'Recall Macro': 0.25362837361609725,
 'Precission Macro': 0.25362837361609725,
 'F1 Micro': 0.7232150348183572,
 'Recall Micro': 0.7232150348183572,
 'Precission Micro': 0.7232150348183572}

In [134]:
print(classification_report(y_test, test_prediction, target_names=y_columns))

                                         precision    recall  f1-score   support

                        simple sentence       0.65      0.97      0.78      1800
                      compound sentence       0.00      0.00      0.00       167
                       complex sentence       0.56      0.07      0.13       741
              compound-complex sentence       0.00      0.00      0.00       204
                    incomplete sentence       0.00      0.00      0.00        89
                    transitive sentence       0.71      0.25      0.37      1197
                  intransitive sentence       0.67      0.80      0.73      1658
               sentence in active voice       0.75      0.97      0.85      2015
              sentence in passive voice       0.92      0.05      0.09       747
                Noun-predicate sentence       0.95      0.40      0.56       396
           Adjective-predicate sentence       1.00      0.00      0.01       234
Prepositional phrase-predic

  _warn_prf(average, modifier, msg_start, len(result))


Among the top performing sentence classes:
* `simple sentence`, `intransitive sentence`, `sentence in active voice` have too many false positive, could the model naively label all sentences as those class?
* `sentence in passive voice`, `Noun-predicate sentence`, `Adjective-predicate sentence`, `inverted sentence` have too many false negative, could the model naively label all sentences as those class?

In [135]:
false_positive_oriented = [
    "simple sentence", 
    "intransitive sentence",
    "sentence in active voice"
]

false_negative_oriented = [
    "sentence in passive voice",
    "Noun-predicate sentence",
    "Adjective-predicate sentence",
    "inverted sentence"
]

In [138]:
ctoi = {c: i for i, c in enumerate(y_columns)}

In [152]:
def pct_of(predictions, classes, ctoi, value=True):
    result = {}
    count_all_predictions = float(predictions.shape[0])
    for clz in classes:
        
        slices = predictions[:, ctoi[clz]]
        result[clz] = slices[slices == value].shape[0] / count_all_predictions
    return result

In [155]:
pct_of(test_prediction, false_positive_oriented, ctoi, value=True)

{'simple sentence': 0.8969899665551839, 'intransitive sentence': 0.662876254180602, 'sentence in active voice': 0.8759197324414716}

In [156]:
pct_of(test_prediction, false_negative_oriented, ctoi, value=False)

{'sentence in passive voice': 0.9876254180602007, 'Noun-predicate sentence': 0.9448160535117057, 'Adjective-predicate sentence': 0.9996655518394649, 'inverted sentence': 0.9709030100334448}

It seems like the model will naively guess:
`simple sentence`, `intransitive sentence`, `sentence in active voice` as `True` and
`sentence in passive voice`, `Noun-predicate sentence`, `Adjective-predicate sentence`, `inverted sentence` to be `False`

Of course with tf-idf the model will try to remember the words, which make this model non-ideal. Human will try to see the morphology of words in a sentence before determining what category is the current observed sentence is.


Additionally, there are some "hierarchy" of sentence category that should be independent with each others. Using this knowledge, might improve the model performance. 

Further Improvements:
1. Use available morphological analyzer, such as `MorphInd` and put the morphology structure as features.
2. Use pos tagging as features.