In [1]:
import sys
import numpy as np
import pandas as pd
import os
sys.path.append('..')
print(sys.path)
from nlp_pipeline.feature_extraction import FeatureExtractor
from nlp_pipeline.preprocess_text import TextPreprocessor
from model.evaluate_model import ModelEvaluator
from model.train_baseline import BaselineTrainer
from interpretation.lime_interpreter import LimeTextInterpreter
from interpretation.shap_interpreter import ShapInterpreter

['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/hop/home/muhammad_mufli_ramadhan/tfid/lib/python3.8/site-packages', '..']


  from .autonotebook import tqdm as notebook_tqdm


Load dataset

In [2]:
df = pd.read_csv('../data/features/1.csv', encoding='latin1')
print(df.columns)
df['label'] = df['label'].astype(str)
df['label'].value_counts()
df['label'].value_counts()

Index(['interaction_control', 'target_of_utterance', 'seg_id', 'vid_id',
       'start_time', 'end_time', 'speaker_id', 'role', 'label',
       'transcription', 'duration', 'utterance_type', 'annotation',
       'valid_segment', 'total_words', 'unique_words', 'num_sentences',
       'stopwords', 'num_adjectives', 'num_nouns', 'num_verbs', 'num_adverbs',
       'type_token_ratio', 'avg_words_per_sentence', 'soruce_file'],
      dtype='object')


label
NON ASD    2324
ASD        1244
Name: count, dtype: int64

Preprocessing text

In [3]:
text_preprocessor = TextPreprocessor()
df['clean_text'] = df['transcription'].apply(text_preprocessor.preprocess)
df[['transcription', 'clean_text']].head()

Unnamed: 0,transcription,clean_text
0,Tunjuk kaka coba,tunjuk kaka coba
1,Inii!,ini
2,Siapa namamu?,siapa nama
3,Iyaaaa?,
4,Kenalan duluu!,kenal duluu


Ekstraksi fitur with TFIDF

In [6]:
extractor = FeatureExtractor()
X = extractor.fit_transform_tfidf(df['clean_text'])
y = df['label'].values
print(X.shape, y.shape)

(3568, 4437) (3568,)


Ekstraksi fitur with IndoBERT

In [4]:
extractor = FeatureExtractor()
X = extractor.encode_series_bert(df['clean_text'])
y = df['label'].values
print(X.shape, y.shape)

(3568, 768) (3568,)


SVC WITH 5 FOLD

In [5]:
trainer = BaselineTrainer()

evaluator = ModelEvaluator(model_type='svm')

results = evaluator.cross_validate(X,y)


Fold 1
              precision    recall  f1-score   support

         ASD      0.679     0.723     0.700       249
     NON ASD      0.846     0.817     0.832       465

    accuracy                          0.784       714
   macro avg      0.763     0.770     0.766       714
weighted avg      0.788     0.784     0.786       714

Confusion Matrix:
 [[180  69]
 [ 85 380]]

Fold 2
              precision    recall  f1-score   support

         ASD      0.701     0.715     0.708       249
     NON ASD      0.846     0.837     0.841       465

    accuracy                          0.794       714
   macro avg      0.773     0.776     0.774       714
weighted avg      0.795     0.794     0.795       714

Confusion Matrix:
 [[178  71]
 [ 76 389]]

Fold 3
              precision    recall  f1-score   support

         ASD      0.677     0.767     0.719       249
     NON ASD      0.866     0.804     0.834       465

    accuracy                          0.791       714
   macro avg      0.

In [None]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results.csv")

In [6]:
final_model = BaselineTrainer('svm').train(X,y)
interpreter = ShapInterpreter(model=final_model)
sampled_df = df.sample(n=100, random_state=42)
texts = sampled_df['clean_text'].tolist()
labels = sampled_df['label'].tolist()

shap_values = interpreter.explain(texts)
interpreter.save_shap_explanation_to_csv(shap_values, texts, labels, "../reports/shap_results_SVM.csv")

PartitionExplainer explainer: 101it [00:41,  2.15it/s]                         


AttributeError: 'ShapInterpreter' object has no attribute 'save_shap_explanation_to_csv'

LINEAR REGRETION WITH 5 FOLD

In [9]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator()

results = evaluator.cross_validate(X,y)


Fold 1
              precision    recall  f1-score   support

         ASD      0.792     0.337     0.473       249
     NON ASD      0.729     0.953     0.826       465

    accuracy                          0.738       714
   macro avg      0.761     0.645     0.649       714
weighted avg      0.751     0.738     0.703       714

Confusion Matrix:
 [[ 84 165]
 [ 22 443]]

Fold 2
              precision    recall  f1-score   support

         ASD      0.783     0.289     0.422       249
     NON ASD      0.715     0.957     0.819       465

    accuracy                          0.724       714
   macro avg      0.749     0.623     0.621       714
weighted avg      0.739     0.724     0.680       714

Confusion Matrix:
 [[ 72 177]
 [ 20 445]]

Fold 3
              precision    recall  f1-score   support

         ASD      0.812     0.329     0.469       249
     NON ASD      0.728     0.959     0.827       465

    accuracy                          0.739       714
   macro avg      0.

In [10]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results_LOGRES.csv")

Berhasil menyimpan!


SVM WITH 10 FOLD

In [11]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator(model_type='svm', n_splits= 10, random_state=32)

results = evaluator.cross_validate(X,y)


Fold 1
              precision    recall  f1-score   support

         ASD      0.683     0.798     0.736       124
     NON ASD      0.882     0.803     0.840       233

    accuracy                          0.801       357
   macro avg      0.782     0.800     0.788       357
weighted avg      0.813     0.801     0.804       357

Confusion Matrix:
 [[ 99  25]
 [ 46 187]]

Fold 2
              precision    recall  f1-score   support

         ASD      0.746     0.758     0.752       124
     NON ASD      0.870     0.863     0.866       233

    accuracy                          0.826       357
   macro avg      0.808     0.810     0.809       357
weighted avg      0.827     0.826     0.827       357

Confusion Matrix:
 [[ 94  30]
 [ 32 201]]

Fold 3
              precision    recall  f1-score   support

         ASD      0.689     0.677     0.683       124
     NON ASD      0.830     0.837     0.833       233

    accuracy                          0.782       357
   macro avg      0.

In [12]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results_SVM10FOLD.csv")

Berhasil menyimpan!


LINEAR REGRETION WITH 10 FOLD

In [8]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator(n_splits= 10, random_state=32)

results = evaluator.cross_validate(X,y)


Fold 1
              precision    recall  f1-score   support

         ASD      0.746     0.427     0.544       124
     NON ASD      0.752     0.923     0.829       233

    accuracy                          0.751       357
   macro avg      0.749     0.675     0.686       357
weighted avg      0.750     0.751     0.730       357

Confusion Matrix:
 [[ 53  71]
 [ 18 215]]

Fold 2
              precision    recall  f1-score   support

         ASD      0.717     0.306     0.429       124
     NON ASD      0.717     0.936     0.812       233

    accuracy                          0.717       357
   macro avg      0.717     0.621     0.621       357
weighted avg      0.717     0.717     0.679       357

Confusion Matrix:
 [[ 38  86]
 [ 15 218]]

Fold 3
              precision    recall  f1-score   support

         ASD      0.762     0.258     0.386       124
     NON ASD      0.708     0.957     0.814       233

    accuracy                          0.714       357
   macro avg      0.