In [2]:
import sys
import numpy as np
import pandas as pd
import os
sys.path.append('..')
print(sys.path)
from nlp_pipeline.feature_extraction import FeatureExtractor
from nlp_pipeline.preprocess_text import TextPreprocessor
from model.evaluate_model import ModelEvaluator
from model.train_baseline import BaselineTrainer
from interpretation.lime_interpreter import LimeTextInterpreter
from interpretation.shap_interpreter import ShapInterpreter

['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/hop/home/muhammad_mufli_ramadhan/tfid/lib/python3.8/site-packages', '..']


  from .autonotebook import tqdm as notebook_tqdm


Load dataset

In [5]:
df = pd.read_csv('../data/features/1.csv', encoding='latin1')
print(df.columns)
df['label'] = df['label'].astype(str)
df['label'].value_counts()
df['label'].value_counts()

Index(['interaction_control', 'target_of_utterance', 'seg_id', 'vid_id',
       'start_time', 'end_time', 'speaker_id', 'role', 'label',
       'transcription', 'duration', 'utterance_type', 'annotation',
       'valid_segment', 'total_words', 'unique_words', 'num_sentences',
       'stopwords', 'num_adjectives', 'num_nouns', 'num_verbs', 'num_adverbs',
       'type_token_ratio', 'avg_words_per_sentence', 'soruce_file'],
      dtype='object')


label
NON ASD    2324
ASD        1244
Name: count, dtype: int64

Preprocessing text

In [7]:
text_preprocessor = TextPreprocessor()
df['clean_text'] = df['transcription'].apply(text_preprocessor.preprocess)
df[['transcription', 'clean_text']].head()

Unnamed: 0,transcription,clean_text
0,Tunjuk kaka coba,tunjuk kaka coba
1,Inii!,ini
2,Siapa namamu?,siapa nama
3,Iyaaaa?,
4,Kenalan duluu!,kenal duluu


Ekstraksi fitur with TFIDF

In [6]:
extractor = FeatureExtractor()
X = extractor.fit_transform_tfidf(df['clean_text'])
y = df['label'].values
print(X.shape, y.shape)

(3568, 4437) (3568,)


Ekstraksi fitur with IndoBERT

In [7]:
extractor = FeatureExtractor()
X = extractor.encode_series_bert(df['clean_text'])
y = df['label'].values
print(X.shape, y.shape)

(3568, 768) (3568,)


SVC WITH 5 FOLD

In [None]:
trainer = BaselineTrainer()

evaluator = ModelEvaluator(model_type='svm')

results = evaluator.cross_validate(X,y)

USE THIS IF YOU NOT USING BERT

In [None]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results.csv")

In [12]:
final_model = BaselineTrainer('svm').train(X,y)
interpreter = ShapInterpreter(model=final_model)
sampled_df = df.sample(n=100, random_state=42)
texts = sampled_df['clean_text'].tolist()
labels = sampled_df['label'].tolist()

shap_values = interpreter.explain(texts)
interpreter.save_shap_explanations_to_csv(
    shap_values=shap_values,
    texts=texts,
    output_path="../reports/shap_results_SVM.csv",
    true_labels=labels
)

PartitionExplainer explainer: 101it [00:33,  2.06it/s]                         


✅ SHAP explanations saved to: ../reports/shap_results_SVM.csv


LINEAR REGRETION WITH 5 FOLD

In [None]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator()

results = evaluator.cross_validate(X,y)

In [10]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results_LOGRES.csv")

Berhasil menyimpan!


SVM WITH 10 FOLD

In [None]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator(model_type='svm', n_splits= 10, random_state=32)

results = evaluator.cross_validate(X,y)

In [1]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results_SVM10FOLD.csv")

NameError: name 'trainer' is not defined

In [None]:
final_model = BaselineTrainer('svm').train(X,y)
interpreter = ShapInterpreter(model=final_model)
sampled_df = df.sample(n=100, random_state=42)
texts = sampled_df['clean_text'].tolist()
labels = sampled_df['label'].tolist()

shap_values = interpreter.explain(texts)
interpreter.save_shap_explanations_to_csv(
    shap_values=shap_values,
    texts=texts,
    output_path="../reports/shap_results_SVM_10_fold.csv",
    true_labels=labels
)

LINEAR REGRETION WITH 10 FOLD

In [None]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator(n_splits= 10, random_state=32)

results = evaluator.cross_validate(X,y)

In [None]:
final_model = BaselineTrainer('svm').train(X,y)
interpreter = ShapInterpreter(model=final_model)
sampled_df = df.sample(n=100, random_state=42)
texts = sampled_df['clean_text'].tolist()
labels = sampled_df['label'].tolist()

shap_values = interpreter.explain(texts)
interpreter.save_shap_explanations_to_csv(
    shap_values=shap_values,
    texts=texts,
    output_path="../reports/shap_results_Logres_10_fold.csv",
    true_labels=labels
)


PartitionExplainer explainer: 101it [00:34,  2.03it/s]                         


✅ SHAP explanations saved to: ../reports/shap_results_Logres_10_fold.csv


In [None]:
interpreter.visualize(shap_values= shap_values)

FUSION

In [8]:
extractor = FeatureExtractor()
X = extractor.extract_fused_features_bert(df)
y = df['label'].values

SVM WITH 5 FOLD

In [9]:
trainer = BaselineTrainer()

evaluator = ModelEvaluator(model_type='svm')

results = evaluator.cross_validate(X,y)


Fold 1
              precision    recall  f1-score   support

         ASD      0.672     0.723     0.696       249
     NON ASD      0.845     0.811     0.828       465

    accuracy                          0.780       714
   macro avg      0.758     0.767     0.762       714
weighted avg      0.785     0.780     0.782       714

Confusion Matrix:
 [[180  69]
 [ 88 377]]

Fold 2
              precision    recall  f1-score   support

         ASD      0.711     0.711     0.711       249
     NON ASD      0.845     0.845     0.845       465

    accuracy                          0.798       714
   macro avg      0.778     0.778     0.778       714
weighted avg      0.798     0.798     0.798       714

Confusion Matrix:
 [[177  72]
 [ 72 393]]

Fold 3
              precision    recall  f1-score   support

         ASD      0.666     0.775     0.716       249
     NON ASD      0.868     0.791     0.828       465

    accuracy                          0.786       714
   macro avg      0.