In [1]:
import sys
import numpy as np
import pandas as pd
import os
sys.path.append('..')
print(sys.path)
from nlp_pipeline.feature_extraction import FeatureExtractor
from nlp_pipeline.preprocess_text import TextPreprocessor
from model.evaluate_model import ModelEvaluator
from model.train_baseline import BaselineTrainer
from interpretation.lime_interpreter import LimeTextInterpreter
from interpretation.shap_interpreter import ShapInterpreter

['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/hop/home/muhammad_mufli_ramadhan/tfid/lib/python3.8/site-packages', '..']


  from .autonotebook import tqdm as notebook_tqdm


Load dataset

In [2]:
df = pd.read_csv('../data/features/1.csv', encoding='latin1')
print(df.columns)
df['label'] = df['label'].astype(str)
df['label'].value_counts()
df['label'].value_counts()

Index(['interaction_control', 'target_of_utterance', 'seg_id', 'vid_id',
       'start_time', 'end_time', 'speaker_id', 'role', 'label',
       'transcription', 'duration', 'utterance_type', 'annotation',
       'valid_segment', 'total_words', 'unique_words', 'num_sentences',
       'stopwords', 'num_adjectives', 'num_nouns', 'num_verbs', 'num_adverbs',
       'type_token_ratio', 'avg_words_per_sentence', 'soruce_file'],
      dtype='object')


label
NON ASD    2324
ASD        1244
Name: count, dtype: int64

Preprocessing text

In [3]:
text_preprocessor = TextPreprocessor()
df['clean_text'] = df['transcription'].apply(text_preprocessor.preprocess)
df[['transcription', 'clean_text']].head()

Unnamed: 0,transcription,clean_text
0,Tunjuk kaka coba,tunjuk kaka coba
1,Inii!,ini
2,Siapa namamu?,siapa nama
3,Iyaaaa?,
4,Kenalan duluu!,kenal duluu


Ekstraksi fitur with TFIDF

In [6]:
extractor = FeatureExtractor()
X = extractor.fit_transform_tfidf(df['clean_text'])
y = df['label'].values
print(X.shape, y.shape)

(3568, 4437) (3568,)


Ekstraksi fitur with IndoBERT

In [4]:
extractor = FeatureExtractor()
X = extractor.encode_series_bert(df['clean_text'])
y = df['label'].values
print(X.shape, y.shape)

(3568, 768) (3568,)


SVC WITH 5 FOLD

In [None]:
trainer = BaselineTrainer()

evaluator = ModelEvaluator(model_type='svm')

results = evaluator.cross_validate(X,y)

USE THIS IF YOU NOT USING BERT

In [None]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results.csv")

In [None]:
final_model = BaselineTrainer('svm').train(X,y)
interpreter = ShapInterpreter(model=final_model)
sampled_df = df.sample(n=100, random_state=42)
texts = sampled_df['clean_text'].tolist()
labels = sampled_df['label'].tolist()

shap_values = interpreter.explain(texts)
interpreter.save_shap_explanations_to_csv(
    shap_values=shap_values,
    texts=texts,
    output_path="../reports/shap_results_SVM.csv",
    true_labels=labels
)

LINEAR REGRETION WITH 5 FOLD

In [None]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator()

results = evaluator.cross_validate(X,y)

In [10]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results_LOGRES.csv")

Berhasil menyimpan!


SVM WITH 10 FOLD

In [None]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator(model_type='svm', n_splits= 10, random_state=32)

results = evaluator.cross_validate(X,y)

In [1]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results_SVM10FOLD.csv")

NameError: name 'trainer' is not defined

In [None]:
final_model = BaselineTrainer('svm').train(X,y)
interpreter = ShapInterpreter(model=final_model)
sampled_df = df.sample(n=100, random_state=42)
texts = sampled_df['clean_text'].tolist()
labels = sampled_df['label'].tolist()

shap_values = interpreter.explain(texts)
interpreter.save_shap_explanations_to_csv(
    shap_values=shap_values,
    texts=texts,
    output_path="../reports/shap_results_SVM_10_fold.csv",
    true_labels=labels
)

LINEAR REGRETION WITH 10 FOLD

In [None]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator(n_splits= 10, random_state=32)

results = evaluator.cross_validate(X,y)

In [None]:
final_model = BaselineTrainer('svm').train(X,y)
interpreter = ShapInterpreter(model=final_model)
sampled_df = df.sample(n=100, random_state=42)
texts = sampled_df['clean_text'].tolist()
labels = sampled_df['label'].tolist()

shap_values = interpreter.explain(texts)
interpreter.save_shap_explanations_to_csv(
    shap_values=shap_values,
    texts=texts,
    output_path="../reports/shap_results_Logres_10_fold.csv",
    true_labels=labels
)


In [None]:
interpreter.visualize(shap_values= shap_values)

FUSION

In [4]:
extractor = FeatureExtractor()
X = extractor.extract_fused_features_bert(df)
y = df['label'].values

In [5]:
evaluator = ModelEvaluator(model_type='svm', n_splits= 10, random_state=32)
results = evaluator.cross_validate_with_confusionmatrix(X, y)



 Fold 1
              precision    recall  f1-score   support

         ASD      0.630     0.742     0.681       124
     NON ASD      0.848     0.768     0.806       233

    accuracy                          0.759       357
   macro avg      0.739     0.755     0.744       357
weighted avg      0.773     0.759     0.763       357

Confusion Matrix:
 [[ 92  32]
 [ 54 179]]
Confusion Matrix (Fold 1):
[[ 92  32]
 [ 54 179]]

 Fold 2
              precision    recall  f1-score   support

         ASD      0.684     0.734     0.708       124
     NON ASD      0.853     0.820     0.836       233

    accuracy                          0.790       357
   macro avg      0.768     0.777     0.772       357
weighted avg      0.794     0.790     0.792       357

Confusion Matrix:
 [[ 91  33]
 [ 42 191]]
Confusion Matrix (Fold 2):
[[ 91  33]
 [ 42 191]]

 Fold 3
              precision    recall  f1-score   support

         ASD      0.688     0.694     0.691       124
     NON ASD      0.836   

Testing Model

In [5]:
trainer = BaselineTrainer(model_type='svm')
model = trainer.train(X, y)

new_text = "Apakah kamu suddah makan?"
clean_text = text_preprocessor.preprocess(new_text)
X_new = extractor.encode_series_bert([clean_text])
predicted_label = model.predict(X_new)[0]
print(f"Predicted label: {predicted_label}")

Predicted label: NON ASD


In [9]:
new_text = "Aku takut aku takut aku takut aku takut aku takut aku takut aku takut"
clean_text = text_preprocessor.preprocess(new_text)
X_new = extractor.encode_series_bert([clean_text])
predicted_label = model.predict(X_new)[0]
print(f"Predicted label: {predicted_label}")

Predicted label: ASD
