In [1]:
import sys
import numpy as np
import pandas as pd
import os
import matplotlib as plt
sys.path.append('..')
print(sys.path)
from nlp_pipeline.feature_extraction import FeatureExtractor
from nlp_pipeline.preprocess_text import TextPreprocessor
from model.evaluate_model import ModelEvaluator
from model.train_baseline import BaselineTrainer
from interpretation.lime_interpreter import LimeTextInterpreter
from interpretation.shap_interpreter import ShapInterpreter
from nlp_pipeline.back_translator import BackTranslationAugmentor
from nlp_pipeline.embedding_oversample import EmbeddingOversampler

['d:\\aco\\research\\Asd-classification\\notebook', 'c:\\Users\\MufliDevs\\anaconda3\\python312.zip', 'c:\\Users\\MufliDevs\\anaconda3\\DLLs', 'c:\\Users\\MufliDevs\\anaconda3\\Lib', 'c:\\Users\\MufliDevs\\anaconda3', '', 'c:\\Users\\MufliDevs\\anaconda3\\Lib\\site-packages', 'c:\\Users\\MufliDevs\\anaconda3\\Lib\\site-packages\\win32', 'c:\\Users\\MufliDevs\\anaconda3\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\MufliDevs\\anaconda3\\Lib\\site-packages\\Pythonwin', 'c:\\Users\\MufliDevs\\anaconda3\\Lib\\site-packages\\setuptools\\_vendor', '..']


Load dataset + Augmented

In [None]:
df = df = pd.read_csv('../data/feature/merged_final.csv', encoding='latin1')
columns = ['label', 'transcription', 'total_words', 'unique_words', 'num_sentences',
                   'stopwords', 'num_adjectives', 'num_nouns', 'num_verbs', 'num_adverbs',
                   'type_token_ratio', 'avg_words_per_sentence']
df = df[columns]
augmentor = BackTranslationAugmentor()
df_augmented = augmentor.augment_dataframe(df)
print(df_augmented.head)
df_augmented.to_csv('../data/feature/1_augmented.csv', index=False)

Preprocessing text

In [2]:
df = df = pd.read_csv('../data/feature/combined_augmented_asd.csv', encoding='latin1')
text_preprocessor = TextPreprocessor()
df['clean_text'] = df['transcription'].apply(text_preprocessor.preprocess)
df[['transcription', 'clean_text']].head()

Unnamed: 0,transcription,clean_text
0,Tunjuk kaka coba,tunjuk kaka coba
1,Inii!,ini
2,Siapa namamu?,siapa nama
3,Iyaaaa?,
4,Kenalan duluu!,kenal duluu


In [3]:
df['label'].value_counts()
df['label'].value_counts()

label
NON ASD    4648
ASD        3732
Name: count, dtype: int64

Ekstraksi fitur with TFIDF

In [None]:
extractor = FeatureExtractor()
X = extractor.fit_transform_tfidf(df['clean_text'])
y = df['label'].values
print(X.shape, y.shape)

Oversampling with TFIDF

In [None]:
X_dense = extractor.fit_transform_tfidf(df['clean_text']).toarray()

ling_cols = extractor.linguistic_cols
X_ling = df[ling_cols].values

y = (df['label'].str.upper().str.strip() == 'ASD').astype(int).values

oversampler = EmbeddingOversampler()
X_embed_bal, X_ling_bal, y_bal = oversampler.oversample(X_dense, X_ling, y)


Ekstraksi fitur with IndoBERT

In [5]:
extractor = FeatureExtractor()
X = extractor.encode_series_bert(df['clean_text'])
y = (df['label'].str.upper().str.strip() == 'ASD').astype(int).values
ling_cols = extractor.linguistic_cols
X_ling = df[ling_cols].values
oversampler = EmbeddingOversampler()
X_embed_bal, X_ling_bal, y_bal = oversampler.oversample(X, X_ling, y)

print(X_embed_bal.shape, y.shape)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

(9296, 768) (8380,)


SVC WITH 5 FOLD

In [6]:
trainer = BaselineTrainer()

evaluator = ModelEvaluator(model_type='svm',pos_label=1)

results = evaluator.cross_validate_oversample_with_confusionmatrix(X_embed_bal,y_bal)


[Oversampled] Fold 1
              precision    recall  f1-score   support

           0      0.911     0.817     0.862       930
           1      0.834     0.920     0.875       930

    accuracy                          0.869      1860
   macro avg      0.873     0.869     0.868      1860
weighted avg      0.873     0.869     0.868      1860

Confusion Matrix:
 [[760 170]
 [ 74 856]]
Confusion Matrix (Fold 1):
[[760 170]
 [ 74 856]]

[Oversampled] Fold 2
              precision    recall  f1-score   support

           0      0.918     0.817     0.865       930
           1      0.835     0.927     0.879       929

    accuracy                          0.872      1859
   macro avg      0.876     0.872     0.872      1859
weighted avg      0.877     0.872     0.872      1859

Confusion Matrix:
 [[760 170]
 [ 68 861]]
Confusion Matrix (Fold 2):
[[760 170]
 [ 68 861]]

[Oversampled] Fold 3
              precision    recall  f1-score   support

           0      0.913     0.824     0.8

USE THIS IF YOU NOT USING BERT

In [None]:
final_model = trainer.train(X_embed_bal, y_bal)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results.csv")

In [None]:
final_model = BaselineTrainer('svm').train(X,y)
interpreter = ShapInterpreter(model=final_model)
sampled_df = df.sample(n=100, random_state=42)
texts = sampled_df['clean_text'].tolist()
labels = sampled_df['label'].tolist()

shap_values = interpreter.explain(texts)
interpreter.save_shap_explanations_to_csv(
    shap_values=shap_values,
    texts=texts,
    output_path="../reports/shap_results_SVM.csv",
    true_labels=labels
)

LINEAR REGRETION WITH 5 FOLD

In [7]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator(pos_label=1)

results = evaluator.cross_validate_oversample_with_confusionmatrix(X_embed_bal,y_bal)


[Oversampled] Fold 1
              precision    recall  f1-score   support

           0      0.893     0.824     0.857       930
           1      0.836     0.901     0.867       930

    accuracy                          0.862      1860
   macro avg      0.865     0.862     0.862      1860
weighted avg      0.865     0.862     0.862      1860

Confusion Matrix:
 [[766 164]
 [ 92 838]]
Confusion Matrix (Fold 1):
[[766 164]
 [ 92 838]]

[Oversampled] Fold 2
              precision    recall  f1-score   support

           0      0.909     0.819     0.862       930
           1      0.835     0.918     0.875       929

    accuracy                          0.869      1859
   macro avg      0.872     0.869     0.868      1859
weighted avg      0.872     0.869     0.868      1859

Confusion Matrix:
 [[762 168]
 [ 76 853]]
Confusion Matrix (Fold 2):
[[762 168]
 [ 76 853]]

[Oversampled] Fold 3
              precision    recall  f1-score   support

           0      0.895     0.828     0.8

In [None]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results_LOGRES.csv")

SVM WITH 10 FOLD

In [8]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator(model_type='svm', n_splits= 10, random_state=32,pos_label=1)

results = evaluator.cross_validate_oversample_with_confusionmatrix(X_embed_bal,y_bal)


[Oversampled] Fold 1
              precision    recall  f1-score   support

           0      0.928     0.834     0.879       465
           1      0.850     0.935     0.890       465

    accuracy                          0.885       930
   macro avg      0.889     0.885     0.885       930
weighted avg      0.889     0.885     0.885       930

Confusion Matrix:
 [[388  77]
 [ 30 435]]
Confusion Matrix (Fold 1):
[[388  77]
 [ 30 435]]

[Oversampled] Fold 2
              precision    recall  f1-score   support

           0      0.933     0.804     0.864       465
           1      0.828     0.942     0.881       465

    accuracy                          0.873       930
   macro avg      0.880     0.873     0.873       930
weighted avg      0.880     0.873     0.873       930

Confusion Matrix:
 [[374  91]
 [ 27 438]]
Confusion Matrix (Fold 2):
[[374  91]
 [ 27 438]]

[Oversampled] Fold 3
              precision    recall  f1-score   support

           0      0.921     0.847     0.8

In [None]:
final_model = trainer.train(X, y)

lime_interpreter = LimeTextInterpreter(
    model=final_model,
    vectorizer=extractor.get_tfidf_vectorizer(),
    class_names=['NON ASD', 'ASD']
)

asd_samples = df[df['label'] == 'ASD'].sample(n=12, random_state=42)
non_asd_samples = df[df['label'] == 'NON ASD'].sample(n=12, random_state=42)
lime_samples = pd.concat([asd_samples, non_asd_samples]).reset_index(drop=True)

lime_interpreter.save_lime_explanation_to_csv(lime_samples,"../reports/lime_results_SVM10FOLD.csv")

In [None]:
final_model = BaselineTrainer('svm').train(X,y)
interpreter = ShapInterpreter(model=final_model)
sampled_df = df.sample(n=100, random_state=42)
texts = sampled_df['clean_text'].tolist()
labels = sampled_df['label'].tolist()

shap_values = interpreter.explain(texts)
interpreter.save_shap_explanations_to_csv(
    shap_values=shap_values,
    texts=texts,
    output_path="../reports/shap_results_SVM_10_fold.csv",
    true_labels=labels
)

LINEAR REGRETION WITH 10 FOLD

In [None]:
trainer = BaselineTrainer()
evaluator = ModelEvaluator(n_splits= 10, random_state=32,pos_label=1)

results = evaluator.cross_validate_oversample_with_confusionmatrix(X_embed_bal,y_bal)

In [None]:
final_model = BaselineTrainer('svm').train(X,y)
interpreter = ShapInterpreter(model=final_model)
sampled_df = df.sample(n=100, random_state=42)
texts = sampled_df['clean_text'].tolist()
labels = sampled_df['label'].tolist()

shap_values = interpreter.explain(texts)
interpreter.save_shap_explanations_to_csv(
    shap_values=shap_values,
    texts=texts,
    output_path="../reports/shap_results_Logres_10_fold.csv",
    true_labels=labels
)


In [None]:
interpreter.visualize(shap_values= shap_values)

FUSION

In [None]:
extractor = FeatureExtractor()
X = extractor.extract_fused_features_bert(df)
y = df['label'].values

In [None]:
evaluator = ModelEvaluator(model_type='svm', n_splits= 10, random_state=32)
results = evaluator.cross_validate_with_confusionmatrix(X, y)


Testing Model

In [9]:
trainer = BaselineTrainer(model_type='svm')
model = trainer.train(X_embed_bal, y_bal)

new_text = "Apakah kamu suddah makan?"
clean_text = text_preprocessor.preprocess(new_text)
X_new = extractor.encode_series_bert([clean_text])
predicted_label = model.predict(X_new)[0]
print(f"Predicted label: {predicted_label}")

Predicted label: 1


In [15]:
new_text = "Apakah kamu sudah makan?"
clean_text = text_preprocessor.preprocess(new_text)
X_new = extractor.encode_series_bert([clean_text])
predicted_label = model.predict(X_new)[0]
print(f"Predicted label: {predicted_label}")

Predicted label: 0
