In [1]:
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import (
    GridSearchCV,  # systematic hyperparameter search with built-in cross-validation
    StratifiedKFold,  # fold splitter used by the stratified cross-validation cell
    cross_validate,  # cross-validation that reports several evaluation metrics at once
)

# Ignore every warning raised from this point on (no category restriction is given).
# NOTE(review): this also hides deprecation messages from the libraries used below —
# narrow the filter if those should stay visible.
warnings.simplefilter("ignore")


## **Modeling Data Dataset 2 (Tanpa Augmentasi)**

- ### **_Stemmed Data_**


In [2]:
# Load the stemmed, non-augmented feature table; fields in the file are tab-delimited.
df_noAug_stem = pd.read_csv(
    filepath_or_buffer="../dataset/INA_TweetsPPKM_TFRF_DS2.csv",
    delimiter="\t",
)
df_noAug_stem.shape  # last expression of the cell, so the (rows, columns) tuple is displayed


(5938, 11258)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df_noAug_stem.head(n=5)

Unnamed: 0,stimulasi,optimal,laku,online,penuh,butuh,sebut,tetap,operasi,sesuai,...,wawancara,koyak,distancing,phisical,kunyuk,perubahanperubahan,acau,siihh,disiniii,sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [5]:
# Split the table into the label vector and the feature matrix.
y = df_noAug_stem.loc[:, "sentiment"]  # target: the "sentiment" column
X = df_noAug_stem.drop(columns="sentiment")  # features: every column except the target
print(X.shape, y.shape)  # both shapes, to confirm the row counts match


(5938, 11257) (5938,)


- **5 Fold**

In [9]:
# Base classifier: settings shared by every candidate in the grid search.
model = xgb.XGBClassifier(
    gamma=0.5,  # pruning control: minimum loss reduction needed for a further split
    subsample=0.5,  # fraction of the training rows sampled for each tree
    n_jobs=4,  # number of parallel threads (scikit-learn-API name for `nthread`)
)

# Metrics collected by cross_validate; each key shows up as "test_<key>" in its result.
scoring = {
    "accuracy": "accuracy",
    "precision": "precision_macro",  # macro-averaged precision
    "recall": "recall_macro",  # macro-averaged recall
}

# Hyperparameter values explored by the grid search (2 x 2 x 2 = 8 candidates).
grid = {
    "n_estimators": [25, 30],
    "max_depth": [3, 4],
    "learning_rate": [0.02, 0.1],
}

numFold = 5  # number of cross-validation folds

# Rank the candidates by accuracy over `numFold` folds and keep the best one.
xgb_gr = GridSearchCV(estimator=model, param_grid=grid, scoring="accuracy", cv=numFold)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_

# Cross-validate the winning configuration again to obtain all three metrics per fold.
# NOTE(review): the hyperparameters were selected on these same samples, so the scores
# below are likely optimistic — consider nested cross-validation or a held-out test set.
results = cross_validate(xgb_gr.best_estimator_, X, y, cv=numFold, scoring=scoring)

# Per-fold score arrays (one value per fold).
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# Walk the three score arrays in lockstep; folds are numbered from 1 in the report.
for fold_no, (acc, prec, rec) in enumerate(
    zip(accuracy_scores, precision_scores, recall_scores), start=1
):
    print(f"Fold {fold_no} : ")
    print(
        f"Accuracy = {acc*100:.2f}% | Precision = {prec*100:.2f}% | Recall = {rec*100:.2f}%\n"
    )

# Mean of each metric across folds, with its standard deviation in parentheses.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)


- **10 Fold**

In [None]:
numFold = 10  # evaluate with ten folds in this cell

# Grid search over `grid`, ranking the candidates by accuracy across the folds.
xgb_gr = GridSearchCV(model, grid, scoring="accuracy", cv=numFold).fit(X, y)
best_params = xgb_gr.best_params_  # best hyperparameter combination found

# Cross-validate the best configuration to obtain every metric in `scoring` per fold.
results = cross_validate(
    estimator=xgb_gr.best_estimator_,
    X=X,
    y=y,
    scoring=scoring,
    cv=numFold,
)

# Unpack the per-fold score arrays in a fixed order: accuracy, precision, recall.
accuracy_scores, precision_scores, recall_scores = (
    results[key] for key in ("test_accuracy", "test_precision", "test_recall")
)

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One two-line report per fold; folds are numbered from 1.
for i in range(numFold):
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean across folds for each metric, with the standard deviation in parentheses.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)


- ### **_Non Stemmed_**


In [None]:
# Load the non-stemmed variant of the non-augmented dataset (tab-separated file).
# FIXME(review): "../dataset/ina" looks like an unfinished path (no complete file name or
# extension) — confirm the intended file before running this cell.
df_noAug_noStem = pd.read_csv("../dataset/ina", sep="\t")


## **Modeling Data Dataset 3 (Augmentasi)**

- ### ***Stemmed***

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score


In [None]:
# Load the stemmed, augmented feature table; fields in the file are tab-delimited.
df_aug_stem = pd.read_csv(
    filepath_or_buffer="../dataset/INA_TweetsPPKM_TFRF_DS3.csv",
    delimiter="\t",
)
df_aug_stem.shape  # last expression of the cell, so the (rows, columns) tuple is displayed

In [None]:
# Preview the first rows of the augmented table (head() returns five rows by default).
df_aug_stem.head()

In [None]:
# Split the augmented table into the label vector and the feature matrix.
y = df_aug_stem.loc[:, "sentiment"]  # target: the "sentiment" column
X = df_aug_stem.drop(columns="sentiment")  # features: every column except the target
X.shape, y.shape  # displayed as the cell output


In [None]:
# Base classifier: settings shared by every candidate in the grid search.
model = xgb.XGBClassifier(
    gamma=0.5,  # pruning control: minimum loss reduction needed for a further split
    subsample=0.5,  # fraction of the training rows sampled for each tree
    n_jobs=4,  # number of parallel threads (scikit-learn-API name for `nthread`)
)

# Metrics collected by cross_validate; each key shows up as "test_<key>" in its result.
scoring = {
    "accuracy": "accuracy",
    "precision": "precision_macro",  # macro-averaged precision
    "recall": "recall_macro",  # macro-averaged recall
}

# Hyperparameter values explored by the grid search (2 x 2 x 2 = 8 candidates).
grid = {
    "n_estimators": [25, 30],
    "max_depth": [3, 4],
    "learning_rate": [0.02, 0.1],
}

numFold = 3  # number of cross-validation folds

# Rank the candidates by accuracy over `numFold` folds and keep the best one.
xgb_gr = GridSearchCV(estimator=model, param_grid=grid, scoring="accuracy", cv=numFold)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_

# Cross-validate the winning configuration again to obtain all three metrics per fold.
# NOTE(review): the hyperparameters were selected on these same samples, so the scores
# below are likely optimistic — consider nested cross-validation or a held-out test set.
results = cross_validate(xgb_gr.best_estimator_, X, y, cv=numFold, scoring=scoring)

# Per-fold score arrays (one value per fold).
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# Walk the three score arrays in lockstep; folds are numbered from 1 in the report.
for fold_no, (acc, prec, rec) in enumerate(
    zip(accuracy_scores, precision_scores, recall_scores), start=1
):
    print(f"Fold {fold_no} : ")
    print(
        f"Accuracy = {acc*100:.2f}% | Precision = {prec*100:.2f}% | Recall = {rec*100:.2f}%\n"
    )

# Mean of each metric across folds with its standard deviation in parentheses; precision
# and recall now report their spread too, matching the 5-fold and 10-fold summaries.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)


In [None]:
# Stratified 10-fold cross-validation of the base `model` (the classifier as constructed,
# without the grid-searched hyperparameters); cross_val_score uses the estimator's
# default scorer because no `scoring` argument is passed.
kfold = KFold(n_splits=10)  # plain splitter; not used in this cell — kept in case other cells read it
SKFold2 = StratifiedKFold(n_splits=10)  # splitter that preserves the class proportions per fold
result2 = cross_val_score(model, X=X, y=y, cv=SKFold2)
# Mean score across the folds, with the standard deviation in parentheses.
print(f"Accuracy: {result2.mean() * 100:.2f}% ({result2.std() * 100:.2f}%)")


- ### ***Non Stemmed***

In [None]:
# Placeholder for loading the non-stemmed variant of the augmented dataset (tab-separated).
# FIXME(review): the path is an empty string, so this cell cannot load anything until the
# real file path is filled in.
df_aug_noStem = pd.read_csv("", sep="\t")
