In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import (
    cross_validate,  # Memuat fungsi cross_validate untuk melakukan validasi silang dengan metrik evaluasi yang lebih lengkap
    GridSearchCV,  # Memuat kelas GridSearchCV untuk pencarian hiperparameter secara sistematis
)
import warnings

warnings.simplefilter("ignore")  # Suppress every warning raised from this point on


### **_Skenario Pengujian_**

<ol>
  <li>Dataset 1 (Tanpa Augmentasi)
    <ol>
      <li>Stemmed Data
          <ul>
          <li>5 Fold</li>
          <li>10 Fold</li>
          </ul>
      </li>
      <li>Non Stemmed Data
          <ul>
          <li>5 Fold</li>
          <li>10 Fold</li>
          </ul>
      </li>
    </ol>
  </li>
  <br>
  <li>Dataset 2 (Augmentasi)
    <ol>
      <li>Stemmed Data
          <ul>
          <li>5 Fold</li>
          <li>10 Fold</li>
          </ul>
      </li>
      <li>Non Stemmed Data
          <ul>
          <li>5 Fold</li>
          <li>10 Fold</li>
          </ul>
      </li>
    </ol>
  </li>
</ol>


## **Dataset 1 (Tanpa Augmentasi)**

**1. Stemmed Data**


In [3]:
df_noAug_stem = pd.read_csv(
    "../dataset/INA_TweetsPPKM_TFRF_DS1.csv", sep="\t"
)  # Read the tab-separated file "../dataset/INA_TweetsPPKM_TFRF_DS1.csv" (Dataset 1, stemmed); the file's first, header-less column is loaded as "Unnamed: 0"
df_noAug_stem.shape  # Display the (rows, columns) shape of df_noAug_stem


(5938, 11258)

In [4]:
# Preview the first rows of the stemmed Dataset 1 feature table.
df_noAug_stem.head()

Unnamed: 0,stimulasi,optimal,laku,online,penuh,butuh,sebut,tetap,operasi,sesuai,...,wawancara,koyak,distancing,phisical,kunyuk,perubahanperubahan,acau,siihh,disiniii,sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [6]:
# Columns that must not be used as model inputs: the target itself plus any
# "Unnamed: N" column. pandas gives that name to a header-less CSV column; in
# this file it is the saved row index (0, 1, 2, ... in the preview above), and
# keeping it would hand the row number to the classifier as a feature.
non_feature_cols = ["sentiment"] + [
    col for col in df_noAug_stem.columns if str(col).startswith("Unnamed:")
]
X = df_noAug_stem.drop(columns=non_feature_cols)  # Feature matrix
y = df_noAug_stem["sentiment"]  # Target labels taken from the "sentiment" column
print(X.shape, y.shape)  # Show the shapes of X and y


(5938, 11257) (5938,)


- **Model Initialize**

In [7]:
# Base XGBoost classifier; only `nthread` is set explicitly, everything else
# is left for the grid search below or the library defaults.
model = xgb.XGBClassifier(nthread=4)

# Metrics collected for every fold by cross_validate. The dictionary keys
# become the "test_<key>" entries of its result.
scoring = dict(
    accuracy="accuracy",
    precision="precision_macro",
    recall="recall_macro",
)

# Hyperparameter grid explored by GridSearchCV: 2 x 2 x 2 x 1 x 1 = 8 candidates.
params = {
    "n_estimators": [100, 1000],
    "max_depth": [3, 6],
    "learning_rate": [0.01, 0.1],
    "gamma": [0],
    "subsample": [1],
}


- **5 Fold**

In [8]:
numFold = 5  # Number of cross-validation folds

# Grid search over `params`, ranking the candidates by accuracy.
xgb_gr = GridSearchCV(estimator=model, param_grid=params, scoring="accuracy", cv=numFold)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # Best hyperparameter combination found

# Cross-validate the best estimator on the same data and fold count,
# collecting every metric listed in `scoring`.
results = cross_validate(xgb_gr.best_estimator_, X, y, cv=numFold, scoring=scoring)

# Per-fold score arrays, one entry per fold.
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report line per fold.
for fold_no, (acc, prec, rec) in enumerate(
    zip(accuracy_scores, precision_scores, recall_scores), start=1
):
    print(f"Fold {fold_no} : ")
    print(
        f"Accuracy = {acc*100:.2f}% | Precision = {prec*100:.2f}% | Recall = {rec*100:.2f}%\n"
    )

# Mean of each metric across folds, with its standard deviation in parentheses.
summary = " | ".join(
    f"{label} = {scores.mean()*100:.2f}% ({scores.std()*100:.2f}%)"
    for label, scores in (
        ("Accuracy", accuracy_scores),
        ("Precision", precision_scores),
        ("Recall", recall_scores),
    )
)
print(f"Average Results : \n{summary}")


Optimal Hyperparameter : {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 1}

Cross-Validation Results:
Fold 1 : 
Accuracy = 82.74% | Precision = 81.11% | Recall = 78.90%

Fold 2 : 
Accuracy = 83.00% | Precision = 83.77% | Recall = 76.82%

Fold 3 : 
Accuracy = 83.59% | Precision = 82.51% | Recall = 79.34%

Fold 4 : 
Accuracy = 82.31% | Precision = 81.27% | Recall = 77.37%

Fold 5 : 
Accuracy = 84.08% | Precision = 84.33% | Recall = 78.69%

Average Results : 
Accuracy = 83.14% (0.62%) | Precision = 82.60% (1.29%) | Recall = 78.23% (0.96%)


- **10 Fold**

In [9]:
numFold = 10  # Number of cross-validation folds

# Grid search over `params`, ranking the candidates by accuracy.
xgb_gr = GridSearchCV(estimator=model, param_grid=params, scoring="accuracy", cv=numFold)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # Best hyperparameter combination found

# Cross-validate the best estimator on the same data and fold count,
# collecting every metric listed in `scoring`.
results = cross_validate(xgb_gr.best_estimator_, X, y, cv=numFold, scoring=scoring)

# Per-fold score arrays, one entry per fold.
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report line per fold.
for fold_no, (acc, prec, rec) in enumerate(
    zip(accuracy_scores, precision_scores, recall_scores), start=1
):
    print(f"Fold {fold_no} : ")
    print(
        f"Accuracy = {acc*100:.2f}% | Precision = {prec*100:.2f}% | Recall = {rec*100:.2f}%\n"
    )

# Mean of each metric across folds, with its standard deviation in parentheses.
summary = " | ".join(
    f"{label} = {scores.mean()*100:.2f}% ({scores.std()*100:.2f}%)"
    for label, scores in (
        ("Accuracy", accuracy_scores),
        ("Precision", precision_scores),
        ("Recall", recall_scores),
    )
)
print(f"Average Results : \n{summary}")


Optimal Hyperparameter : {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 1}

Cross-Validation Results:
Fold 1 : 
Accuracy = 84.01% | Precision = 82.06% | Recall = 81.46%

Fold 2 : 
Accuracy = 85.02% | Precision = 84.11% | Recall = 81.18%

Fold 3 : 
Accuracy = 83.00% | Precision = 83.23% | Recall = 77.21%

Fold 4 : 
Accuracy = 83.33% | Precision = 83.37% | Recall = 77.85%

Fold 5 : 
Accuracy = 81.48% | Precision = 79.94% | Recall = 76.86%

Fold 6 : 
Accuracy = 85.69% | Precision = 85.31% | Recall = 81.55%

Fold 7 : 
Accuracy = 86.87% | Precision = 86.59% | Recall = 83.08%

Fold 8 : 
Accuracy = 79.97% | Precision = 78.49% | Recall = 74.43%

Fold 9 : 
Accuracy = 83.31% | Precision = 84.20% | Recall = 77.10%

Fold 10 : 
Accuracy = 85.50% | Precision = 85.69% | Recall = 80.70%

Average Results : 
Accuracy = 83.82% (1.96%) | Precision = 83.30% (2.41%) | Recall = 79.14% (2.65%)


**2. Non Stemmed Data**


- **Load Data**

In [10]:
# Load the non-stemmed variant of Dataset 1 (tab-separated) and show its shape.
df_noAug_noStem = pd.read_csv("../dataset/INA_TweetsPPKM_TFRF_DS1_NoStem.csv", sep="\t")
df_noAug_noStem.shape

(5938, 14773)

In [11]:
# NOTE(review): this repeats the shape already displayed by the loading cell;
# the stemmed section previews rows with .head() at this point - confirm which was intended.
df_noAug_noStem.shape

(5938, 14773)

In [12]:
# Columns that must not be used as model inputs: the target itself plus any
# "Unnamed: N" column. pandas gives that name to a header-less CSV column,
# which is how a saved row index shows up after read_csv (as visible in the
# stemmed file's preview); keeping it would hand the row number to the
# classifier as a feature. The comprehension is a no-op if no such column exists.
non_feature_cols = ["sentiment"] + [
    col for col in df_noAug_noStem.columns if str(col).startswith("Unnamed:")
]
X = df_noAug_noStem.drop(columns=non_feature_cols)  # Feature matrix from df_noAug_noStem
y = df_noAug_noStem["sentiment"]  # Target labels taken from the "sentiment" column of df_noAug_noStem
print(X.shape, y.shape)  # Show the shapes of X and y


(5938, 14772) (5938,)


- **Model Initialize**

In [13]:
# Base XGBoost classifier; only `nthread` is set explicitly, everything else
# is left for the grid search below or the library defaults.
model = xgb.XGBClassifier(nthread=4)

# Metrics collected for every fold by cross_validate. The dictionary keys
# become the "test_<key>" entries of its result.
scoring = dict(
    accuracy="accuracy",
    precision="precision_macro",
    recall="recall_macro",
)

# Hyperparameter grid explored by GridSearchCV: 2 x 2 x 2 x 1 x 1 = 8 candidates.
params = {
    "n_estimators": [100, 1000],
    "max_depth": [3, 6],
    "learning_rate": [0.01, 0.1],
    "gamma": [0],
    "subsample": [1],
}

- **5 Fold**

In [14]:
numFold = 5  # Number of cross-validation folds

# Grid search over `params`, ranking the candidates by accuracy.
xgb_gr = GridSearchCV(estimator=model, param_grid=params, scoring="accuracy", cv=numFold)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # Best hyperparameter combination found

# Cross-validate the best estimator on the same data and fold count,
# collecting every metric listed in `scoring`.
results = cross_validate(xgb_gr.best_estimator_, X, y, cv=numFold, scoring=scoring)

# Per-fold score arrays, one entry per fold.
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report line per fold.
for fold_no, (acc, prec, rec) in enumerate(
    zip(accuracy_scores, precision_scores, recall_scores), start=1
):
    print(f"Fold {fold_no} : ")
    print(
        f"Accuracy = {acc*100:.2f}% | Precision = {prec*100:.2f}% | Recall = {rec*100:.2f}%\n"
    )

# Mean of each metric across folds, with its standard deviation in parentheses.
summary = " | ".join(
    f"{label} = {scores.mean()*100:.2f}% ({scores.std()*100:.2f}%)"
    for label, scores in (
        ("Accuracy", accuracy_scores),
        ("Precision", precision_scores),
        ("Recall", recall_scores),
    )
)
print(f"Average Results : \n{summary}")


Optimal Hyperparameter : {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 1}

Cross-Validation Results:
Fold 1 : 
Accuracy = 82.32% | Precision = 80.58% | Recall = 78.46%

Fold 2 : 
Accuracy = 83.16% | Precision = 84.31% | Recall = 76.82%

Fold 3 : 
Accuracy = 83.33% | Precision = 82.27% | Recall = 78.95%

Fold 4 : 
Accuracy = 81.55% | Precision = 81.12% | Recall = 75.64%

Fold 5 : 
Accuracy = 82.14% | Precision = 81.89% | Recall = 76.34%

Average Results : 
Accuracy = 82.50% (0.66%) | Precision = 82.03% (1.28%) | Recall = 77.24% (1.26%)


- **10 Fold**

In [15]:
numFold = 10  # Number of cross-validation folds

# Grid search over `params`, ranking the candidates by accuracy.
xgb_gr = GridSearchCV(estimator=model, param_grid=params, scoring="accuracy", cv=numFold)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # Best hyperparameter combination found

# Cross-validate the best estimator on the same data and fold count,
# collecting every metric listed in `scoring`.
results = cross_validate(xgb_gr.best_estimator_, X, y, cv=numFold, scoring=scoring)

# Per-fold score arrays, one entry per fold.
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report line per fold.
for fold_no, (acc, prec, rec) in enumerate(
    zip(accuracy_scores, precision_scores, recall_scores), start=1
):
    print(f"Fold {fold_no} : ")
    print(
        f"Accuracy = {acc*100:.2f}% | Precision = {prec*100:.2f}% | Recall = {rec*100:.2f}%\n"
    )

# Mean of each metric across folds, with its standard deviation in parentheses.
summary = " | ".join(
    f"{label} = {scores.mean()*100:.2f}% ({scores.std()*100:.2f}%)"
    for label, scores in (
        ("Accuracy", accuracy_scores),
        ("Precision", precision_scores),
        ("Recall", recall_scores),
    )
)
print(f"Average Results : \n{summary}")

## **Dataset 2 (Augmentasi)**

**1. Stemmed Data**

In [None]:
# Load the stemmed, augmented feature table (tab-separated) and show its shape.
# NOTE(review): this section is titled "Dataset 2" but the file name ends in DS3 - confirm this is the intended file.
df_aug_stem = pd.read_csv("../dataset/INA_TweetsPPKM_TFRF_DS3.csv", sep="\t")
df_aug_stem.shape

In [None]:
# Preview the first five rows of the augmented, stemmed feature table.
df_aug_stem.head(5)

In [None]:
# Columns that must not be used as model inputs: the target itself plus any
# "Unnamed: N" column. pandas gives that name to a header-less CSV column,
# which is how a saved row index shows up after read_csv; keeping it would
# hand the row number to the classifier as a feature. The comprehension is a
# no-op if the file has no such column.
non_feature_cols = ["sentiment"] + [
    col for col in df_aug_stem.columns if str(col).startswith("Unnamed:")
]
X = df_aug_stem.drop(columns=non_feature_cols)  # Feature matrix
y = df_aug_stem["sentiment"]  # Target labels taken from the "sentiment" column
X.shape, y.shape


- **Model Initialize**

In [None]:
# Base XGBoost classifier; only `nthread` is set explicitly, everything else
# is left for the grid search below or the library defaults.
model = xgb.XGBClassifier(nthread=4)

# Metrics collected for every fold by cross_validate. The dictionary keys
# become the "test_<key>" entries of its result.
scoring = dict(
    accuracy="accuracy",
    precision="precision_macro",
    recall="recall_macro",
)

# Hyperparameter grid explored by GridSearchCV: 2 x 2 x 2 x 1 x 1 = 8 candidates.
# NOTE(review): learning_rate starts at 0.02 here, while the Dataset 1 grids
# use 0.01 - confirm whether the difference is intentional.
params = {
    "n_estimators": [100, 1000],
    "max_depth": [3, 6],
    "learning_rate": [0.02, 0.1],
    "gamma": [0],
    "subsample": [1],
}


- **5 Fold**

In [None]:
# This cell sits under the "5 Fold" heading, so it uses 5 folds like the
# matching Dataset 1 cells (it previously ran with 3).
numFold = 5  # Number of cross-validation folds

# Grid search over `params`, ranking the candidates by accuracy.
xgb_gr = GridSearchCV(
    estimator=model, param_grid=params, scoring="accuracy", cv=numFold
)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # Best hyperparameter combination found

# Cross-validate the best estimator on the same data and fold count,
# collecting every metric listed in `scoring`.
results = cross_validate(
    xgb_gr.best_estimator_, X, y, cv=numFold, scoring=scoring
)

# Per-fold score arrays, one entry per fold.
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

for i in range(numFold):  # One report line per fold
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean across folds with the standard deviation in parentheses for every
# metric, matching the summary format of the Dataset 1 cells.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)

- **10 Fold**

In [None]:
# 10-fold evaluation using the same procedure as the other scenario cells.
# The earlier version of this cell called KFold, StratifiedKFold and
# cross_val_score, none of which are imported in this notebook, and scored the
# untuned `model` on accuracy only.
numFold = 10  # Number of cross-validation folds

# Grid search over `params`, ranking the candidates by accuracy.
xgb_gr = GridSearchCV(
    estimator=model, param_grid=params, scoring="accuracy", cv=numFold
)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # Best hyperparameter combination found

# Cross-validate the best estimator on the same data and fold count,
# collecting every metric listed in `scoring`.
results = cross_validate(
    xgb_gr.best_estimator_, X, y, cv=numFold, scoring=scoring
)

# Per-fold score arrays, one entry per fold.
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

for i in range(numFold):  # One report line per fold
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean across folds with the standard deviation in parentheses.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)


**2. Non Stemmed Data**

In [None]:
# NOTE(review): the path is an empty string, so this read cannot succeed as written.
# TODO: fill in the non-stemmed augmented dataset path (presumably the "_NoStem"
# counterpart of the augmented stemmed file - confirm the actual file name).
df_aug_noStem = pd.read_csv("", sep="\t")

- **5 Fold**

- **10 Fold**