In [5]:
import numpy as np
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.model_selection import (
    cross_validate,  # Memuat fungsi cross_validate untuk melakukan validasi silang dengan metrik evaluasi yang lebih lengkap
    GridSearchCV,  # Memuat kelas GridSearchCV untuk pencarian hiperparameter secara sistematis
)
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and library warnings
# are hidden as well — consider narrowing it to specific categories.
warnings.simplefilter("ignore")  # Ignore all warnings that are raised


### **_Skenario Pengujian_**

<ol>
  <li>Dataset 1 (Tanpa Augmentasi)
    <ol>
      <li>Stemmed Data
          <ul>
          <li>5 Fold</li>
          <li>10 Fold</li>
          </ul>
      </li>
      <li>Non Stemmed Data
          <ul>
          <li>5 Fold</li>
          <li>10 Fold</li>
          </ul>
      </li>
    </ol>
  </li>
  <br>
  <li>Dataset 2 (Augmentasi)
    <ol>
      <li>Stemmed Data
          <ul>
          <li>5 Fold</li>
          <li>10 Fold</li>
          </ul>
      </li>
      <li>Non Stemmed Data
          <ul>
          <li>5 Fold</li>
          <li>10 Fold</li>
          </ul>
      </li>
    </ol>
  </li>
</ol>


## **Dataset 1 (Tanpa Augmentasi)**

### **1. Stemmed Data**


In [3]:
# Load the Dataset 1 (no augmentation), stemmed feature table.
df_noAug_stem = pd.read_csv(
    "../dataset/INA_TweetsPPKM_TFRF_DS1.csv", sep="\t"
)  # Read "../dataset/INA_TweetsPPKM_TFRF_DS1.csv", using "\t" as the column separator
df_noAug_stem.shape  # Display the (rows, columns) shape of df_noAug_stem


(5938, 11258)

In [4]:
# Preview the first five rows of the stemmed, non-augmented frame.
df_noAug_stem.head(n=5)

Unnamed: 0,stimulasi,optimal,laku,online,penuh,butuh,sebut,tetap,operasi,sesuai,...,wawancara,koyak,distancing,phisical,kunyuk,perubahanperubahan,acau,siihh,disiniii,sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [6]:
# Split df_noAug_stem into the label vector and the feature matrix.
y = df_noAug_stem["sentiment"]  # target: the "sentiment" column
X = df_noAug_stem.drop(columns="sentiment")  # features: every remaining column
print(X.shape, y.shape)  # sanity check of both shapes


(5938, 11257) (5938,)


- **Model Initialize**

In [7]:
# Base XGBoost classifier. Every hyperparameter not listed in `params` below
# keeps the library default.
model = xgb.XGBClassifier(
    n_jobs=4,  # number of parallel threads (`n_jobs` is the documented name; `nthread` is the deprecated alias)
)

# Metrics collected by cross_validate; precision and recall are macro-averaged
# over the classes.
scoring = {
    "accuracy": "accuracy",
    "precision": "precision_macro",
    "recall": "recall_macro",
}

# Hyperparameter grid explored by GridSearchCV: 2 x 2 x 2 x 1 x 1 = 8 candidates.
# `gamma` and `subsample` are pinned to a single value each.
params = {
    "n_estimators": [100, 1000],
    "max_depth": [3, 6],
    "learning_rate": [0.01, 0.1],
    "gamma": [0],
    "subsample": [1],
}


- **5 Fold**

In [8]:
numFold = 5  # number of cross-validation folds

# Grid search over `params`; candidates are ranked by accuracy under
# `numFold`-fold cross-validation.
xgb_gr = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="accuracy",
    cv=numFold,
)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # best hyperparameter combination found

# Score the best estimator once more on the same X, y to collect the per-fold
# metrics declared in `scoring`.
# NOTE(review): the hyperparameters were selected on this same data, so these
# scores are likely optimistic.
results = cross_validate(
    xgb_gr.best_estimator_, X, y, scoring=scoring, cv=numFold
)

accuracy_scores = results["test_accuracy"]  # per-fold accuracy
precision_scores = results["test_precision"]  # per-fold precision
recall_scores = results["test_recall"]  # per-fold recall

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report entry per fold.
for i in range(numFold):
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean over the folds, with the standard deviation in parentheses.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)


Optimal Hyperparameter : {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 1}

Cross-Validation Results:
Fold 1 : 
Accuracy = 82.74% | Precision = 81.11% | Recall = 78.90%

Fold 2 : 
Accuracy = 83.00% | Precision = 83.77% | Recall = 76.82%

Fold 3 : 
Accuracy = 83.59% | Precision = 82.51% | Recall = 79.34%

Fold 4 : 
Accuracy = 82.31% | Precision = 81.27% | Recall = 77.37%

Fold 5 : 
Accuracy = 84.08% | Precision = 84.33% | Recall = 78.69%

Average Results : 
Accuracy = 83.14% (0.62%) | Precision = 82.60% (1.29%) | Recall = 78.23% (0.96%)


- **10 Fold**

In [9]:
numFold = 10  # number of cross-validation folds

# Repeat the grid search with 10 folds; candidates are ranked by accuracy.
xgb_gr = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="accuracy",
    cv=numFold,
)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # best hyperparameter combination found

# Score the best estimator once more on the same X, y to collect the per-fold
# metrics declared in `scoring`.
results = cross_validate(
    xgb_gr.best_estimator_, X, y, scoring=scoring, cv=numFold
)

accuracy_scores = results["test_accuracy"]  # per-fold accuracy
precision_scores = results["test_precision"]  # per-fold precision
recall_scores = results["test_recall"]  # per-fold recall

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report entry per fold.
for i in range(numFold):
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean over the folds, with the standard deviation in parentheses.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)


Optimal Hyperparameter : {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 1}

Cross-Validation Results:
Fold 1 : 
Accuracy = 84.01% | Precision = 82.06% | Recall = 81.46%

Fold 2 : 
Accuracy = 85.02% | Precision = 84.11% | Recall = 81.18%

Fold 3 : 
Accuracy = 83.00% | Precision = 83.23% | Recall = 77.21%

Fold 4 : 
Accuracy = 83.33% | Precision = 83.37% | Recall = 77.85%

Fold 5 : 
Accuracy = 81.48% | Precision = 79.94% | Recall = 76.86%

Fold 6 : 
Accuracy = 85.69% | Precision = 85.31% | Recall = 81.55%

Fold 7 : 
Accuracy = 86.87% | Precision = 86.59% | Recall = 83.08%

Fold 8 : 
Accuracy = 79.97% | Precision = 78.49% | Recall = 74.43%

Fold 9 : 
Accuracy = 83.31% | Precision = 84.20% | Recall = 77.10%

Fold 10 : 
Accuracy = 85.50% | Precision = 85.69% | Recall = 80.70%

Average Results : 
Accuracy = 83.82% (1.96%) | Precision = 83.30% (2.41%) | Recall = 79.14% (2.65%)


### **2. Non Stemmed Data**


- **5 Fold**

In [10]:
# Load the Dataset 1 (no augmentation), non-stemmed feature table
# (tab-separated) and show its shape.
df_noAug_noStem = pd.read_csv(
    "../dataset/INA_TweetsPPKM_TFRF_DS1_NoStem.csv",
    sep="\t",
)
df_noAug_noStem.shape

(5938, 14773)

In [11]:
# NOTE(review): this repeats the shape already displayed by the previous cell;
# a row preview such as `.head()` may have been intended — confirm.
df_noAug_noStem.shape

(5938, 14773)

In [12]:
# Split df_noAug_noStem into the label vector and the feature matrix.
y = df_noAug_noStem["sentiment"]  # target: the "sentiment" column
X = df_noAug_noStem.drop(columns="sentiment")  # features: every remaining column
print(X.shape, y.shape)  # sanity check of both shapes


(5938, 14772) (5938,)


- **Model Initialize**

In [13]:
# Base XGBoost classifier. Every hyperparameter not listed in `params` below
# keeps the library default.
model = xgb.XGBClassifier(
    n_jobs=4,  # number of parallel threads (`n_jobs` is the documented name; `nthread` is the deprecated alias)
)

# Metrics collected by cross_validate; precision and recall are macro-averaged
# over the classes.
scoring = {
    "accuracy": "accuracy",
    "precision": "precision_macro",
    "recall": "recall_macro",
}

# Hyperparameter grid explored by GridSearchCV: 2 x 2 x 2 x 1 x 1 = 8 candidates.
# `gamma` and `subsample` are pinned to a single value each.
params = {
    "n_estimators": [100, 1000],
    "max_depth": [3, 6],
    "learning_rate": [0.01, 0.1],
    "gamma": [0],
    "subsample": [1],
}

- **5 Fold**

In [14]:
numFold = 5  # number of cross-validation folds

# Grid search over `params`; candidates are ranked by accuracy under
# `numFold`-fold cross-validation.
xgb_gr = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="accuracy",
    cv=numFold,
)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # best hyperparameter combination found

# Score the best estimator once more on the same X, y to collect the per-fold
# metrics declared in `scoring`.
results = cross_validate(
    xgb_gr.best_estimator_, X, y, scoring=scoring, cv=numFold
)

accuracy_scores = results["test_accuracy"]  # per-fold accuracy
precision_scores = results["test_precision"]  # per-fold precision
recall_scores = results["test_recall"]  # per-fold recall

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report entry per fold.
for i in range(numFold):
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean over the folds, with the standard deviation in parentheses.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)


Optimal Hyperparameter : {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 1}

Cross-Validation Results:
Fold 1 : 
Accuracy = 82.32% | Precision = 80.58% | Recall = 78.46%

Fold 2 : 
Accuracy = 83.16% | Precision = 84.31% | Recall = 76.82%

Fold 3 : 
Accuracy = 83.33% | Precision = 82.27% | Recall = 78.95%

Fold 4 : 
Accuracy = 81.55% | Precision = 81.12% | Recall = 75.64%

Fold 5 : 
Accuracy = 82.14% | Precision = 81.89% | Recall = 76.34%

Average Results : 
Accuracy = 82.50% (0.66%) | Precision = 82.03% (1.28%) | Recall = 77.24% (1.26%)


- **10 Fold**

In [15]:
numFold = 10  # number of cross-validation folds

# Repeat the grid search with 10 folds; candidates are ranked by accuracy.
xgb_gr = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="accuracy",
    cv=numFold,
)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # best hyperparameter combination found

# Score the best estimator once more on the same X, y to collect the per-fold
# metrics declared in `scoring`.
results = cross_validate(
    xgb_gr.best_estimator_, X, y, scoring=scoring, cv=numFold
)

accuracy_scores = results["test_accuracy"]  # per-fold accuracy
precision_scores = results["test_precision"]  # per-fold precision
recall_scores = results["test_recall"]  # per-fold recall

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report entry per fold.
for i in range(numFold):
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean over the folds, with the standard deviation in parentheses.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)

Optimal Hyperparameter : {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 1}

Cross-Validation Results:
Fold 1 : 
Accuracy = 81.14% | Precision = 78.94% | Recall = 77.64%

Fold 2 : 
Accuracy = 85.02% | Precision = 84.58% | Recall = 80.66%

Fold 3 : 
Accuracy = 82.66% | Precision = 84.10% | Recall = 75.93%

Fold 4 : 
Accuracy = 81.65% | Precision = 81.69% | Recall = 75.43%

Fold 5 : 
Accuracy = 79.12% | Precision = 77.34% | Recall = 73.55%

Fold 6 : 
Accuracy = 85.86% | Precision = 85.09% | Recall = 82.20%

Fold 7 : 
Accuracy = 85.86% | Precision = 86.02% | Recall = 81.29%

Fold 8 : 
Accuracy = 80.64% | Precision = 79.98% | Recall = 74.55%

Fold 9 : 
Accuracy = 82.97% | Precision = 83.53% | Recall = 76.85%

Fold 10 : 
Accuracy = 84.32% | Precision = 83.79% | Recall = 79.68%

Average Results : 
Accuracy = 82.92% (2.19%) | Precision = 82.51% (2.74%) | Recall = 77.78% (2.86%)


## **Dataset 2 (Augmentasi)**

### **1. Stemmed Data**

In [22]:
import dataframe_image as dfi

In [23]:
# Render at most 10 columns when a DataFrame is displayed; wider frames are
# elided. (The former bare `pd.options.display.max_columns` expression was a
# no-op: it was not the cell's last line, so its value was never shown.)
pd.set_option("display.max_columns", 10)


In [24]:
# Load the Dataset 2 (augmented), stemmed feature table (tab-separated) and
# show its shape.
df_aug_stem = pd.read_csv(
    "../dataset/INA_TweetsPPKM_TFRF_DS2.csv",
    sep="\t",
)
df_aug_stem.shape


(7896, 14652)

In [17]:
# Take the first 10 rows of the augmented, stemmed frame and export them as a
# PNG image, limited to 10 columns.
# NOTE(review): `dfi` is imported in a separate cell further up this section;
# that cell must have been run first.
df_aug_stem_head = df_aug_stem.head(10)
dfi.export(df_aug_stem_head,'../img/tfrf_dataframe_augstem.png',max_cols=10)


In [25]:
# Split df_aug_stem into the label vector and the feature matrix.
y = df_aug_stem["sentiment"]  # target: the "sentiment" column
X = df_aug_stem.drop(columns="sentiment")  # features: every remaining column
X.shape, y.shape


((7896, 14651), (7896,))

In [26]:
# NumPy array representation of the feature matrix.
# NOTE(review): `X_train` is not referenced by any later cell visible in this
# notebook — confirm it is still needed.
X_train = X.values

- **Model Initialize**

In [27]:
# Base XGBoost classifier. Every hyperparameter not listed in `params` below
# keeps the library default.
model = xgb.XGBClassifier(
    n_jobs=4,  # number of parallel threads (`n_jobs` is the documented name; `nthread` is the deprecated alias)
)

# Metrics collected by cross_validate; precision and recall are macro-averaged
# over the classes.
scoring = {
    "accuracy": "accuracy",
    "precision": "precision_macro",
    "recall": "recall_macro",
}

# Hyperparameter grid explored by GridSearchCV: 2 x 2 x 2 x 1 x 1 = 8 candidates.
# NOTE(review): the smaller learning-rate candidate is 0.02 here, while the
# Dataset 1 grids use 0.01 — confirm the difference is intentional.
params = {
    "n_estimators": [100, 1000],
    "max_depth": [3, 6],
    "learning_rate": [0.02, 0.1],
    "gamma": [0],
    "subsample": [1],
}


- **5 Fold**

In [28]:
numFold = 5  # number of folds used by the evaluation cell that follows

# Grid search over `params`, ranked by accuracy.
# NOTE(review): the search runs with cv=3 rather than cv=numFold as in the
# Dataset 1 cells — presumably to limit runtime; confirm this is intended.
xgb_gr = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="accuracy",
    cv=3,
)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # best hyperparameter combination found


In [29]:
# Save the best estimator found by the grid search to 'model.pkl'
# (a relative path, resolved against the current working directory).
joblib.dump(xgb_gr.best_estimator_, 'model.pkl')

['model.pkl']

In [None]:
# Cross-validate the best estimator from the preceding grid search with
# `numFold` folds, collecting the per-fold metrics declared in `scoring`.
# NOTE(review): this cell has no recorded output (it appears never to have been
# run); it relies on `xgb_gr`, `best_params` and `numFold` from the cells above.
results = cross_validate(
    xgb_gr.best_estimator_, X, y, cv=numFold, scoring=scoring
)

accuracy_scores = results["test_accuracy"]  # per-fold accuracy
precision_scores = results["test_precision"]  # per-fold precision
recall_scores = results["test_recall"]  # per-fold recall

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report entry per fold.
for i in range(numFold):
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean over the folds with the standard deviation in parentheses for all three
# metrics, matching the summary format of the other evaluation cells.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)


- **10 Fold**

In [7]:
numFold = 10  # number of cross-validation folds

# The grid search is not repeated for this scenario: `xgb_gr` and `best_params`
# must already exist in the kernel.
# NOTE(review): presumably they come from the 5-fold cell of this section
# (searched with cv=3) — confirm before re-running on a fresh kernel.
results = cross_validate(
    xgb_gr.best_estimator_, X, y, scoring=scoring, cv=numFold
)

accuracy_scores = results["test_accuracy"]  # per-fold accuracy
precision_scores = results["test_precision"]  # per-fold precision
recall_scores = results["test_recall"]  # per-fold recall

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report entry per fold.
for i in range(numFold):
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean over the folds; only accuracy is printed with its standard deviation.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% | Recall = {recall_scores.mean()*100:.2f}%"
)


Optimal Hyperparameter : {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 1}

Cross-Validation Results:
Fold 1 : 
Accuracy = 77.59% | Precision = 78.28% | Recall = 77.53%

Fold 2 : 
Accuracy = 79.37% | Precision = 82.02% | Recall = 79.26%

Fold 3 : 
Accuracy = 82.15% | Precision = 84.07% | Recall = 82.06%

Fold 4 : 
Accuracy = 80.00% | Precision = 81.67% | Recall = 79.91%

Fold 5 : 
Accuracy = 78.73% | Precision = 79.57% | Recall = 78.67%

Fold 6 : 
Accuracy = 92.91% | Precision = 92.99% | Recall = 92.93%

Fold 7 : 
Accuracy = 90.24% | Precision = 90.28% | Recall = 90.23%

Fold 8 : 
Accuracy = 89.10% | Precision = 89.12% | Recall = 89.11%

Fold 9 : 
Accuracy = 91.76% | Precision = 91.80% | Recall = 91.75%

Fold 10 : 
Accuracy = 90.87% | Precision = 90.90% | Recall = 90.89%

Average Results : 
Accuracy = 85.27% (5.88%) | Precision = 86.07% | Recall = 85.23%


### **2. Non Stemmed Data**

In [7]:
# Load the Dataset 2 (augmented), non-stemmed feature table (tab-separated).
df_aug_noStem = pd.read_csv(
    "../dataset/INA_TweetsPPKM_TFRF_DS2_NoStem.csv",
    sep="\t",
)

- **5 Fold**

In [8]:
# Display the (rows, columns) shape of the augmented, non-stemmed frame.
df_aug_noStem.shape


(7896, 18468)

In [9]:
# Preview the first five rows of the augmented, non-stemmed frame.
df_aug_noStem.head(n=5)

Unnamed: 0,hadir,boss,mega,win,jackpot,jutaan,rupiah,kemenangan,extra,menanti,...,menggali,dikoyak,distancing,phisical,kunyuk,arahnya,perubahanperubahan,amburadulnya,mengacaukan,sentiment
0,0.95424,0.727,1.11394,1.25527,1.07918,0.716,0.86034,1.23045,1.21748,0.87506,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [10]:
# Split df_aug_noStem into the label vector and the feature matrix.
y = df_aug_noStem["sentiment"]  # target: the "sentiment" column
X = df_aug_noStem.drop(columns="sentiment")  # features: every remaining column
print(X.shape, y.shape)  # sanity check of both shapes


(7896, 18467) (7896,)


In [14]:
# Grid search over `params`, ranked by accuracy.
# NOTE(review): `model` and `params` are not defined in this section; they are
# expected to exist from an earlier "Model Initialize" cell. The search also
# runs with cv=3 rather than the fold count used by the evaluation cells —
# confirm both are intended.
xgb_gr = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="accuracy",
    cv=3,
)
xgb_gr.fit(X, y)
best_params = xgb_gr.best_params_  # best hyperparameter combination found


In [15]:
numFold = 5  # number of cross-validation folds

# Cross-validate the best estimator from the preceding grid search, collecting
# the per-fold metrics declared in `scoring`.
results = cross_validate(
    xgb_gr.best_estimator_, X, y, scoring=scoring, cv=numFold
)

accuracy_scores = results["test_accuracy"]  # per-fold accuracy
precision_scores = results["test_precision"]  # per-fold precision
recall_scores = results["test_recall"]  # per-fold recall

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report entry per fold.
for i in range(numFold):
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean over the folds, with the standard deviation in parentheses.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)


Optimal Hyperparameter : {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 1}

Cross-Validation Results:
Fold 1 : 
Accuracy = 76.33% | Precision = 78.39% | Recall = 76.23%

Fold 2 : 
Accuracy = 77.14% | Precision = 79.70% | Recall = 77.02%

Fold 3 : 
Accuracy = 85.43% | Precision = 85.62% | Recall = 85.40%

Fold 4 : 
Accuracy = 88.22% | Precision = 88.23% | Recall = 88.21%

Fold 5 : 
Accuracy = 89.87% | Precision = 89.87% | Recall = 89.86%

Average Results : 
Accuracy = 83.40% (5.63%) | Precision = 84.36% (4.57%) | Recall = 83.34% (5.68%)


- **10 Fold**

In [16]:
numFold = 10  # number of cross-validation folds

# Cross-validate the same best estimator again, now with 10 folds, collecting
# the per-fold metrics declared in `scoring`.
results = cross_validate(
    xgb_gr.best_estimator_, X, y, scoring=scoring, cv=numFold
)

accuracy_scores = results["test_accuracy"]  # per-fold accuracy
precision_scores = results["test_precision"]  # per-fold precision
recall_scores = results["test_recall"]  # per-fold recall

print(f"Optimal Hyperparameter : {best_params}\n")
print("Cross-Validation Results:")

# One report entry per fold.
for i in range(numFold):
    print(f"Fold {i+1} : ")
    print(
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
    )

# Mean over the folds, with the standard deviation in parentheses.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% ({precision_scores.std()*100:.2f}%) | Recall = {recall_scores.mean()*100:.2f}% ({recall_scores.std()*100:.2f}%)"
)



Optimal Hyperparameter : {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 1}

Cross-Validation Results:
Fold 1 : 
Accuracy = 77.97% | Precision = 78.44% | Recall = 77.93%

Fold 2 : 
Accuracy = 78.48% | Precision = 81.10% | Recall = 78.37%

Fold 3 : 
Accuracy = 81.52% | Precision = 84.00% | Recall = 81.42%

Fold 4 : 
Accuracy = 77.59% | Precision = 79.63% | Recall = 77.49%

Fold 5 : 
Accuracy = 76.71% | Precision = 77.86% | Recall = 76.63%

Fold 6 : 
Accuracy = 92.28% | Precision = 92.32% | Recall = 92.29%

Fold 7 : 
Accuracy = 88.97% | Precision = 89.12% | Recall = 88.95%

Fold 8 : 
Accuracy = 88.47% | Precision = 88.47% | Recall = 88.46%

Fold 9 : 
Accuracy = 91.00% | Precision = 91.18% | Recall = 90.97%

Fold 10 : 
Accuracy = 90.11% | Precision = 90.15% | Recall = 90.13%

Average Results : 
Accuracy = 84.31% (6.05%) | Precision = 85.23% (5.35%) | Recall = 84.26% (6.09%)
