In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import (
    StratifiedKFold,  # Cross-validation splitter that keeps the class distribution in every fold
    KFold,  # Plain k-fold cross-validation splitter
    cross_val_score,  # Runs cross-validation and returns the scores of a single metric
    cross_validate,  # Runs cross-validation and reports several metrics at once
    GridSearchCV,  # Systematic hyperparameter search over a parameter grid
)
from sklearn.metrics import (
    accuracy_score,  # Accuracy metric
    precision_score,  # Precision metric
    recall_score,  # Recall metric
    make_scorer,  # Wraps a metric function into a custom scorer
)
import warnings

warnings.simplefilter("ignore")  # Suppress every warning raised from this point on


In [None]:
# Disabled baseline experiment, kept for reference only.
# NOTE(review): if re-enabled here it would fail, because X and y are only created further down.
# import xgboost as xgb
# from sklearn.model_selection import KFold, cross_val_score


# # Create an XGBoost classifier with specified parameters
# model = xgb.XGBClassifier(
#     n_estimators=1000,  # Number of boosting rounds (trees)
#     gamma=1,  # Minimum loss reduction required to make a further partition on a leaf node
#     learning_rate=1,  # Step size shrinkage used in update to prevent overfitting
#     subsample=1,  # Subsample ratio of the training instances
#     max_depth=4,  # Maximum depth of a tree
# )

# # Create a K-Fold cross-validation object with 10 splits
# kfold = KFold(n_splits=10)

# # Perform cross-validation and get the accuracy scores
# result = cross_val_score(model, X=X, y=y, cv=kfold)

# # Print the mean and standard deviation of the accuracy scores
# print("Accuracy: %.2f%% (%.2f%%)" % (result.mean() * 100, result.std() * 100))


## **Modeling Data Dataset 2 (Tanpa Augmentasi)**

In [2]:
# Load dataset 2 (no augmentation). The file is tab-separated despite its .csv extension.
df_noAug = pd.read_csv("../dataset/INA_TweetsPPKM_TFRF_DS2.csv", sep="\t")

# Cell output: (number of rows, number of columns).
df_noAug.shape


(5938, 11258)

In [3]:
# Preview the first rows to check that the columns and the "sentiment" label parsed as expected.
df_noAug.head()

Unnamed: 0,stimulasi,optimal,laku,online,penuh,butuh,sebut,tetap,operasi,sesuai,...,wawancara,koyak,distancing,phisical,kunyuk,perubahanperubahan,acau,siihh,disiniii,sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [4]:
# Separate the label from the predictors: "sentiment" is the target,
# every other column of df_noAug is treated as a feature.
y = df_noAug["sentiment"]
X = df_noAug.drop(columns="sentiment")

# Sanity check: both shapes should report the same number of rows.
print(X.shape, y.shape)


(5938, 11257) (5938,)


- **Testing Using Cross Validate Method**

In [None]:
# Baseline: XGBoost with hand-picked hyperparameters, scored by cross-validation.
model = xgb.XGBClassifier(
    n_estimators=100,  # number of boosted trees
    max_depth=3,  # depth limit of each tree
    learning_rate=0.1,  # shrinks the contribution of every tree
    gamma=0.5,  # minimum loss reduction needed to split a node further
    subsample=0.5,  # fraction of training rows sampled for each tree
    nthread=4,  # number of threads used for training
)

# Metrics collected on every fold; precision and recall are macro-averaged.
scoring = {
    "accuracy": "accuracy",
    "precision": "precision_macro",
    "recall": "recall_macro",
}

numFold = 3  # number of cross-validation folds

# One call evaluates all metrics on the same folds.
results = cross_validate(model, X, y, scoring=scoring, cv=numFold)

# Per-fold test scores (one entry per fold).
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

print("Cross-Validation Results:")

# Fold-by-fold report; the header and the metrics go on separate lines.
for i, _ in enumerate(accuracy_scores):
    print(
        f"Fold {i+1} : ",
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n",
        sep="\n",
    )

# Averages over all folds.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% | Precision = {precision_scores.mean()*100:.2f}% | Recall = {recall_scores.mean()*100:.2f}%"
)


- **Testing Using Combination Of Cross Validation And Gridsearch**

In [5]:
# Hyperparameter search with nested cross-validation.
# GridSearchCV comes from the import cell at the top of the notebook, so it is not
# re-imported here.

# Parameters that stay fixed during the search.
model = xgb.XGBClassifier(
    gamma=0.5,  # minimum loss reduction needed to split a node further
    subsample=0.5,  # fraction of training rows sampled for each tree
    nthread=4,  # number of threads used for training
)

# Metrics reported by the outer cross-validation; precision/recall are macro-averaged.
scoring = {
    "accuracy": "accuracy",
    "precision": "precision_macro",
    "recall": "recall_macro",
}

# Candidate values explored by the grid search (2 x 2 x 2 combinations).
grid = {
    "n_estimators": [25, 30],
    "max_depth": [3, 4],
    "learning_rate": [0.02, 0.1],
}

numFold = 3  # folds used by both the inner search and the outer evaluation

# Inner loop: pick the combination with the best accuracy, then refit it on all the
# data handed to the search (refit=True).
xgb_gr = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring="accuracy",
    cv=numFold,
    n_jobs=-1,
    refit=True,
)

# Outer loop: every outer fold runs its own grid search on its training part and is
# scored on its held-out part, so the scores are not influenced by the tuning.
results = cross_validate(xgb_gr, X, y, cv=numFold, scoring=scoring)


In [17]:
# Run the grid search on the full dataset. xgb_gr was built with refit=True, so the
# best combination is retrained on all of X, y afterwards.
xgb_gr.fit(X,y)

In [18]:
# Keep the winning hyperparameter combination; it is printed again in the final report.
best_params = xgb_gr.best_params_
print(best_params)

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 30}


In [19]:
# Cross-validate the tuned model with the same folds and metrics as before.
# NOTE(review): xgb_gr was fitted on the full X, y, so the hyperparameters of
# best_estimator_ were selected using the same rows that are scored here; these
# numbers may be optimistic. The nested run `cross_validate(xgb_gr, ...)` does not
# have this problem - TODO confirm which estimate should be reported.
results = cross_validate(xgb_gr.best_estimator_, X, y, cv=numFold, scoring=scoring)

In [23]:
# Per-fold test scores taken from the latest cross_validate run.
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

print("Cross-Validation Results:")

# Fold-by-fold report; the header and the metrics go on separate lines.
for i in range(len(accuracy_scores)):
    print(
        f"Fold {i+1} : ",
        f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n",
        sep="\n",
    )

# Averages over all folds, followed by the hyperparameters chosen by the grid search.
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% | Precision = {precision_scores.mean()*100:.2f}% | Recall = {recall_scores.mean()*100:.2f}%"
)
print(f"\nBest Hyperparameter : {best_params}")


Cross-Validation Results:
Fold 1 : 
Accuracy = 74.24% | Precision = 76.47% | Recall = 62.89%

Fold 2 : 
Accuracy = 77.01% | Precision = 80.61% | Recall = 66.72%

Fold 3 : 
Accuracy = 72.51% | Precision = 75.43% | Recall = 59.80%

Average Results : 
Accuracy = 74.59% | Precision = 77.50% | Recall = 63.14%

Best Hyperparameter : {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 30}


In [16]:
# best_params_ is a dict attribute that exists only after xgb_gr.fit(...) has run.
# It is not a method, so it must be read without parentheses.
xgb_gr.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [15]:
# Quick inspection: mean test accuracy over the folds, then the raw cross_validate output.
print(results['test_accuracy'].mean(), results, sep="\n")

0.7458746127265581
{'fit_time': array([204.82278681, 189.44186544, 190.03686118]), 'score_time': array([0.52416444, 0.50572109, 0.49767184]), 'test_accuracy': array([0.74242424, 0.7700859 , 0.72511369]), 'test_precision': array([0.76470714, 0.80607049, 0.75433586]), 'test_recall': array([0.62893999, 0.66715288, 0.59803526])}


In [None]:
# Evaluate an XGBoost classifier with stratified 5-fold cross-validation.
model = xgb.XGBClassifier(
    n_estimators=1000,  # Number of boosting rounds (trees)
    gamma=1,  # Minimum loss reduction required to make a further partition on a leaf node
    learning_rate=1,  # Step size shrinkage; at 1 each tree's contribution is not shrunk
    subsample=1,  # Subsample ratio of the training instances (1 = every row)
    max_depth=4,  # Maximum depth of a tree
)

# Cross-validation splitters with 5 folds; only the stratified one is used below.
kfold_1 = KFold(n_splits=5)
SKFold_1 = StratifiedKFold(n_splits=5)

# Score all three metrics in a single cross-validation pass. Calling cross_val_score
# once per metric would refit the same model on the same folds three times.
cv_scores = cross_validate(
    model,
    X,
    y,
    cv=SKFold_1,
    scoring={
        "accuracy": "accuracy",
        "precision": "precision_macro",
        "recall": "recall_macro",
    },
)
accuracy_scores = cv_scores["test_accuracy"]
precision_scores = cv_scores["test_precision"]
recall_scores = cv_scores["test_recall"]

# Print the mean of each metric with a +/- band of two standard deviations
print(
    "Accuracy: {:.2f} (+/- {:.2f})".format(
        accuracy_scores.mean(), accuracy_scores.std() * 2
    )
)
print(
    "Precision: {:.2f} (+/- {:.2f})".format(
        precision_scores.mean(), precision_scores.std() * 2
    )
)
print(
    "Recall: {:.2f} (+/- {:.2f})".format(recall_scores.mean(), recall_scores.std() * 2)
)


In [None]:
# Evaluate an XGBoost classifier with shuffled, stratified 5-fold cross-validation.
model = xgb.XGBClassifier(
    n_estimators=1000,  # Number of boosting rounds (trees)
    gamma=1,  # Minimum loss reduction required to make a further partition on a leaf node
    learning_rate=1,  # Step size shrinkage; at 1 each tree's contribution is not shrunk
    subsample=1,  # Subsample ratio of the training instances (1 = every row)
    max_depth=4,  # Maximum depth of a tree
)

# Cross-validation splitters with 5 folds; only the stratified one is used below.
# The fixed random_state makes the shuffled fold assignment repeatable.
numFold = 5
kfold_1 = KFold(n_splits=5)
SKFold_1 = StratifiedKFold(n_splits=numFold, random_state=42, shuffle=True)

# Score all three metrics in a single cross-validation pass. Calling cross_val_score
# once per metric would refit the same model on the same folds three times.
cv_scores = cross_validate(
    model,
    X,
    y,
    cv=SKFold_1,
    scoring={
        "accuracy": "accuracy",
        "precision": "precision_macro",
        "recall": "recall_macro",
    },
)
accuracy_scores = cv_scores["test_accuracy"]
precision_scores = cv_scores["test_precision"]
recall_scores = cv_scores["test_recall"]


# Per-fold report
for i in range(numFold):
    print(
        f"Fold {i+1} : Accuracy = {accuracy_scores[i] * 100:.2f}% | Precision : {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}% \n"
    )

# Averages over all folds (standard deviation shown for accuracy only)
print(
    f"Average Scores : Accuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% | Recall = {recall_scores.mean()*100:.2f}%"
)


In [None]:
# Repeat the evaluation with 10 stratified folds, using the `model` object created in
# an earlier cell. No `scoring` is passed, so the estimator's default score is used.
kfold = KFold(n_splits=10)
SKFold_2 = StratifiedKFold(n_splits=10)
result2 = cross_val_score(model, X, y, cv=SKFold_2)

# Mean and standard deviation of the fold scores, as percentages.
print(
    "Accuracy: %.2f%% (%.2f%%)"
    % (result2.mean() * 100, result2.std() * 100)
)


## **Modeling Data Dataset 3 (Augmentasi)**

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score


In [None]:
# Load dataset 3 (with augmentation). The file is tab-separated despite its .csv extension.
df = pd.read_csv("../dataset/INA_TweetsPPKM_TFRF_DS3.csv", sep="\t")


In [None]:
# Preview the first five rows of the augmented dataset.
df.head(5)

In [None]:
# Show the dimensions (rows, columns) of the augmented dataset.
df.shape

In [None]:
# Separate the label from the predictors: "sentiment" is the target, every other
# column of df is a feature. This rebinds the X and y used for the previous dataset.
y = df["sentiment"]
X = df.drop(columns="sentiment")

# Sanity check: one shape per line, features first.
print(X.shape, y.shape, sep="\n")


In [None]:
import xgboost as xgb
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
)


# Evaluate an XGBoost classifier on the augmented dataset with shuffled,
# stratified 5-fold cross-validation.
model = xgb.XGBClassifier(
    n_estimators=1000,  # Number of boosting rounds (trees)
    gamma=0.5,  # Minimum loss reduction required to make a further partition on a leaf node
    learning_rate=0.1,  # Step size shrinkage used in update to prevent overfitting
    subsample=0.5,  # Subsample ratio of the training instances
    max_depth=4,  # Maximum depth of a tree
)

# Cross-validation splitters with 5 folds; only the stratified one is used below.
# The fixed random_state makes the shuffled fold assignment repeatable.
numFold = 5
kfold_1 = KFold(n_splits=5)
SKFold_1 = StratifiedKFold(n_splits=numFold, random_state=42, shuffle=True)

# Score all three metrics in a single cross-validation pass. Calling cross_val_score
# once per metric would refit the same model on the same folds three times.
cv_scores = cross_validate(
    model,
    X,
    y,
    cv=SKFold_1,
    scoring={
        "accuracy": "accuracy",
        "precision": "precision_macro",
        "recall": "recall_macro",
    },
)
accuracy_scores = cv_scores["test_accuracy"]
precision_scores = cv_scores["test_precision"]
recall_scores = cv_scores["test_recall"]


# Per-fold report
for i in range(numFold):
    print(
        f"Fold {i+1} : Accuracy = {accuracy_scores[i] * 100:.2f}% | Precision : {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}% \n"
    )

# Averages over all folds (standard deviation shown for accuracy only)
print(
    f"Average Scores : Accuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% | Recall = {recall_scores.mean()*100:.2f}%"
)


In [None]:
# Repeat the evaluation with 10 stratified folds, using the `model` object created in
# an earlier cell. No `scoring` is passed, so the estimator's default score is used.
kfold = KFold(n_splits=10)
SKFold2 = StratifiedKFold(n_splits=10)
result2 = cross_val_score(model, X, y, cv=SKFold2)

# Mean and standard deviation of the fold scores, as percentages.
print(
    "Accuracy: %.2f%% (%.2f%%)"
    % (result2.mean() * 100, result2.std() * 100)
)


In [None]:
# Display the individual fold scores returned by the 10-fold run.
result2