In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from sklearn.model_selection import (
    StratifiedKFold,
    KFold,
    cross_val_score,
    cross_validate,
    GridSearchCV,
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer
import warnings

warnings.simplefilter("ignore")


In [None]:
# import xgboost as xgb
# from sklearn.model_selection import KFold, cross_val_score


# # Create an XGBoost classifier with specified parameters
# model = xgb.XGBClassifier(
#     n_estimators=1000,  # Number of trees in the forest
#     gamma=1,  # Minimum loss reduction required to make a further partition on a leaf node
#     learning_rate=1,  # Step size shrinkage used in update to prevent overfitting
#     subsample=1,  # Subsample ratio of the training instances
#     max_depth=4,  # Maximum depth of a tree
# )

# # Create a K-Fold cross-validation object with 5 splits
# kfold = KFold(n_splits=10)

# # Perform cross-validation and get the accuracy scores
# result = cross_val_score(model, X=X, y=y, cv=kfold)

# # Print the mean and standard deviation of the accuracy scores
# print("Accuracy: %.2f%% (%.2f%%)" % (result.mean() * 100, result.std() * 100))


## **Modeling Data Dataset 2 (Tanpa Augmentasi)**

In [None]:
# Load dataset 2 (the non-augmented data, per the section heading); the file is tab-separated.
df_noAug = pd.read_csv("../dataset/INA_TweetsPPKM_TFRF_DS2.csv", sep="\t")


In [None]:
# Preview the first rows to sanity-check how the file was parsed.
df_noAug.head()

In [None]:
# Dataset dimensions as (rows, columns).
df_noAug.shape

In [None]:
# Separate the "sentiment" label column from the remaining feature columns.
y = df_noAug["sentiment"]
X = df_noAug.drop(columns="sentiment")
print(X.shape, y.shape)
X.head()


- **Testing Using Cross Validate Method**

In [None]:
# Baseline XGBoost classifier with fixed hyper-parameters.
model = xgb.XGBClassifier(
    n_estimators=100,  # number of boosting rounds
    gamma=0.5,  # minimum loss reduction required to split a leaf further
    learning_rate=0.1,  # step-size shrinkage applied to each boosting update
    subsample=0.5,  # subsample ratio of the training instances
    max_depth=3,  # maximum depth of a tree
    nthread=4,
)

# Metrics collected on every fold; precision and recall are macro-averaged.
scoring = dict(
    accuracy="accuracy",
    precision="precision_macro",
    recall="recall_macro",
)

numFold = 3
cv = StratifiedKFold(n_splits=numFold)

# One cross-validation run that scores all metrics on each fold.
results = cross_validate(model, X, y, cv=cv, scoring=scoring)

accuracy_scores, precision_scores, recall_scores = (
    results[f"test_{metric}"] for metric in ("accuracy", "precision", "recall")
)

print("Cross-Validation Results:")

# Walk the three score arrays in lockstep, one entry per fold.
per_fold = zip(accuracy_scores, precision_scores, recall_scores)
for fold_no, (acc, prec, rec) in enumerate(per_fold, start=1):
    print(f"Fold {fold_no} : ")
    print(
        f"Accuracy = {acc*100:.2f}% | Precision = {prec*100:.2f}% | Recall = {rec*100:.2f}%\n"
    )
print(
    f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% | Precision = {precision_scores.mean()*100:.2f}% | Recall = {recall_scores.mean()*100:.2f}%"
)


- **Testing Using Combination Of Cross Validation And Gridsearch**

In [17]:
from sklearn.model_selection import GridSearchCV

# Base XGBoost classifier; n_estimators, max_depth and learning_rate are
# left unset here because the grid search below tunes them.
model = xgb.XGBClassifier(
    gamma=0.5,  # minimum loss reduction required to split a leaf further
    subsample=0.5,  # subsample ratio of the training instances
    nthread=4,
)

# Metrics reported for each outer fold; precision and recall are macro-averaged.
scoring = {
    "accuracy": "accuracy",
    "precision": "precision_macro",
    "recall": "recall_macro",
}

# Hyper-parameter candidates explored by the grid search (2 * 2 * 3 = 12 combinations).
grid = {
    "n_estimators": [100, 150],
    "max_depth": [3, 4],
    "learning_rate": [0.01, 0.02, 0.1],
}

numFold = 3
cv = StratifiedKFold(n_splits=numFold)

# Inner loop: pick the best grid point by accuracy.
xgb_gr = GridSearchCV(
    estimator=model, param_grid=grid, scoring="accuracy", cv=cv, n_jobs=-1
)

# Outer loop (nested cross-validation). cross_validate fits a *clone* of
# xgb_gr on each outer training split, so xgb_gr itself is never fitted and
# has no best_params_ attribute. return_estimator=True keeps the fitted
# clones so their selected parameters can be inspected.
results = cross_validate(
    xgb_gr, X, y, cv=cv, scoring=scoring, return_estimator=True
)


accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

# Each outer fold runs its own search, so there is one winning parameter
# set per fold (they may differ between folds).
for fold_no, fitted_search in enumerate(results["estimator"], start=1):
    print(f"Fold {fold_no} best params: {fitted_search.best_params_}")
print(accuracy_scores.mean())

# print("Cross-Validation Results:")

# for i in range(numFold):
#     print(f"Fold {i+1} : ")
#     print(
#         f"Accuracy = {accuracy_scores[i]*100:.2f}% | Precision = {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}%\n"
#     )
# print(
#     f"Average Results : \nAccuracy = {accuracy_scores.mean()*100:.2f}% | Precision = {precision_scores.mean()*100:.2f}% | Recall = {recall_scores.mean()*100:.2f}%"
# )


In [None]:
# XGBoost classifier with fixed hyper-parameters.
model = xgb.XGBClassifier(
    n_estimators=1000,  # number of boosting rounds
    gamma=1,  # minimum loss reduction required to split a leaf further
    learning_rate=1,  # NOTE(review): 1 means no step-size shrinkage — confirm this is intended
    subsample=1,  # subsample ratio of the training instances (1 = use all rows)
    max_depth=4,  # maximum depth of a tree
)

# Stratified 5-fold splitter; without shuffling the folds are the same on every call.
SKFold_1 = StratifiedKFold(n_splits=5)

# Score all three metrics in a single cross-validation pass. Calling
# cross_val_score once per metric would refit the same model on the same
# folds three times for no additional information.
results = cross_validate(
    model,
    X,
    y,
    cv=SKFold_1,
    scoring={
        "accuracy": "accuracy",
        "precision": "precision_macro",
        "recall": "recall_macro",
    },
)
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]

# Print the mean and twice the standard deviation of each score.
print(
    "Accuracy: {:.2f} (+/- {:.2f})".format(
        accuracy_scores.mean(), accuracy_scores.std() * 2
    )
)
print(
    "Precision: {:.2f} (+/- {:.2f})".format(
        precision_scores.mean(), precision_scores.std() * 2
    )
)
print(
    "Recall: {:.2f} (+/- {:.2f})".format(recall_scores.mean(), recall_scores.std() * 2)
)


In [None]:
# XGBoost classifier with fixed hyper-parameters.
model = xgb.XGBClassifier(
    n_estimators=1000,  # number of boosting rounds
    gamma=1,  # minimum loss reduction required to split a leaf further
    learning_rate=1,  # NOTE(review): 1 means no step-size shrinkage — confirm this is intended
    subsample=1,  # subsample ratio of the training instances (1 = use all rows)
    max_depth=4,  # maximum depth of a tree
)

# Shuffled, stratified splitter; the fixed random_state makes the folds reproducible.
numFold = 5
SKFold_1 = StratifiedKFold(n_splits=numFold, random_state=42, shuffle=True)

# Score all three metrics in a single cross-validation pass. Calling
# cross_val_score once per metric would refit the same model on the same
# seeded folds three times for no additional information.
results = cross_validate(
    model,
    X,
    y,
    cv=SKFold_1,
    scoring={
        "accuracy": "accuracy",
        "precision": "precision_macro",
        "recall": "recall_macro",
    },
)
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]


# Per-fold report followed by the averages (accuracy also shows its std).
for i in range(numFold):
    print(
        f"Fold {i+1} : Accuracy = {accuracy_scores[i] * 100:.2f}% | Precision : {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}% \n"
    )

print(
    f"Average Scores : Accuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% | Recall = {recall_scores.mean()*100:.2f}%"
)


In [None]:
# Re-evaluate the current `model` with 10 stratified folds, using the
# estimator's default score.
kfold = KFold(n_splits=10)
SKFold_2 = StratifiedKFold(n_splits=10)
result2 = cross_val_score(model, X, y, cv=SKFold_2)

# Report mean and standard deviation as percentages.
summary_pct = (result2.mean() * 100, result2.std() * 100)
print("Accuracy: %.2f%% (%.2f%%)" % summary_pct)


## **Modeling Data Dataset 3 (Augmentasi)**

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score


In [None]:
# Load dataset 3 (the augmented data, per the section heading); the file is tab-separated.
df = pd.read_csv("../dataset/INA_TweetsPPKM_TFRF_DS3.csv", sep="\t")


In [None]:
# Preview the first five rows to sanity-check how the file was parsed.
df.head(5)

In [None]:
# Dataset dimensions as (rows, columns).
df.shape

In [None]:
# Separate the "sentiment" label column from the remaining feature columns.
y = df["sentiment"]
X = df.drop(columns="sentiment")
print(X.shape, y.shape, sep="\n")


In [None]:
import xgboost as xgb
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
)


# XGBoost classifier with fixed hyper-parameters.
model = xgb.XGBClassifier(
    n_estimators=1000,  # number of boosting rounds
    gamma=0.5,  # minimum loss reduction required to split a leaf further
    learning_rate=0.1,  # step-size shrinkage applied to each boosting update
    subsample=0.5,  # subsample ratio of the training instances
    max_depth=4,  # maximum depth of a tree
)

# Shuffled, stratified splitter; the fixed random_state makes the folds reproducible.
numFold = 5
SKFold_1 = StratifiedKFold(n_splits=numFold, random_state=42, shuffle=True)

# Score all three metrics in a single cross-validation pass. Calling
# cross_val_score once per metric would refit the same model on the same
# seeded folds three times for no additional information.
results = cross_validate(
    model,
    X,
    y,
    cv=SKFold_1,
    scoring={
        "accuracy": "accuracy",
        "precision": "precision_macro",
        "recall": "recall_macro",
    },
)
accuracy_scores = results["test_accuracy"]
precision_scores = results["test_precision"]
recall_scores = results["test_recall"]


# Per-fold report followed by the averages (accuracy also shows its std).
for i in range(numFold):
    print(
        f"Fold {i+1} : Accuracy = {accuracy_scores[i] * 100:.2f}% | Precision : {precision_scores[i]*100:.2f}% | Recall = {recall_scores[i]*100:.2f}% \n"
    )

print(
    f"Average Scores : Accuracy = {accuracy_scores.mean()*100:.2f}% ({accuracy_scores.std()*100:.2f}%) | Precision = {precision_scores.mean()*100:.2f}% | Recall = {recall_scores.mean()*100:.2f}%"
)


In [None]:
# Re-evaluate the current `model` with 10 stratified folds, using the
# estimator's default score.
kfold = KFold(n_splits=10)
SKFold2 = StratifiedKFold(n_splits=10)
result2 = cross_val_score(model, X, y, cv=SKFold2)

# Report mean and standard deviation as percentages.
summary_pct = (result2.mean() * 100, result2.std() * 100)
print("Accuracy: %.2f%% (%.2f%%)" % summary_pct)


In [None]:
# Display the raw per-fold scores from the 10-fold run.
result2