In [494]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

In [495]:
# Column labels applied to the headerless TSV files:
# first field is the sentiment label, second is the comment text.
column = [
    'sentimen',
    'komen',
]

In [497]:
# Load the training split: tab-separated, no header row in the file.
# Column labels are attached with the non-mutating form of `set_axis`; the
# `inplace=True` keyword was deprecated in pandas 1.5 and removed in pandas 2.0,
# where it raises a TypeError.
df_training = (
    pd.read_csv('data/data_training_50.txt', sep='\t', header=None)
    .set_axis(column, axis=1)
)

In [499]:
# Load the testing split: tab-separated, no header row in the file.
# Column labels are attached with the non-mutating form of `set_axis`; the
# `inplace=True` keyword was deprecated in pandas 1.5 and removed in pandas 2.0,
# where it raises a TypeError.
df_testing = (
    pd.read_csv('data/data_testing_50.txt', sep='\t', header=None)
    .set_axis(column, axis=1)
)

In [500]:
# Inputs are the raw comment strings, targets are the sentiment labels.
# The training file serves as the train+validation pool; the testing file is
# only used for the final evaluation.
X_trainval = df_training['komen']
y_trainval = df_training['sentimen']
X_test = df_testing['komen']
y_test = df_testing['sentimen']

In [502]:
# TF-IDF vectorizer with default settings, used for grid search and final testing.
# NOTE(review): the manual k-fold loop further down builds its own vectorizer with
# min_df=2, max_df=0.9, so the two feature spaces are not identical.
tfidf_vectorizer = TfidfVectorizer()

In [504]:
# Learn the vocabulary and IDF weights from the training comments only, then
# project the test comments into that same feature space (no refit on test data).
X_trainval_tfidf = tfidf_vectorizer.fit_transform(X_trainval)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [505]:
# Term-document view of the training TF-IDF matrix for inspection:
# one row per vocabulary term, one column per training document (D1, D2, ...).
# `toarray()` yields a plain ndarray, whereas `todense()` returns the legacy
# `np.matrix` type that NumPy no longer recommends; the resulting frame is the same.
# NOTE(review): this densifies the whole sparse matrix, which is fine for a
# small corpus but memory-hungry for a large one.
feature = pd.DataFrame(
    X_trainval_tfidf.toarray().T,
    index=tfidf_vectorizer.get_feature_names_out(),
    columns=[f'D{i+1}' for i in range(len(X_trainval))],
)

In [506]:
# Step 4: Training and Hyperparameter Tuning
# Every entry below holds exactly one candidate, so the search scores a single
# configuration with 10-fold CV and (default refit) fits it on the full training matrix.
# Wider sweep kept for reference:
#   gamma and C over [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
# NOTE(review): the TF-IDF matrix was fitted on all training documents before this
# cross-validation, so each validation fold shares vocabulary/IDF statistics with
# its training folds.
param_grid = dict(
    gamma=[100],
    C=[0.0001],
    kernel=['poly'],
    max_iter=[10000],
    degree=[2],
    random_state=[42],
)
svm_model = SVC()
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_trainval_tfidf, y_trainval)
best_svm_model = grid_search.best_estimator_
best_svm_model


SVC(C=0.0001, degree=2, gamma=100, kernel='poly', max_iter=10000,
    random_state=42)

In [509]:
# Rebind both as pandas arrays so the integer index arrays produced by the
# cross-validation splitter below select elements by position.
X_trainval, y_trainval = pd.array(X_trainval), pd.array(y_trainval)

In [510]:
# Number of folds for the manual stratified cross-validation loop.
k_fold = 10

In [512]:
# Step 5: Manual stratified k-fold cross-validation of the tuned SVM.
# The TF-IDF vectorizer is refitted inside every fold, so validation documents
# never contribute to the vocabulary or IDF weights of that fold.
skf = StratifiedKFold(n_splits=k_fold)

# Display names passed to classification_report.
# NOTE(review): assumes the sorted label values correspond to negatif, positif
# in that order; confirm against the data files.
target_names = ['negatif', 'positif']
acc_score = []      # accuracy of each fold
kfold_report = []   # classification_report dict of each fold

# NOTE(review): this vectorizer prunes terms (min_df=2, max_df=0.9) while the one
# used for grid search and final testing keeps the defaults, so the CV score
# describes a slightly different feature space than the model that is tested.
kf_tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=0.9)

for train_index, test_index in skf.split(X_trainval, y_trainval):
    kf_x_train, kf_x_test = X_trainval[train_index], X_trainval[test_index]
    kf_y_train, kf_y_test = y_trainval[train_index], y_trainval[test_index]

    # Fit TF-IDF on this fold's training part, then project the held-out part.
    kf_x_train_tfidf = kf_tfidf_vectorizer.fit_transform(kf_x_train)
    kf_x_test_tfidf = kf_tfidf_vectorizer.transform(kf_x_test)

    # Train an unfitted copy with the tuned hyperparameters. Fitting
    # `best_svm_model` directly would overwrite the estimator held by
    # `grid_search.best_estimator_` with a model trained on a single fold.
    kf_model = clone(best_svm_model)
    kf_model.fit(kf_x_train_tfidf, kf_y_train)

    pred_values = kf_model.predict(kf_x_test_tfidf)

    acc = accuracy_score(kf_y_test, pred_values)
    acc_score.append(acc)
    kfold_report.append(
        classification_report(
            kf_y_test,
            pred_values,
            target_names=target_names,
            digits=4,
            output_dict=True,
        )
    )

average_accuracy = np.mean(acc_score)
max_accuracy = max(acc_score)

In [514]:
# Step 6: Final Testing
# `fit` returns the estimator itself, so `testing_model` is the same object as
# `best_svm_model`, refitted here on the TF-IDF matrix of the full training set.
testing_model = best_svm_model.fit(X_trainval_tfidf, y_trainval)
final_predictions = testing_model.predict(X_test_tfidf)

# NOTE(review): with average='micro' on single-label predictions, precision,
# recall and F1 are all equal to accuracy, so these three scores add no
# information beyond `accuracy_test`. Consider 'macro' or per-class scores.
accuracy_test = accuracy_score(y_true=y_test, y_pred=final_predictions)
precision_test = precision_score(y_true=y_test, y_pred=final_predictions, average='micro')
recall_test = recall_score(y_true=y_test, y_pred=final_predictions, average='micro')
f1_test = f1_score(y_true=y_test, y_pred=final_predictions, average='micro')

# Print the results
print("Average accuracy during cross-validation:", average_accuracy)
print("Accuracy on the test set:", accuracy_test)
print("Precision on the test set:", precision_test)
print("Recall on the test set:", recall_test)
print("F1 score on the test set:", f1_test)
# print(classification_report(y_true=y_test, y_pred=final_predictions, digits=4, output_dict=True))

Average accuracy during cross-validation: 0.7949999999999999
Accuracy on the test set: 0.805
Precision on the test set: 0.805
Recall on the test set: 0.805
F1 score on the test set: 0.805
