In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
column = ['sentimen', 'komen']


In [3]:
df_training = pd.read_csv('data_training_90.txt', sep='\t', header=None)
df_training.set_axis(column, axis=1, inplace=True)

  df_training.set_axis(column, axis=1, inplace=True)


In [4]:
df_testing = pd.read_csv('data_testing_10.txt', sep='\t', header=None)
df_testing.set_axis(column, axis=1, inplace=True)


  df_testing.set_axis(column, axis=1, inplace=True)


In [5]:
X_trainval, X_test, y_trainval, y_test = df_training['komen'], df_testing['komen'], df_training['sentimen'], df_testing['sentimen']


In [6]:

tfidf_vectorizer = TfidfVectorizer()
X_trainval_tfidf = tfidf_vectorizer.fit_transform(X_trainval)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
feature = pd.DataFrame(X_trainval_tfidf.todense().T,
                       index = tfidf_vectorizer.get_feature_names_out(),
                       columns=[f'D{i+1}' for i in range(len(X_trainval))])






In [7]:
# Step 4: Training and Hyperparameter Tuning
param_grid = {
            #   'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
            #   'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'gamma': [100],
              'C': [0.0001],
              'kernel': ['poly'],
              'max_iter': [10000],
              'degree':[2],
              'random_state' : [42]}
svm_model = SVC()
grid_search = GridSearchCV(svm_model, param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_trainval_tfidf, y_trainval)
best_svm_model = grid_search.best_estimator_
best_svm_model
X_trainval = pd.array(X_trainval)
y_trainval = pd.array(y_trainval)
k_fold = 10
skf = StratifiedKFold(n_splits=k_fold)

target_names = ['negatif', 'positif']
acc_score = []
kfold_report=[]

kf_model = best_svm_model

kf_tfidf_vectorizer = TfidfVectorizer(min_df=2,max_df=0.9)

for train_index, test_index in skf.split(X_trainval, y_trainval):
        kf_x_train, kf_x_test = X_trainval[train_index], X_trainval[test_index]
        kf_y_train, kf_y_test = y_trainval[train_index], y_trainval[test_index]

        # Fit and transform the training data using TF-IDF
        kf_x_train_tfidf = kf_tfidf_vectorizer.fit_transform(kf_x_train)

        # Transform the test data using the fitted TF-IDF vectorizer
        kf_x_test_tfidf = kf_tfidf_vectorizer.transform(kf_x_test)

        # Train the SVM model on the TF-IDF features
        kf_model.fit(kf_x_train_tfidf, kf_y_train)

        pred_values = kf_model.predict(kf_x_test_tfidf)

        acc = accuracy_score(kf_y_test, pred_values)

        # trained_model.append(classifier)
        acc_score.append(acc)
        kfold_report.append(classification_report(kf_y_test, pred_values, target_names=target_names, digits=4, output_dict=True))

average_accuracy = np.mean(acc_score)
max_accuracy = max(acc_score)



# Step 6: Final Testing
testing_model = best_svm_model
testing_model.fit(X_trainval_tfidf, y_trainval)

final_predictions = testing_model.predict(X_test_tfidf)
accuracy_test = accuracy_score(y_test, final_predictions)
precision_test = precision_score(y_test, final_predictions, average='micro')
recall_test = recall_score(y_test, final_predictions, average='micro')
f1_test = f1_score(y_test, final_predictions, average='micro')

# Print the results
print("Average accuracy during cross-validation:", average_accuracy)
print("Accuracy on the test set:", accuracy_test)
print("Precision on the test set:", precision_test)
print("Recall on the test set:", recall_test)
print("F1 score on the test set:", f1_test)
print(classification_report(y_true=y_test, y_pred=final_predictions, digits=4, output_dict=True))

Average accuracy during cross-validation: 0.8722222222222223
Accuracy on the test set: 0.925
Precision on the test set: 0.925
Recall on the test set: 0.925
F1 score on the test set: 0.925
{'0': {'precision': 1.0, 'recall': 0.85, 'f1-score': 0.9189189189189189, 'support': 20}, '1': {'precision': 0.8695652173913043, 'recall': 1.0, 'f1-score': 0.9302325581395349, 'support': 20}, 'accuracy': 0.925, 'macro avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40}, 'weighted avg': {'precision': 0.9347826086956521, 'recall': 0.925, 'f1-score': 0.9245757385292268, 'support': 40}}


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd


# Hyperparameter Tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [42]
}

rf_model = RandomForestClassifier()
grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1)  # Reduced to 5 folds
grid_search.fit(X_trainval_tfidf, y_trainval)
best_rf_model = grid_search.best_estimator_

# Cross-validation using StratifiedKFold
k_fold = 5
skf = StratifiedKFold(n_splits=k_fold)

target_names = ['negatif', 'positif']
acc_score = []
kfold_report = []

kf_model = best_rf_model

for train_index, test_index in skf.split(X_trainval_tfidf, y_trainval):
    kf_x_train, kf_x_test = X_trainval_tfidf[train_index], X_trainval_tfidf[test_index]
    kf_y_train, kf_y_test = y_trainval[train_index], y_trainval[test_index]

    # Train the Random Forest model on the TF-IDF features
    kf_model.fit(kf_x_train, kf_y_train)

    pred_values = kf_model.predict(kf_x_test)

    acc = accuracy_score(kf_y_test, pred_values)

    acc_score.append(acc)
    kfold_report.append(classification_report(kf_y_test, pred_values, target_names=target_names, digits=4, output_dict=True))

average_accuracy = np.mean(acc_score)
max_accuracy = max(acc_score)

# Final Testing
final_predictions = kf_model.predict(X_test_tfidf)
accuracy_test = accuracy_score(y_test, final_predictions)
precision_test = precision_score(y_test, final_predictions, average='micro')
recall_test = recall_score(y_test, final_predictions, average='micro')
f1_test = f1_score(y_test, final_predictions, average='micro')

# Print the results
print("Average accuracy during cross-validation:", average_accuracy)
print("Accuracy on the test set:", accuracy_test)
print("Precision on the test set:", precision_test)
print("Recall on the test set:", recall_test)
print("F1 score on the test set:", f1_test)
print(classification_report(y_true=y_test, y_pred=final_predictions, digits=4, target_names=target_names, output_dict=True))


Average accuracy during cross-validation: 0.8416666666666668
Accuracy on the test set: 0.825
Precision on the test set: 0.825
Recall on the test set: 0.825
F1 score on the test set: 0.825
{'negatif': {'precision': 0.8823529411764706, 'recall': 0.75, 'f1-score': 0.8108108108108107, 'support': 20}, 'positif': {'precision': 0.782608695652174, 'recall': 0.9, 'f1-score': 0.8372093023255814, 'support': 20}, 'accuracy': 0.825, 'macro avg': {'precision': 0.8324808184143222, 'recall': 0.825, 'f1-score': 0.824010056568196, 'support': 40}, 'weighted avg': {'precision': 0.8324808184143222, 'recall': 0.825, 'f1-score': 0.824010056568196, 'support': 40}}


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_trainval)
X_trainval_sequences = tokenizer.texts_to_sequences(X_trainval)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure equal length
max_len = max(max(len(seq) for seq in X_trainval_sequences), max(len(seq) for seq in X_test_sequences))
X_trainval_padded = pad_sequences(X_trainval_sequences, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len, padding='post')

# Convert Pandas Series to NumPy arrays
X_trainval_padded = np.array(X_trainval_padded)
X_test_padded = np.array(X_test_padded)
y_trainval = np.array(y_trainval)
y_test = np.array(y_test)

# Define the RNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_len))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_trainval_padded, y_trainval, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate the model on the test set
final_predictions_prob = model.predict(X_test_padded)
final_predictions = np.round(final_predictions_prob)

# Convert predictions to binary (assuming binary classification)
final_predictions = np.round(final_predictions)

# Print the results
print("Accuracy on the test set:", accuracy_score(y_test, final_predictions))
print("Precision on the test set:", precision_score(y_test, final_predictions, average='micro'))
print("Recall on the test set:", recall_score(y_test, final_predictions, average='micro'))
print("F1 score on the test set:", f1_score(y_test, final_predictions, average='micro'))
print(classification_report(y_true=y_test, y_pred=final_predictions, digits=4, output_dict=True))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy on the test set: 0.5
Precision on the test set: 0.5
Recall on the test set: 0.5
F1 score on the test set: 0.5
{'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 20}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 20}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 40}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 40}}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Logistic Regression model
logistic_regression_model = LogisticRegression(max_iter=1000, random_state=42)

# Cross-validation
k_fold = 10
skf = StratifiedKFold(n_splits=k_fold)

target_names = ['negatif', 'positif']
acc_score = []
kfold_report = []

for train_index, test_index in skf.split(X_trainval, y_trainval):
    kf_x_train, kf_x_test = X_trainval[train_index], X_trainval[test_index]
    kf_y_train, kf_y_test = y_trainval[train_index], y_trainval[test_index]

    # Fit and transform the training data using TF-IDF
    kf_x_train_tfidf = tfidf_vectorizer.transform(kf_x_train)

    # Transform the test data using the fitted TF-IDF vectorizer
    kf_x_test_tfidf = tfidf_vectorizer.transform(kf_x_test)

    # Train the Logistic Regression model on the TF-IDF features
    logistic_regression_model.fit(kf_x_train_tfidf, kf_y_train)

    # Predictions on the test set
    pred_values = logistic_regression_model.predict(kf_x_test_tfidf)

    # Calculate accuracy
    acc = accuracy_score(kf_y_test, pred_values)

    # Append accuracy score and classification report to lists
    acc_score.append(acc)
    kfold_report.append(classification_report(kf_y_test, pred_values, target_names=target_names, digits=4, output_dict=True))

# Calculate average accuracy and maximum accuracy
average_accuracy = np.mean(acc_score)
max_accuracy = max(acc_score)

# Final Testing
final_predictions = logistic_regression_model.predict(X_test_tfidf)

accuracy_test = accuracy_score(y_test, final_predictions)
precision_test = precision_score(y_test, final_predictions, average='micro')
recall_test = recall_score(y_test, final_predictions, average='micro')
f1_test = f1_score(y_test, final_predictions, average='micro')

# Print the results
print("Average accuracy during cross-validation:", average_accuracy)
print("Accuracy on the test set:", accuracy_test)
print("Precision on the test set:", precision_test)
print("Recall on the test set:", recall_test)
print("F1 score on the test set:", f1_test)
print(classification_report(y_true=y_test, y_pred=final_predictions, digits=4, output_dict=True))


Average accuracy during cross-validation: 0.8777777777777779
Accuracy on the test set: 0.875
Precision on the test set: 0.875
Recall on the test set: 0.875
F1 score on the test set: 0.875
{'0': {'precision': 0.9411764705882353, 'recall': 0.8, 'f1-score': 0.8648648648648648, 'support': 20}, '1': {'precision': 0.8260869565217391, 'recall': 0.95, 'f1-score': 0.8837209302325583, 'support': 20}, 'accuracy': 0.875, 'macro avg': {'precision': 0.8836317135549872, 'recall': 0.875, 'f1-score': 0.8742928975487115, 'support': 40}, 'weighted avg': {'precision': 0.8836317135549873, 'recall': 0.875, 'f1-score': 0.8742928975487114, 'support': 40}}


In [11]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Assuming X_trainval, y_trainval, X_test, y_test are your text data and labels

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_trainval)
X_trainval_sequences = tokenizer.texts_to_sequences(X_trainval)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure equal length
max_len = max(max(len(seq) for seq in X_trainval_sequences), max(len(seq) for seq in X_test_sequences))
X_trainval_padded = pad_sequences(X_trainval_sequences, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len, padding='post')

# Convert labels to one-hot encoding
y_trainval_one_hot = to_categorical(y_trainval)
y_test_one_hot = to_categorical(y_test)

# Define the CNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_len))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(2, activation='softmax'))  # Assuming binary classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_trainval_padded, y_trainval_one_hot, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate the model on the test set
final_predictions_prob = model.predict(X_test_padded)
final_predictions = np.argmax(final_predictions_prob, axis=1)

accuracy_test = accuracy_score(y_test, final_predictions)
precision_test = precision_score(y_test, final_predictions, average='micro')
recall_test = recall_score(y_test, final_predictions, average='micro')
f1_test = f1_score(y_test, final_predictions, average='micro')

# Print the results
print("Accuracy on the test set:", accuracy_test)
print("Precision on the test set:", precision_test)
print("Recall on the test set:", recall_test)
print("F1 score on the test set:", f1_test)
print(classification_report(y_true=y_test, y_pred=final_predictions, digits=4, output_dict=True))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy on the test set: 0.825
Precision on the test set: 0.825
Recall on the test set: 0.825
F1 score on the test set: 0.825
{'0': {'precision': 0.8095238095238095, 'recall': 0.85, 'f1-score': 0.8292682926829269, 'support': 20}, '1': {'precision': 0.8421052631578947, 'recall': 0.8, 'f1-score': 0.8205128205128205, 'support': 20}, 'accuracy': 0.825, 'macro avg': {'precision': 0.8258145363408521, 'recall': 0.825, 'f1-score': 0.8248905565978737, 'support': 40}, 'weighted avg': {'precision': 0.825814536340852, 'recall': 0.825, 'f1-score': 0.8248905565978737, 'support': 40}}
