Nama: Asyifa Nurul Shafira

NIM: 13521125

# Preprocessing

In [20]:
import pandas as pd
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

# Load dataset
train_df = pd.read_csv('../data/train_preprocess.tsv', delimiter='\t', header=None, names=['text', 'label'])
test_df = pd.read_csv('../data/test_preprocess.tsv', delimiter='\t', header=None, names=['text', 'label'])

# Set stopwords untuk Bahasa Indonesia
stop_words = set(stopwords.words('indonesian'))

# Fungsi preprocessing
def preprocess_text(text):
    # Mengubah teks menjadi huruf kecil
    text = text.lower()
    # Memecah teks menjadi token
    tokens = text.split()
    # Menghapus stop-word
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Mengaplikasikan preprocessing pada data latih dan data uji
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asyifashfr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Ekstraksi Fitur

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Memisahkan data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(train_df['text'], train_df['label'], test_size=0.2, random_state=42)

# Menggunakan CountVectorizer untuk mengubah teks menjadi representasi numerik berbasis frekuensi kata (Bag of Words)
vectorizer = CountVectorizer()

# Model

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Penyeimbangan data dengan RandomOversampler untuk kelas minoritas
oversampler = RandomOverSampler(random_state=42)

# Pipeline Naive Bayes dengan oversampling
nb_pipeline = ImbPipeline([
    ('vectorizer', vectorizer),
    ('oversampler', oversampler),
    ('classifier', MultinomialNB())
])

# Pipeline Logistic Regression dengan GridSearchCV dan oversampling
logreg_pipeline = ImbPipeline([
    ('vectorizer', vectorizer),
    ('oversampler', oversampler),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Hyperparameter tuning untuk Logistic Regression
logreg_param_grid = {
    'classifier__C': [0.1, 1, 10],  # Mengatur tingkat regularisasi C, nilai kecil memberikan regularisasi lebih besar
    'classifier__solver': ['liblinear', 'lbfgs'] # Solver yang digunakan untuk menghitung Logistic Regression
}
logreg_grid = GridSearchCV(logreg_pipeline, logreg_param_grid, cv=5) # Melakukan pencarian parameter dengan cross-validation 5-fold

# Pipeline SVM dengan oversampling
svm_pipeline = ImbPipeline([
    ('vectorizer', vectorizer),
    ('oversampler', oversampler),
    ('classifier', SVC())
])

# Pelatihan dan Evaluasi

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Fungsi untuk melatih model dan mengevaluasinya
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    # Melatih model dengan data training
    model.fit(X_train, y_train)

    # Prediksi data test
    y_pred = model.predict(X_test)

    # Menghitung metrik evaluasi
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"\n=== {model_name} Results ===")
    print(f"Accuracy: {accuracy}")
    print(f"Precision (weighted): {precision}")
    print(f"Recall (weighted): {recall}")
    print(f"F1-Score (weighted): {f1}\n")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("="*50)

# Evaluasi model Naive Bayes
evaluate_model(nb_pipeline, X_train, X_test, y_train, y_test, "Naive Bayes")

# Evaluasi model Logistic Regression setelah tuning hyperparameter dengan Grid Search
logreg_grid.fit(X_train, y_train) # Melakukan tuning
evaluate_model(logreg_grid, X_train, X_test, y_train, y_test, "Logistic Regression (with tuning)")

# Evaluasi model SVM
evaluate_model(svm_pipeline, X_train, X_test, y_train, y_test, "SVM")


=== Naive Bayes Results ===
Accuracy: 0.8313636363636364
Precision (weighted): 0.8303951673296521
Recall (weighted): 0.8313636363636364
F1-Score (weighted): 0.8306650149195586

Classification Report:
               precision    recall  f1-score   support

    negative       0.78      0.74      0.76       680
     neutral       0.74      0.76      0.75       239
    positive       0.87      0.89      0.88      1281

    accuracy                           0.83      2200
   macro avg       0.80      0.80      0.80      2200
weighted avg       0.83      0.83      0.83      2200


=== Logistic Regression (with tuning) Results ===
Accuracy: 0.8418181818181818
Precision (weighted): 0.8466360088430017
Recall (weighted): 0.8418181818181818
F1-Score (weighted): 0.8434669697208951

Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.81      0.79       680
     neutral       0.72      0.80      0.76       239
    positive       0.91      0