In [None]:
# import libraries

import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from xgboost import XGBClassifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Read the review sentiment table from its CSV file into a DataFrame

review_df = pd.read_csv(filepath_or_buffer='data/review_sentiment.csv')

# Leave the frame as the last expression so the notebook renders it
review_df

In [None]:
# Registry of the pre-computed feature matrices, keyed by a stable integer id
# that the rest of the notebook uses to index `features`.
feature_set = {
    0: 'bag_of_words',
    1: 'one_hot',
    2: 'n_grams',
    3: 'tf_idf',
    4: 'word2vec',
    5: 'combined_features'
}

# Feature sets saved as dense NumPy arrays (.npy). Every other set is loaded
# as a SciPy sparse matrix (.npz). Selecting by name instead of the magic
# index 4 keeps the loader correct if the ids are ever renumbered.
dense_feature_names = {'word2vec'}

# Load all feature sets
features = {}
for key, feature_name in feature_set.items():
    if feature_name in dense_feature_names:
        features[key] = np.load('features/' + feature_name + '.npy')
    else:
        features[key] = sparse.load_npz('features/' + feature_name + '.npz')

    # Fail fast with a readable message: the labels are taken from review_df,
    # so a feature matrix with a different row count cannot be split together
    # with them later on.
    assert features[key].shape[0] == len(review_df), (
        f"{feature_name}: {features[key].shape[0]} feature rows "
        f"but {len(review_df)} reviews"
    )

In [None]:
# Target vector: the 'sentiment' column of review_df as a NumPy array

y = review_df.loc[:, 'sentiment'].to_numpy()

# Shape check: there is one label per row of review_df
y.shape

In [None]:
# Convenience aliases for two of the loaded feature matrices:
# id 0 is 'bag_of_words' and id 4 is 'word2vec' in feature_set
bow_features, word2vec_features = features[0], features[4]


In [None]:
# Hold out 20% of the word2vec rows (with their labels) as a test set;
# the fixed random_state makes the split repeatable
(X_train_w2v, X_test_w2v,
 y_train, y_test) = train_test_split(
    word2vec_features,
    y,
    test_size=0.20,
    random_state=42,
)


Classifiers

In [None]:
# Baseline (untuned) classifiers, keyed by a short name.
#
# Every estimator with a stochastic component is given an explicit
# random_state so repeated runs produce the same scores, and the
# MultinomialNB key is spelled 'multinomial_nb' to match the key used
# in best_classifiers.

classifiers = {
    'gaussian_nb': GaussianNB(),
    'multinomial_nb': MultinomialNB(),
    'decision_tree': DecisionTreeClassifier(random_state=0),
    'random_forest': RandomForestClassifier(random_state=0),
    'svm': SVC(),
    'perceptron': Perceptron(tol=1e-3, random_state=0),
    'xgb': XGBClassifier(random_state=0),
    'logistic_regression': LogisticRegression(max_iter=1000, random_state=0)
}

In [None]:
# Hyper-parameter search spaces, keyed by classifier name.
# Only the XGBoost and logistic-regression grids are active.
param_grids = {}

param_grids['xgb'] = {
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
}

param_grids['logistic_regression'] = {
    'C': [0.1, 1.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# Grids kept for reference but currently disabled:
#   'gaussian_nb':   {}
#   'decision_tree': {'max_depth': [None, 10, 20]}
#   'random_forest': {'n_estimators': [100, 200], 'max_depth': [None, 10]}
#   'svm':           {'C': [0.1, 1.0], 'kernel': ['linear', 'rbf']}
#   'perceptron':    {'alpha': [0.0001, 0.001], 'penalty': [None, 'l2']}


In [None]:

# Wrap the label arrays in pandas Series so that .map() is available
y_train_series, y_test_series = pd.Series(y_train), pd.Series(y_test)

# Re-encode the labels as consecutive integers: -1 -> 0, 0 -> 1, 1 -> 2
y_train_mapped, y_test_mapped = (
    labels.map({-1: 0, 0: 1, 1: 2})
    for labels in (y_train_series, y_test_series)
)


Cross validation

In [None]:
# Disabled cell: 5-fold cross-validation of every entry in `classifiers` on the
# word2vec training split, followed by a printout of the mean accuracy and
# macro precision / recall / F1 per classifier. XGBoost is scored against the
# remapped labels (y_train_mapped); every other classifier uses y_train.
#
# NOTE(review): `classifiers` contains a MultinomialNB, which rejects negative
# feature values; word2vec embeddings usually contain negatives, so that entry
# would presumably fail here - confirm before re-enabling this cell.

# cv_scores = {}

# # Cross-validation for each classifier
# for clf_name, clf in classifiers.items():
#     print(f"Cross-validating {clf_name}...")
#     if clf_name == 'xgb':
#         cv_results = cross_validate(clf, X_train_w2v, y_train_mapped, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'], cv=5)

#     else:
#         cv_results = cross_validate(clf, X_train_w2v, y_train, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'], cv=5)

#     cv_scores[clf_name] = cv_results

# for clf_name, cv_result in cv_scores.items():
#     print(f"Classifier: {clf_name}")
#     print("Accuracy:", cv_result['test_accuracy'].mean())
#     print("Precision (Macro):", cv_result['test_precision_macro'].mean())
#     print("Recall (Macro):", cv_result['test_recall_macro'].mean())
#     print("F1 Score (Macro):", cv_result['test_f1_macro'].mean())
#     print("-------------------------")

Grid search

In [None]:
# Disabled cell: 3-fold GridSearchCV (scored on accuracy) for every entry in
# `classifiers` on the word2vec training split, storing the best estimator per
# classifier and printing its best parameters and score. The two branches
# differ only in the labels used: XGBoost is fitted on y_train_mapped, every
# other classifier on y_train.
#
# NOTE(review): param_grids currently defines only 'xgb' and
# 'logistic_regression', so `param_grids[clf_name]` would raise KeyError for
# every other key of `classifiers`. Restore the remaining grids, or iterate
# over param_grids instead, before re-enabling this cell.

# best_estimators = {}

# # Apply grid search to each classifier using word2vec features
# for clf_name, clf in classifiers.items():
#     print(f"Grid search for {clf_name}")
#     param_grid = param_grids[clf_name]
#     if clf_name == 'xgb':
#         grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=3)
#         grid_search.fit(X_train_w2v, y_train_mapped)
#     else:
#         grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=3)
#         grid_search.fit(X_train_w2v, y_train)
#     best_estimators[clf_name] = grid_search.best_estimator_
#     print(f"Best parameters for {clf_name}: {grid_search.best_params_}")
#     print(f"Best accuracy for {clf_name}: {grid_search.best_score_}")
#     print("---------------------------------------------")


In [None]:
# Classifiers configured with fixed hyper-parameters for the final evaluation.
#
# Estimators with a stochastic component (XGBoost, decision tree, random
# forest, liblinear logistic regression) get an explicit random_state so the
# reported accuracies are reproducible from one run to the next.
best_classifiers = {
    'xgb': XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=200, random_state=0),
    'gaussian_nb': GaussianNB(),
    'multinomial_nb': MultinomialNB(),
    'decision_tree': DecisionTreeClassifier(max_depth=10, random_state=0),
    'random_forest': RandomForestClassifier(max_depth=10, n_estimators=200, random_state=0),
    'svm': SVC(C=1.0, kernel='rbf'),
    'perceptron': Perceptron(tol=1e-3, random_state=0, alpha=0.0001, penalty=None),
    'logistic_regression': LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=1000, random_state=0)
}


In [None]:
# Evaluate every classifier in best_classifiers on every feature set.
#
# For each feature set the data is split 80/20 with a fixed seed, the training
# part is randomly oversampled to balance the classes, each classifier is
# fitted on the oversampled training data, and its accuracy on the untouched
# test part is printed.
# (accuracy_score is already imported in the notebook's import cell.)

# Loop through each feature set
for feature_key, feature_name in feature_set.items():

    X_train, X_test, y_train, y_test = train_test_split(features[feature_key], y, test_size=0.20, random_state=42)

    # Oversample to balance the classes (training data only)
    oversampler = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

    # XGBoost needs class labels encoded as 0..n_classes-1, so the labels
    # are remapped -1 -> 0, 0 -> 1, 1 -> 2 for that classifier only
    y_train_resampled_series = pd.Series(y_train_resampled)
    y_test_series = pd.Series(y_test)
    y_train_resampled_mapped = y_train_resampled_series.map({-1: 0, 0: 1, 1: 2})
    y_test_mapped = y_test_series.map({-1: 0, 0: 1, 1: 2})

    # MultinomialNB raises ValueError when fitted on negative feature values
    # (which dense embeddings usually contain), so detect that once per
    # feature set. .min() works for NumPy arrays and SciPy sparse matrices.
    has_negative_values = X_train_resampled.min() < 0

    # Only GaussianNB requires dense input; the other classifiers accept
    # sparse matrices directly, which avoids large dense copies.
    features_are_sparse = sparse.issparse(X_train_resampled)

    print(f"Using {feature_name} features:")

    # Loop through each classifier
    for clf_name, clf in best_classifiers.items():
        if clf_name == 'multinomial_nb' and has_negative_values:
            # Skip instead of aborting the whole comparison with a ValueError
            print(f"Skipping {clf_name} with {feature_name} features: "
                  "MultinomialNB requires non-negative feature values.")
            continue

        print(f"Training {clf_name} with {feature_name} features...")

        # Convert sparse to dense only where the estimator needs it
        if clf_name == 'gaussian_nb' and features_are_sparse:
            X_fit = X_train_resampled.toarray()
            X_eval = X_test.toarray()
        else:
            X_fit = X_train_resampled
            X_eval = X_test

        # XGBoost trains and is scored on the remapped labels; every other
        # classifier uses the original labels
        if clf_name == 'xgb':
            y_fit, y_eval = y_train_resampled_mapped, y_test_mapped
        else:
            y_fit, y_eval = y_train_resampled, y_test

        clf.fit(X_fit, y_fit)
        y_pred = clf.predict(X_eval)

        # Evaluate
        acc = accuracy_score(y_eval, y_pred)

        print(f"Accuracy of {clf_name} with {feature_name} features: {acc}")

    print("---------------------------------------------")
