# Model Training

In [1]:
%load_ext autoreload
%autoreload 2

import common
import pandas as pd
import scipy.sparse
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,StackingClassifier,GradientBoostingClassifier,VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import wandb
import numpy as np
import os

In [2]:
# Authenticate this session with Weights & Biases; the runs opened further
# down with wandb.init() rely on this login.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnontaphat-c[0m ([33mnontaphat-c-nus[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Parameters

In [None]:
# change this accordingly
# Resolves to the parent directory of the current working directory.
project_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Directory the train/test CSVs and the label-encoder pickle are read from.
save_path = f"{project_path}/scicite_preprocessed"
# Feature-set names: each is interpolated into "train-<name>.csv" and
# "test-<name>.csv" when the data is loaded.
dataset = "selected-features"
# NOTE(review): 'selecetd' looks like a typo for 'selected' — confirm against
# the actual file names on disk before changing it, since this string is part
# of the CSV paths.
dataset2 = 'selecetd-features-glove'
# Prefix of the W&B project name; the runs append "-<dataset>" to it.
project_name = "model-comparison-smote-ensemble-manual-voting(2-rf)"

## 1. Load dataset

In [4]:
# Read the first feature set (train and test tables) plus the saved label encoder.
train_df = pd.read_csv(f"{save_path}/train-{dataset}.csv")
test_df = pd.read_csv(f"{save_path}/test-{dataset}.csv")
label_encoder = joblib.load(f"{save_path}/label_encoder.pkl")

# Separate the feature columns from the 'label' target for each split.
X_train, y_train = train_df.drop(columns=['label']), train_df["label"]
X_test, y_test = test_df.drop(columns=['label']), test_df["label"]

# Apply SMOTE to the training split only; the test split is left as loaded.
smote = SMOTE(
    sampling_strategy='all',
    random_state=42,
)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Read the second feature set (train and test tables). The label encoder is
# loaded from the same pickle file as `label_encoder`.
train_df2 = pd.read_csv(f"{save_path}/train-{dataset2}.csv")
test_df2 = pd.read_csv(f"{save_path}/test-{dataset2}.csv")
label_encoder2 = joblib.load(f"{save_path}/label_encoder.pkl")

# Separate the feature columns from the 'label' target for each split.
X_train2, y_train2 = train_df2.drop(columns=['label']), train_df2["label"]
X_test2, y_test2 = test_df2.drop(columns=['label']), test_df2["label"]

# Apply SMOTE to the training split only; the test split is left as loaded.
smote2 = SMOTE(
    sampling_strategy='all',
    random_state=42,
)
X_train2, y_train2 = smote2.fit_resample(X_train2, y_train2)

## 2. Train-validation split

In [5]:
# (Disabled) Split into 80% train and 20% validation sets. The call below is commented out, so no validation set is created.
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## 4. Train Random Forest Classifier

In [None]:
# Fit a 100-tree random forest on the resampled first feature set, then
# collect class probabilities and hard predictions for its test split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf_prob = rf_model.predict_proba(X_test)
y_pred_rf = rf_model.predict(X_test)

# Run the project's evaluation helper inside a dedicated W&B run.
wandb.init(
    entity='cs4248-nlp-project',
    project=f"{project_name}-{dataset}",
    name="random-forest-classifier-selected-features",
)
common.evaluate(y_test, y_pred_rf, label_encoder, "Random Forest Classifier selected-features")
wandb.finish()

In [None]:
# Fit the second random forest on the second (GloVe) feature set.
rf_model2 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model2.fit(X_train2, y_train2)

# Predict on the test split of the SAME feature set the model was trained on.
# The previous version passed X_test (the first feature set), so the model was
# scored on columns it was never fitted to and the probabilities fed to the
# voting step were wrong.
y_pred_rf2 = rf_model2.predict(X_test2)
y_pred_rf_prob2 = rf_model2.predict_proba(X_test2)

# The project name intentionally keeps the first dataset's suffix so that all
# runs of this comparison land in the same W&B project.
wandb.init(entity='cs4248-nlp-project',project=f"{project_name}-{dataset}", name="random-forest-classifier-selected_features-glove")
# Evaluate this model's own predictions against its own labels. The previous
# version passed y_pred_rf, so this run reported the first model's metrics.
common.evaluate(y_test2, y_pred_rf2, label_encoder2, "Random Forest Classifier selected-features-glove")
wandb.finish()

In [None]:
# Manual soft voting: average the two forests' class-probability matrices and
# pick the highest-scoring column for each row.
# NOTE(review): assumes both test sets contain the same samples in the same
# order, and that the column index equals the encoded label — confirm.
avg_probs = 0.5 * (y_pred_rf_prob + y_pred_rf_prob2)
y_pred_voted = avg_probs.argmax(axis=1)

In [None]:
# Run the project's evaluation helper on the voted predictions inside a
# dedicated W&B run.
wandb.init(
    entity='cs4248-nlp-project',
    project=f"{project_name}-{dataset}",
    name="random-forest-classifier-selected_features-voted",
)
common.evaluate(
    y_test,
    y_pred_voted,
    label_encoder,
    "Random Forest Classifier selected-features-voted",
)
wandb.finish()