# Model Training

In [1]:
%load_ext autoreload
%autoreload 2

import common
import pandas as pd
import scipy.sparse
import joblib
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import wandb
import pickle
import os

In [4]:
# Authenticate with Weights & Biases up front; later cells call wandb.init() to log runs.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnontaphat-c[0m ([33mnontaphat-c-nus[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
def evaluate(y_true,y_pred,digits=5):
    """Print a classification report and render the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.
    digits : int, default=5
        Number of decimal places shown in the printed report.

    Returns
    -------
    None
        The report goes to stdout and the figure is shown via ``plt.show()``.
    """
    report = classification_report(y_true, y_pred, digits=digits)
    print(report)

    # Compute the matrix, wrap it in the display helper and draw it in one go.
    ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred)).plot()
    plt.show()

## Parameters

In [6]:
# Project root. Defaults to the parent of the current working directory;
# change this accordingly if the notebook is run from somewhere else.
project_path = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)

# Directory the train/test CSVs and the label encoder are read from.
save_path = project_path + "/scicite_preprocessed"

# Feature-set tag: part of the CSV file names and of the W&B project name.
dataset = "selected-features"

# Prefix of the W&B project name (the dataset tag is appended when a run starts).
project_name = "model-comparison-smote-svc-fine-tuned"

## 1. Load dataset

In [7]:
# Read the pre-processed splits and the label encoder stored next to them.
train_df = pd.read_csv(f"{save_path}/train-{dataset}.csv")
test_df = pd.read_csv(f"{save_path}/test-{dataset}.csv")
label_encoder = joblib.load(f"{save_path}/label_encoder.pkl")

# Separate the target column from the feature columns for each split.
X_train, y_train = train_df.drop(columns=['label']), train_df["label"]
X_test, y_test = test_df.drop(columns=['label']), test_df["label"]

# Oversample the training split only; the test split is left untouched.
# sampling_strategy='all' resamples every class; the fixed seed keeps the
# synthetic samples reproducible across runs.
smote = SMOTE(sampling_strategy='all', random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

## Load fine-tuned model

In [None]:
# Load the fine-tuned base models that the ensemble is built from.
#
# NOTE(review): both paths are absolute locations on one machine's E: drive and
# must be edited before the notebook can run elsewhere. The "fine-tuend"
# spelling is presumably the file's actual name on disk -- confirm before renaming.
# pickle.load can execute arbitrary code from the file; only load trusted pickles.
log_model_path = r"E:\cs4248_proj\CS4248-NLP-Project\model\fine-tuend-log-reg.pkl"
rf_model_path = r"E:\cs4248_proj\CS4248-NLP-Project\model\fine-tuned-rf.pkl"

# Fine-tuned logistic regression.
with open(log_model_path, "rb") as f:
    log_model = pickle.load(f)

# Fine-tuned random forest.
with open(rf_model_path, "rb") as f:
    rf_model = pickle.load(f)

# Ensemble Classifier

In [None]:

# Base learners for the stack, as (name, model) pairs built from the models loaded above.
estimators = [
    ('logistic', log_model),
    ('random_forest', rf_model),
]

# Stack the base models under a logistic-regression meta-classifier.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000000),
    # 'prefit' means the base models are taken as already fitted and are not
    # refitted; fit() only trains the final estimator on their predictions.
    cv='prefit',
    # Per base model: use predict_proba, else decision_function, else predict.
    stack_method="auto",
    # The meta-classifier sees only base-model predictions, not the raw features.
    passthrough=False,
    verbose=1,
)

# Train the meta-classifier on the training split, then predict the test split.
stacking_clf.fit(X_train, y_train)
y_pred = stacking_clf.predict(X_test)

# Evaluate inside a W&B run; the same label is used for the run name and as
# the last argument passed to common.evaluate.
run_name = "Trained-ensemble(log,rf)_log_prefit"
wandb.init(
    entity='cs4248-nlp-project',
    project=f"{project_name}-{dataset}",
    name=run_name,
)
common.evaluate(y_test, y_pred, label_encoder, run_name)
wandb.finish()


# Fine-tuned ensemble

In [None]:
# Grid for the stacking classifier: a single meta-classifier candidate, tried
# with and without passing the raw features through to it.
params = {
    "final_estimator": [LogisticRegression(max_iter=1000000)],
    'passthrough': [True, False],
}

In [None]:
# Stacking template for the search; `final_estimator` and `passthrough` are
# supplied by the grid in `params`.
model = StackingClassifier(
    estimators=estimators,
    cv='prefit',          # base models are taken as already fitted, not refitted
    stack_method="auto",  # predict_proba, else decision_function, else predict
    verbose=1,
)

# NOTE(review): GridSearchCV clones `model` for every candidate, and sklearn's
# clone() also clones the nested base estimators, which drops their fitted
# state. Combined with cv='prefit' this is expected to raise NotFittedError on
# each fit -- confirm on the installed scikit-learn version. On >= 1.6,
# wrapping the base models in sklearn.frozen.FrozenEstimator avoids it.
model = GridSearchCV(model, params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Local report and confusion-matrix plot.
evaluate(y_test, y_pred)

# Evaluate inside a W&B run. The run name and title previously carried the
# logistic-regression labels copied from another cell, which filed the
# ensemble's results under the wrong model.
wandb.init(
    entity='cs4248-nlp-project',
    project=f"{project_name}-{dataset}",
    name="ensemble(log,rf)-fine-tuned",
)
common.evaluate(y_test, y_pred, label_encoder, "Ensemble (log, rf) Classifier fine tuned")
wandb.finish()

# Persist the fitted search object.
# NOTE(review): absolute, machine-specific path; edit before running elsewhere.
with open(r"E:\cs4248_proj\CS4248-NLP-Project\model\fine-tuned-ensemble.pkl", "wb") as f:
    # pickle.dump needs the target file object; without it the call raises
    # TypeError and the file is left empty.
    pickle.dump(model, f)