## 1. Project Setup

**Load Data & Packages**

In [None]:
import numpy as np
from sklearn.naive_bayes import ComplementNB, GaussianNB, BernoulliNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import os
import analyze_k
# Seed used as random_state for the out-of-the-box random forest further down
SEED = 12

# Switch the working directory to the parent folder before the local imports
# and the relative data path below.
# NOTE(review): presumably the project modules and Kaggle_download/ live in the
# parent folder -- confirm. Re-running this cell moves up one more directory
# each time, because it always takes the parent of the *current* directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)

# Project-local helpers, imported only after the directory change above
import load_data as ld
from evaluate_classification import evaluate_classification
import loops

# Load the training frame together with the train / validation index sets that
# every loops.loop_model call below receives
df, train_indices, valid_indices = ld.load_train_data(filepath = 'Kaggle_download/train.csv')
# Shared scaler instance handed to loop_model via scaler=... in later cells
scaler = MinMaxScaler()

# Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')

In [None]:
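# TODO(review): X_train / y_train are not used by the loops.loop_model cells, but the "Two Stage Binary Model" cell at the end of the notebook references them -- confirm before deleting the lines below.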
# X_train = df.drop(columns="Target")
# y_train = df.loc[:, 'Target']

### Out of the Box

#### Naive Bayes

In [None]:
# Baseline run: Complement Naive Bayes with default settings, passed through
# loops.loop_model with the predefined train / validation indices.
nb = ComplementNB()
nb_results = loops.loop_model(
    nb,
    df,
    train_indices,
    valid_indices,
    inc_cm=False,
)

In [None]:
# Summarise the baseline Naive Bayes results with analyze_k.average_outcome;
# the bare name on the last line makes the notebook display the summary.
avg_nb = analyze_k.average_outcome(nb_results)
avg_nb

#### KNN

In [None]:
# Baseline run: 15-nearest-neighbours classifier with distance-weighted votes,
# passed through loops.loop_model with the predefined indices.
knn = KNeighborsClassifier(
    n_neighbors=15,
    weights='distance',
)
knn_results = loops.loop_model(
    knn,
    df,
    train_indices,
    valid_indices,
    inc_cm=False,
)

In [None]:
# Summarise the baseline KNN results with analyze_k.average_outcome;
# the bare name on the last line makes the notebook display the summary.
avg_knn = analyze_k.average_outcome(knn_results)
avg_knn

#### Random Forest

In [None]:
# Baseline run: random forest with default hyper-parameters; only the random
# state is pinned (to SEED) before handing the model to loops.loop_model.
clf = RandomForestClassifier(random_state=SEED)
clf_results = loops.loop_model(
    clf,
    df,
    train_indices,
    valid_indices,
    inc_cm=False,
)


In [None]:
# Summarise the baseline random forest results with analyze_k.average_outcome;
# the bare name on the last line makes the notebook display the summary.
avg_clf = analyze_k.average_outcome(clf_results)
avg_clf

#### Logistic Regression

In [None]:
# Baseline run: L2-penalised logistic regression using the liblinear solver,
# passed through loops.loop_model with the predefined indices.
lr = LogisticRegression(
    solver='liblinear',
    penalty='l2',
)
lr_results = loops.loop_model(
    lr,
    df,
    train_indices,
    valid_indices,
    inc_cm=False,
)

In [None]:
# Summarise the baseline logistic regression results with
# analyze_k.average_outcome; the bare name on the last line makes the notebook
# display the summary.
avg_lr = analyze_k.average_outcome(lr_results)
avg_lr

### Final Models

#### Naive Bayes

In [None]:
# Final Naive Bayes configuration: a fresh ComplementNB run with
# ld.gen_SMOTE_data as the oversampler, the shared scaler, and var_thresh
# switched off.
nb = ComplementNB()
nb_results = loops.loop_model(
    nb,
    df,
    train_indices,
    valid_indices,
    oversample=ld.gen_SMOTE_data,
    var_thresh=False,
    scaler=scaler,
    inc_cm=False,
)

In [None]:
# Summarise the final Naive Bayes results with analyze_k.average_outcome
# (this overwrites the baseline avg_nb) and display the summary.
avg_nb = analyze_k.average_outcome(nb_results)
avg_nb

In [None]:
# Final KNN configuration: same 15-neighbour, distance-weighted classifier as
# the baseline, now run with the shared scaler and var_thresh switched on.
knn = KNeighborsClassifier(
    n_neighbors=15,
    weights='distance',
)
knn_results = loops.loop_model(
    knn,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
    var_thresh=True,
    inc_cm=False,
)

In [None]:
# Summarise the final KNN results with analyze_k.average_outcome
# (this overwrites the baseline avg_knn) and display the summary.
avg_knn = analyze_k.average_outcome(knn_results)
avg_knn

In [None]:
# Use the seed value from Random_Forest, where the extensive testing occurred.
# A later cell switches SEED back to its original value.
SEED = 0

# SMOTE-generated training data, seeded the same way as the forest
X_smote, y_smote = ld.gen_SMOTE_data(df, seed=SEED)

# Final random forest: 400 trees, unlimited depth, sqrt feature subsets,
# no bootstrap sampling
clf = RandomForestClassifier(
    random_state=SEED,
    n_estimators=400,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=None,
    bootstrap=False,
)

# NOTE(review): the forest is fitted on the SMOTE data here and then handed to
# loops.loop_model without an oversample= argument (the Naive Bayes and
# logistic regression final cells pass oversample=ld.gen_SMOTE_data instead).
# Confirm whether loop_model refits the estimator, which would discard this fit.
clf.fit(X_smote, y_smote)

clf_results = loops.loop_model(
    clf,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
    var_thresh=True,
    inc_cm=False,
)

In [None]:
# Summarise the final random forest results with analyze_k.average_outcome
# (this overwrites the baseline avg_clf) and display the summary.
avg_clf = analyze_k.average_outcome(clf_results)
avg_clf

In [None]:
# Restore SEED to the value set in the setup cell (12); the final random forest
# cell temporarily switched it to 0.
SEED = 12

#### Logistic Regression

In [None]:
# Final logistic regression configuration: same liblinear / L2 model as the
# baseline, run with ld.gen_SMOTE_data as the oversampler and var_thresh on.
# NOTE(review): unlike the other final-model cells, no scaler= is passed here.
lr = LogisticRegression(
    solver='liblinear',
    penalty='l2',
)
lr_results = loops.loop_model(
    lr,
    df,
    train_indices,
    valid_indices,
    oversample=ld.gen_SMOTE_data,
    var_thresh=True,
    inc_cm=False,
)


In [None]:
# Summarise the final logistic regression results with
# analyze_k.average_outcome (this overwrites the baseline avg_lr) and display
# the summary.
avg_lr = analyze_k.average_outcome(lr_results)
avg_lr

## Binary Model

In [None]:
# Collapse the four target classes into two: 1 and 2 become 1, 3 and 4 become 0.
# The column is overwritten in place, so every cell that uses df after this
# one sees the recoded labels.
df['Target'] = df['Target'].replace(
    {
        1: 1,
        2: 1,
        3: 0,
        4: 0,
    }
)

#### Naive Bayes

In [None]:
# Re-run the existing nb estimator on the recoded target with the shared
# scaler; this is the only call in the notebook that sets inc_cm=True.
nb_results = loops.loop_model(
    nb,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
    inc_cm=True,
)
# Trailing expression: the notebook displays the summary of these results
analyze_k.average_outcome(nb_results)

In [None]:
# Store the binary-target Naive Bayes summary in avg_nb and display it
# (the preceding cell already displays the same summary without storing it).
avg_nb = analyze_k.average_outcome(nb_results)
avg_nb

#### Random Forest

In [None]:
# Re-run the existing clf estimator on the recoded target with the shared scaler
clf_results = loops.loop_model(
    clf,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
    inc_cm=False,
)

In [None]:
# Summarise the binary-target random forest results with
# analyze_k.average_outcome and display the summary.
avg_clf = analyze_k.average_outcome(clf_results)
avg_clf

#### KNN

In [None]:
# Re-run the existing knn estimator on the recoded target with the shared scaler
knn_results = loops.loop_model(
    knn,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
    inc_cm=False,
)

In [None]:
# Summarise the binary-target KNN results with analyze_k.average_outcome
# and display the summary.
avg_knn = analyze_k.average_outcome(knn_results)
avg_knn

#### Logistic Regression

In [None]:
# Re-run the existing lr estimator on the recoded target with the shared scaler
lr_results = loops.loop_model(
    lr,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
    inc_cm=False,
)

In [None]:
# Summarise the binary-target logistic regression results and display them.
# This cell previously averaged knn_results (copy-paste from the KNN cell),
# so the Logistic Regression section showed the KNN numbers again.
avg_lr = analyze_k.average_outcome(lr_results)
avg_lr

## Two Stage Binary Model

In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): X_train, y_train, X_test and y_test are not assigned in any
# visible cell (the X_train / y_train lines near the top are commented out), and
# df['Target'] has already been recoded to 0/1 by the Binary Model section.
# This cell needs the original 1-4 labels -- confirm where these come from.
# Assumes y_test is a pandas Series or NumPy array -- TODO confirm.

# Rescale the features with a min-max scaler fitted on the training data only
# (this rebinds the notebook-level `scaler` to a fresh instance)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Stage 1: create a binary target separating classes 1/2 from 3/4
y_train_binary = y_train.copy()
y_train_binary[y_train_binary.isin([1, 2])] = 1
y_train_binary[y_train_binary.isin([3, 4])] = 0

# Test-side masks and binary labels, so that each model is scored only against
# the labels it was trained to predict
test_mask_1_2 = np.isin(y_test, [1, 2])
test_mask_3_4 = np.isin(y_test, [3, 4])
y_test_binary = np.where(test_mask_1_2, 1, 0)

# Fit the first model
model_1_2_vs_3_4 = ComplementNB()
model_1_2_vs_3_4.fit(X_train_scaled, y_train_binary)

# Score the binary predictions against the binary test labels
# (previously they were compared with the un-binarised y_test)
y_pred = model_1_2_vs_3_4.predict(X_test_scaled)
evaluate_classification(y_pred, y_test_binary)

# Stage 2: separate class 1 from class 2
mask_1_2 = y_train_binary == 1
model_1_vs_2 = ComplementNB()
model_1_vs_2.fit(X_train_scaled[mask_1_2], y_train[mask_1_2])

# Score only on test rows whose true class is 1 or 2; this model never saw
# classes 3/4, so every such row used to count as an error
y_pred = model_1_vs_2.predict(X_test_scaled[test_mask_1_2])
evaluate_classification(y_pred, y_test[test_mask_1_2])

# Stage 3: separate class 3 from class 4
mask_3_4 = y_train_binary == 0
model_3_vs_4 = ComplementNB()
model_3_vs_4.fit(X_train_scaled[mask_3_4], y_train[mask_3_4])

# Score only on test rows whose true class is 3 or 4
y_pred = model_3_vs_4.predict(X_test_scaled[test_mask_3_4])
evaluate_classification(y_pred, y_test[test_mask_3_4])