## 1. Project Setup

**Load Data & Packages**

In [None]:
import numpy as np
from sklearn.naive_bayes import ComplementNB
from sklearn.preprocessing import MinMaxScaler
import os
import analyze_k
# Seed passed as random_state to the estimators that accept one.
SEED = 12

# Make the parent of the current working directory the new working directory.
# NOTE(review): presumably so the project modules imported below and the
# relative data path resolve from the project root — confirm.
# Each execution of this cell moves up one more level, so run it only once
# per kernel session.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)

import load_data as ld
from evaluate_classification import evaluate_classification
import loops

# Load the training frame, the hold-out validation features/labels and the
# train/validation index sets produced by the project's loader.
(
    df,
    X_valid,
    y_valid,
    train_indices,
    valid_indices,
) = ld.load_train_data(filepath='Kaggle_download/train.csv')

# Scaler instance handed to loops.loop_model by the modelling cells.
scaler = MinMaxScaler()

In [None]:
# Scaler instance handed to loops.loop_model by the modelling cells.
scaler = MinMaxScaler()

# Split the training frame into features and labels. Both sides select by the
# 'Target' column name, so they cannot disagree if 'Target' is ever not the
# last column of df.
X_train = df.drop(columns='Target')
y_train = df.loc[:, 'Target']

### Out of the Box

#### Naive Bayes

In [None]:
# Baseline: Complement Naive Bayes with default settings, fitted on
# X_train / y_train and scored on the validation split (no confusion matrix).
nb = ComplementNB().fit(X_train, y_train)
y_pred = nb.predict(X_valid)
evaluate_classification(y_pred, y_valid, cm=False)

#### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 15-nearest-neighbours classifier whose votes are weighted by
# distance, fitted on X_train / y_train and scored on the validation split.
knn = KNeighborsClassifier(
    weights='distance',
    n_neighbors=15,
).fit(X_train, y_train)
y_pred = knn.predict(X_valid)
evaluate_classification(y_pred, y_valid, cm=False)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 1600 trees, seeded with SEED, grown without
# bootstrap resampling, then scored on the validation split.
clf = RandomForestClassifier(
    n_estimators=1600,
    max_depth=100,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=SEED,
).fit(X_train, y_train)
y_pred = clf.predict(X_valid)
evaluate_classification(y_pred, y_valid, cm=False)


#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: L2-penalised logistic regression using the liblinear solver,
# fitted on X_train / y_train and scored on the validation split.
lr = LogisticRegression(
    penalty='l2',
    solver='liblinear',
).fit(X_train, y_train)
y_pred = lr.predict(X_valid)
evaluate_classification(y_pred, y_valid, cm=False)

### Final Models

#### Naive Bayes

In [None]:
# Run the project's loop_model over the train/validation index sets for
# Complement Naive Bayes, then refit on the index set picked by select_best.
nb = ComplementNB()
nb_results = loops.loop_model(nb, df, train_indices, valid_indices, scaler=scaler)
# Position (into train_indices) chosen by analyze_k.select_best on 'recall'.
ind = analyze_k.select_best(nb_results, 'recall', 4)

# Slice the selected rows once and pass the labels as a 1-D Series: a
# one-column DataFrame makes scikit-learn emit a DataConversionWarning on fit.
best_fold = df.iloc[train_indices[ind], :]
nb.fit(best_fold.drop(columns='Target'), best_fold['Target'])

y_pred = nb.predict(X_valid)
print('Final Classification')
evaluate_classification(y_pred, y_valid)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Run the project's loop_model over the train/validation index sets for KNN,
# then refit on the index set picked by select_best.
knn = KNeighborsClassifier(n_neighbors=15, weights='distance')
knn_results = loops.loop_model(knn, df, train_indices, valid_indices, scaler=scaler)
# Position (into train_indices) chosen by analyze_k.select_best on 'recall'.
ind = analyze_k.select_best(knn_results, 'recall', 4)

# Slice the selected rows once and pass the labels as a 1-D Series: a
# one-column DataFrame makes scikit-learn emit a DataConversionWarning on fit.
best_fold = df.iloc[train_indices[ind], :]
knn.fit(best_fold.drop(columns='Target'), best_fold['Target'])

# Predict with the KNN model fitted above (previously this used the Naive
# Bayes model, so the reported scores were not KNN's).
y_pred = knn.predict(X_valid)
print('Final Classification')
evaluate_classification(y_pred, y_valid)

In [None]:
#from sklearn.ensemble import RandomForestClassifier
#clf = RandomForestClassifier(random_state = SEED,
                            #n_estimators = 1600,
                            #min_samples_split = 2,
                            #min_samples_leaf = 1,
                            #max_features = 'sqrt',
                            #max_depth = 100,
                            #bootstrap = False
#)

#clf_results = loops.loop_model(clf, df, train_indices, valid_indices, scaler=scaler)
#ind = analyze_k.select_best(clf_results,'recall',4)
#clf.fit(df.iloc[train_indices[ind],:].drop(columns='Target'), df.iloc[train_indices[ind],[-1]])
#y_pred = clf.predict(X_valid)
#print('Final Classification')
#evaluate_classification(y_pred, y_valid)

#### Logistic Regression

In [None]:
# Run the project's loop_model over the train/validation index sets for
# logistic regression, then refit on the index set picked by select_best.
reg = LogisticRegression(solver='liblinear', penalty='l2')
reg_results = loops.loop_model(reg, df, train_indices, valid_indices, scaler=scaler)
# Position (into train_indices) chosen by analyze_k.select_best on 'recall'.
ind = analyze_k.select_best(reg_results, 'recall', 4)

# Slice the selected rows once and pass the labels as a 1-D Series: a
# one-column DataFrame makes scikit-learn emit a DataConversionWarning on fit.
best_fold = df.iloc[train_indices[ind], :]
reg.fit(best_fold.drop(columns='Target'), best_fold['Target'])

# Predict with the logistic regression fitted above (previously this used the
# random forest `clf`, so the reported scores were not this model's).
y_pred = reg.predict(X_valid)
print('Final Classification')
evaluate_classification(y_pred, y_valid)


## Binary Model

In [None]:
# Collapse the four target classes into two: 1 and 2 become 1, 3 and 4 become 0.
binary_recode = {1: 1, 2: 1, 3: 0, 4: 0}

# Apply the same mapping to the training frame and to the validation labels.
df['Target'] = df['Target'].replace(binary_recode)
y_valid['Target'] = y_valid['Target'].replace(binary_recode)


### Naive Bayes

In [None]:
# Re-run the loop_model evaluation for Complement Naive Bayes; df['Target']
# has been recoded to 0/1 by the cell above at this point.
nb = ComplementNB()
nb_results = loops.loop_model(nb, df, train_indices, valid_indices, scaler=scaler)
# Refit-on-selected-rows and final validation scoring are currently disabled:
#ind = analyze_k.select_best(nb_results,'recall',4)
#nb.fit(df.iloc[train_indices[ind],:].drop(columns='Target'), df.iloc[train_indices[ind],[-1]])
#y_pred = nb.predict(X_valid)
#print('Final Classification')
#evaluate_classification(y_pred, y_valid)

#### Random Forest

In [None]:
#clf_results = loops.loop_model(clf, df, train_indices, valid_indices, scaler=scaler)
#ind = analyze_k.select_best(clf_results,'recall',4)
#clf.fit(df.iloc[train_indices[ind],:].drop(columns='Target'), df.iloc[train_indices[ind],[-1]])
#y_pred = clf.predict(X_valid)
#print('Final Classification')
#evaluate_classification(y_pred, y_valid)

#### KNN

In [None]:
# Binary KNN baseline. y_train was captured before the target was recoded and
# is still used with its original classes by the two-stage cell, so take the
# recoded 0/1 labels straight from df instead of overwriting y_train.
# Fitting on the four-class y_train while scoring against the binary y_valid
# would compare incompatible label sets.
y_train_bin = df['Target']

knn = KNeighborsClassifier(n_neighbors=15, weights='distance')
knn.fit(X_train, y_train_bin)
y_pred = knn.predict(X_valid)
evaluate_classification(y_pred, y_valid, cm=False)

#### Logistic Regression

In [None]:
# Re-run the loop_model evaluation for the logistic regression `reg` created
# in the "Final Models" cell; df['Target'] has been recoded to 0/1 by now.
reg_results = loops.loop_model(reg, df, train_indices, valid_indices, scaler=scaler)
# Refit-on-selected-rows and final validation scoring are currently disabled.
# If re-enabled, predict with `reg` (the model fitted here), not `clf`:
#ind = analyze_k.select_best(reg_results,'recall',4)
#reg.fit(df.iloc[train_indices[ind],:].drop(columns='Target'), df.iloc[train_indices[ind],[-1]])
#y_pred = reg.predict(X_valid)
#print('Final Classification')
#evaluate_classification(y_pred, y_valid)


## Two-Stage Binary Model

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the features with MinMaxScaler: fitted on the training features only,
# then applied unchanged to the validation features.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_valid)

# Stage 1: create a binary target separating classes 1/2 from 3/4
# (classes 1 and 2 -> 1, classes 3 and 4 -> 0). Work on a copy so that y_train
# keeps its original labels for stages 2 and 3.
# NOTE(review): assumes y_train still holds the labels 1-4 here — confirm.
y_train_binary = y_train.copy()
y_train_binary[y_train_binary.isin([1, 2])] = 1
y_train_binary[y_train_binary.isin([3, 4])] = 0

# Fit the first model
model_1_2_vs_3_4 = ComplementNB()
model_1_2_vs_3_4.fit(X_train_scaled, y_train_binary)

# Make predictions and evaluate
# (y_valid is recoded to 0/1 in place by the "Binary Model" cell.)
y_pred = model_1_2_vs_3_4.predict(X_test_scaled)
evaluate_classification(y_pred, y_valid)

# Stage 2: separate class 1 from class 2
# Trained only on the rows whose stage-1 label is 1, using the original labels.
mask_1_2 = y_train_binary == 1
model_1_vs_2 = ComplementNB()
model_1_vs_2.fit(X_train_scaled[mask_1_2], y_train[mask_1_2])

# Make predictions and evaluate
# NOTE(review): this predicts for every validation row, not only those that
# belong to classes 1/2, and scores against y_valid as a whole. If y_valid has
# been recoded to 0/1, the labels this model was trained on no longer match
# it — confirm the intended evaluation.
y_pred = model_1_vs_2.predict(X_test_scaled)
evaluate_classification(y_pred, y_valid)

# Stage 3: separate class 3 from class 4
# Trained only on the rows whose stage-1 label is 0, using the original labels.
mask_3_4 = y_train_binary == 0
model_3_vs_4 = ComplementNB()
model_3_vs_4.fit(X_train_scaled[mask_3_4], y_train[mask_3_4])

# Make predictions and evaluate
# NOTE(review): same caveat as stage 2 — all validation rows are predicted and
# compared against y_valid as a whole; confirm the intended evaluation.
y_pred = model_3_vs_4.predict(X_test_scaled)
evaluate_classification(y_pred, y_valid)