## 1. Project Setup

**Load Data & Packages**

In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import ComplementNB, GaussianNB, BernoulliNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
import os
import analyze_k

# Seed used by the out-of-the-box models in this notebook.
SEED = 12

# Switch the working directory to the parent folder before importing the
# project modules below and before reading the relative data path.
# NOTE(review): presumably load_data / evaluate_classification / loops live in
# the parent directory -- confirm. Re-running this cell moves up one more
# level each time, so restart the kernel before re-running it.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)

import load_data as ld
from evaluate_classification import evaluate_classification
import loops

# Load the data once, from the explicit path. (An earlier call without a
# filepath was removed: its result was overwritten immediately by this one.)
df, train_indices, valid_indices = ld.load_train_data(filepath = 'Kaggle_download/train.csv')
scaler = MinMaxScaler()

# Silence library warnings for the remainder of the notebook.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# %cd "/Users/andrewdunn/Desktop/Classes/UChicago/"CAPP 30254 - Machine Learning"/Costa-Rican-Household-Poverty-Level-Prediction/ml_model_testing"


### Out of the Box

#### Naive Bayes

In [None]:
# Baseline Complement Naive Bayes: default hyperparameters, and no
# oversample / scaler / var_thresh arguments passed (loop_model defaults apply).
nb = ComplementNB()
nb_results = loops.loop_model(
    nb,
    df,
    train_indices,
    valid_indices,
    inc_cm=False,
)

In [None]:
# Aggregate the baseline Naive Bayes results (presumably an average across
# the runs returned by loops.loop_model -- confirm in analyze_k).
avg_nb = analyze_k.average_outcome(nb_results)
# Bare name as the last expression so the notebook displays it.
avg_nb

#### KNN

In [None]:
# Baseline KNN: 15 neighbours with distance-weighted votes, and no
# oversample / scaler / var_thresh arguments passed (loop_model defaults apply).
knn = KNeighborsClassifier(weights='distance', n_neighbors=15)
knn_results = loops.loop_model(
    knn,
    df,
    train_indices,
    valid_indices,
    inc_cm=False,
)

In [None]:
# Aggregate the baseline KNN results (presumably an average across the runs
# returned by loops.loop_model -- confirm in analyze_k).
avg_knn = analyze_k.average_outcome(knn_results)
# Bare name as the last expression so the notebook displays it.
avg_knn

#### Random Forest

In [None]:
# Baseline random forest: default hyperparameters, seeded with the
# notebook-level SEED; no oversample / scaler / var_thresh arguments passed.
clf = RandomForestClassifier(random_state=SEED)
clf_results = loops.loop_model(
    clf,
    df,
    train_indices,
    valid_indices,
    inc_cm=False,
)


In [None]:
# Aggregate the baseline random forest results (presumably an average across
# the runs returned by loops.loop_model -- confirm in analyze_k).
avg_clf = analyze_k.average_outcome(clf_results)
# Bare name as the last expression so the notebook displays it.
avg_clf

#### Logistic Regression

In [None]:
# Baseline logistic regression: L2 penalty with the liblinear solver, and no
# oversample / scaler / var_thresh arguments passed (loop_model defaults apply).
lr = LogisticRegression(penalty='l2', solver='liblinear')
lr_results = loops.loop_model(
    lr,
    df,
    train_indices,
    valid_indices,
    inc_cm=False,
)

In [None]:
# Aggregate the baseline logistic regression results (presumably an average
# across the runs returned by loops.loop_model -- confirm in analyze_k).
avg_lr = analyze_k.average_outcome(lr_results)
# Bare name as the last expression so the notebook displays it.
avg_lr

### Final Models

#### Naive Bayes

In [None]:
# Final Naive Bayes configuration: SMOTE oversampling via ld.gen_SMOTE_data,
# the shared MinMaxScaler, and variance thresholding switched off.
nb = ComplementNB()
nb_results = loops.loop_model(
    nb,
    df,
    train_indices,
    valid_indices,
    oversample=ld.gen_SMOTE_data,
    var_thresh=False,
    scaler=scaler,
    inc_cm=False,
)

In [None]:
# Aggregate the final Naive Bayes results (presumably an average across the
# runs returned by loops.loop_model -- confirm in analyze_k).
avg_nb = analyze_k.average_outcome(nb_results)
# Bare name as the last expression so the notebook displays it.
avg_nb

#### KNN

In [None]:
# Final KNN configuration: same estimator settings as the baseline, now with
# the shared MinMaxScaler and variance thresholding switched on.
knn = KNeighborsClassifier(weights='distance', n_neighbors=15)
knn_results = loops.loop_model(
    knn,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
    var_thresh=True,
    inc_cm=False,
)

In [None]:
# Aggregate the final KNN results (presumably an average across the runs
# returned by loops.loop_model -- confirm in analyze_k).
avg_knn = analyze_k.average_outcome(knn_results)
# Bare name as the last expression so the notebook displays it.
avg_knn

#### Random Forest

In [None]:
# Set the seed to its value in Random_Forest, where extensive testing occurred.
# NOTE: this rebinds the notebook-level SEED; a later cell sets it back to 12.
SEED = 0

# Tuned random forest behind a random oversampler. The imblearn pipeline
# applies the sampler only while fitting.
# max_features='sqrt' replaces the former 'auto': for classifiers the two were
# equivalent, and 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
os_clf = make_pipeline(
    RandomOverSampler(random_state=SEED),
    RandomForestClassifier(
        random_state=SEED,
        n_estimators=600,
        min_samples_split=5,
        min_samples_leaf=4,
        max_features='sqrt',
        max_depth=10,
        bootstrap=True,
    ),
)


clf_results = loops.loop_model(os_clf, df, train_indices, valid_indices, scaler=scaler, var_thresh=True, inc_cm=False)
# Last expression in the cell, so the notebook displays the aggregated result.
analyze_k.average_outcome(clf_results)

In [None]:
# Change seed back to original value (the Random Forest cell above set it to 0;
# 12 is the value assigned in the setup cell).
SEED = 12


#### Logistic Regression

In [None]:
# Final logistic regression configuration: L2 / liblinear with SMOTE
# oversampling and variance thresholding; no scaler or inc_cm argument is
# passed, so loop_model's defaults apply for those.
reg = LogisticRegression(penalty='l2', solver='liblinear')
results = loops.loop_model(
    reg,
    df,
    train_indices,
    valid_indices,
    oversample=ld.gen_SMOTE_data,
    var_thresh=True,
)
avg = analyze_k.average_outcome(results)
# Bare name as the last expression so the notebook displays it.
avg


### Two Stage Classification

In [None]:
# Separate features from the "Target" column and hold out 20% of the rows
# for validation (fixed random_state so the split is repeatable).
X, y = df.drop(columns="Target"), df["Target"]
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The ld.gen_* helpers are called with a single frame that still contains the
# target, so re-attach it to the training features first. Only the training
# portion is resampled; the validation rows are left untouched.
train = pd.concat([X_train, y_train], axis=1)
train_df_resampled, train_y_resampled = ld.gen_oversample_data(train, seed=12)
X_smote, y_smote = ld.gen_SMOTE_data(train, seed=12)


def Two_stage(model1, model2):
    """Fit and evaluate a two-stage classifier on the held-out validation rows.

    Stage 1 fits ``model1`` on the SMOTE-resampled training data and predicts
    every validation row. Stage 2 fits ``model2`` on the training rows whose
    target is not 4 and re-predicts only the validation rows that stage 1 did
    not label 4. The combined predictions are handed to
    ``evaluate_classification`` together with ``y_valid``.

    The function reads the notebook-level ``X_smote``, ``y_smote``,
    ``X_valid`` and ``y_valid``. Both estimators are fitted in place.

    Args:
        model1: Estimator exposing ``fit`` / ``predict``; decides which rows
            are class 4.
        model2: Estimator exposing ``fit`` / ``predict``; assigns a class to
            the rows that stage 1 did not label 4.

    Returns:
        None. Results are reported by ``evaluate_classification``.
    """
    # Stage 1: predict all validation rows. Keep X_valid's index on the
    # predictions so they stay aligned with y_valid, which comes from the
    # same split.
    first_stage = model1.fit(X_smote, y_smote)
    combined = pd.Series(first_stage.predict(X_valid), index=X_valid.index, name="pred")

    # Stage 2 training data: the resampled training rows minus class 4.
    train_frame = pd.concat([X_smote, y_smote], axis=1)
    train_frame = train_frame.loc[train_frame.loc[:, "Target"] != 4, :]
    second_stage = model2.fit(
        train_frame.drop(columns="Target"),
        train_frame.loc[:, "Target"],
    )

    # Re-predict only the rows stage 1 did not label 4. Skip the call when
    # there are none, because predicting on an empty frame raises.
    needs_second_stage = (combined != 4).to_numpy()
    if needs_second_stage.any():
        combined.loc[needs_second_stage] = second_stage.predict(
            X_valid.loc[needs_second_stage]
        )

    # Evaluate the merged predictions against the validation labels.
    evaluate_classification(combined, y_true=y_valid, l=[1, 2, 3, 4], cm=True)

# Tuned random forest behind a random oversampler (seed 0, as in the final
# Random Forest cell).
# max_features='sqrt' replaces the former 'auto': for classifiers the two were
# equivalent, and 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
os_clf = make_pipeline(
    RandomOverSampler(random_state=0),
    RandomForestClassifier(
        random_state=0,
        n_estimators=600,
        min_samples_split=5,
        min_samples_leaf=4,
        max_features='sqrt',
        max_depth=10,
        bootstrap=True,
    ),
)


# Try both orderings: logistic regression first with the forest second, then
# the reverse. Two_stage refits whatever it is given, so reusing os_clf is fine.
Two_stage(LogisticRegression(solver='liblinear', penalty='l2'),os_clf)
Two_stage(os_clf,LogisticRegression(solver='liblinear', penalty='l2'))

### Binary Model

In [None]:
# Build a binary copy of the data: original classes 1 and 2 become 1,
# classes 3 and 4 become 0. df itself is left unchanged.
bin_df = df.assign(Target=df['Target'].replace({1: 1, 2: 1, 3: 0, 4: 0}))

# Features and binary labels for the full data set (no hold-out split here).
y_train = bin_df['Target']
X_train = bin_df.drop(columns="Target")

#### Naive Bayes

In [None]:
# Binary-target run of the `nb` estimator created in an earlier cell, with
# the shared scaler; inc_cm is switched on for this run.
nb_results = loops.loop_model(
    nb,
    bin_df,
    train_indices,
    valid_indices,
    scaler=scaler,
    inc_cm=True,
)
# Last expression in the cell, so the notebook displays the aggregated result.
analyze_k.average_outcome(nb_results)

In [None]:
# Aggregate the binary Naive Bayes results and keep them in avg_nb
# (the previous cell computed the same aggregate without storing it).
avg_nb = analyze_k.average_outcome(nb_results)
# Bare name as the last expression so the notebook displays it.
avg_nb

#### Random Forest

In [None]:
# Binary-target run of `clf`, the default random forest created in the
# out-of-the-box section, with the shared scaler.
clf_results = loops.loop_model(
    clf,
    bin_df,
    train_indices,
    valid_indices,
    scaler=scaler,
    inc_cm=False,
)

In [None]:
# Aggregate the binary random forest results (presumably an average across
# the runs returned by loops.loop_model -- confirm in analyze_k).
avg_clf = analyze_k.average_outcome(clf_results)
# Bare name as the last expression so the notebook displays it.
avg_clf

#### KNN

In [None]:
# Binary-target run of the `knn` estimator created in an earlier cell, with
# the shared scaler.
knn_results = loops.loop_model(
    knn,
    bin_df,
    train_indices,
    valid_indices,
    scaler=scaler,
    inc_cm=False,
)

In [None]:
# Aggregate the binary KNN results (presumably an average across the runs
# returned by loops.loop_model -- confirm in analyze_k).
avg_knn = analyze_k.average_outcome(knn_results)
# Bare name as the last expression so the notebook displays it.
avg_knn

#### Logistic Regression

In [None]:
# Binary-target logistic regression: L2 / liblinear with SMOTE oversampling
# and variance thresholding; no scaler or inc_cm argument is passed.
reg = LogisticRegression(penalty='l2', solver='liblinear')
results = loops.loop_model(
    reg,
    bin_df,
    train_indices,
    valid_indices,
    oversample=ld.gen_SMOTE_data,
    var_thresh=True,
)
avg = analyze_k.average_outcome(results)
# Bare name as the last expression so the notebook displays it.
avg

### Two Stage Binary Model

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rebuild a four-class train/test split from the original data. The X_train /
# y_train left behind by the "Binary Model" cell cover the whole data set with
# an already-binarized target, and X_test / y_test were never defined.
features = df.drop(columns="Target")
labels = df.loc[:, "Target"]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Rescale the features to [0, 1] using the training rows only. ComplementNB
# rejects negative feature values when fitting.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Stage 1: create a binary target separating classes 1/2 (-> 1) from 3/4 (-> 0)
y_train_binary = y_train.isin([1, 2]).astype(int)
y_test_binary = y_test.isin([1, 2]).astype(int)

# Fit the first model
model_1_2_vs_3_4 = ComplementNB()
model_1_2_vs_3_4.fit(X_train_scaled, y_train_binary)

# Evaluate stage 1 on its own, against the binary version of the test labels.
stage1_pred = model_1_2_vs_3_4.predict(X_test_scaled)
evaluate_classification(stage1_pred, y_test_binary, l=[0, 1])

# Stage 2: separate class 1 from class 2, trained only on rows in {1, 2}
mask_1_2 = (y_train_binary == 1).to_numpy()
model_1_vs_2 = ComplementNB()
model_1_vs_2.fit(X_train_scaled[mask_1_2], y_train[mask_1_2])

# Stage 3: separate class 3 from class 4, trained only on rows in {3, 4}
mask_3_4 = (y_train_binary == 0).to_numpy()
model_3_vs_4 = ComplementNB()
model_3_vs_4.fit(X_train_scaled[mask_3_4], y_train[mask_3_4])

# Route each test row to the second-level model chosen by stage 1. Skip a
# model when no rows were routed to it, because predicting on zero rows raises.
routed_to_1_2 = stage1_pred == 1
y_pred = np.empty(len(stage1_pred), dtype=y_train.dtype)
if routed_to_1_2.any():
    y_pred[routed_to_1_2] = model_1_vs_2.predict(X_test_scaled[routed_to_1_2])
if (~routed_to_1_2).any():
    y_pred[~routed_to_1_2] = model_3_vs_4.predict(X_test_scaled[~routed_to_1_2])

# Evaluate the combined four-class predictions.
evaluate_classification(y_pred, y_test, l=[1, 2, 3, 4])