## 1. Project Setup

**Load Data & Packages**

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score, classification_report, precision_recall_fscore_support, balanced_accuracy_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
import os
import analyze_k
# Seed passed to the data loader and to the seeded estimators in this notebook.
SEED = 12

# Switch the working directory to the parent of the current one; the relative
# data path below is resolved from there (and presumably the project-local
# modules imported further down live there as well — confirm).
# NOTE(review): this is not idempotent — every re-run of this cell climbs one
# more directory level. Restart the kernel before re-running.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
scaler = MinMaxScaler()

import load_data as ld
from evaluate_classification import evaluate_classification

# Pass the shared seed so this load agrees with the seeded reload performed
# in the next cell instead of relying on the loader's default.
df, X_valid, y_valid, train_indices, valid_indices = ld.load_train_data(
    filepath='Kaggle_download/train.csv', seed=SEED)

In [None]:
import loops

# Load the data with the shared seed (the seed presumably drives the
# train/validation split inside load_train_data — confirm in load_data).
df, X_valid, y_valid, train_indices, valid_indices = ld.load_train_data(
    filepath="Kaggle_download/train.csv", seed=SEED)
scaler = MinMaxScaler()

# Select features by column name rather than by position, so the split stays
# correct even if 'Target' is not the last column; this matches how the
# final-model cell builds its feature matrix (drop(columns='Target')).
X_train = df.drop(columns='Target')
y_train = df['Target']

### Out of the Box

#### Naive Bayes

In [None]:
# Baseline Complement Naive Bayes with default hyperparameters: fit on
# X_train / y_train, then score the predictions for X_valid against y_valid.
nb = ComplementNB().fit(X_train, y_train)
y_pred = nb.predict(X_valid)
evaluate_classification(y_pred, y_valid, cm=False)

#### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours: 15 neighbours with distance-weighted votes,
# fit on X_train / y_train and scored on X_valid / y_valid.
knn = KNeighborsClassifier(
    weights='distance',
    n_neighbors=15,
).fit(X_train, y_train)
y_pred = knn.predict(X_valid)
evaluate_classification(y_pred, y_valid, cm=False)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 1600 trees grown on the full training set
# (bootstrap disabled), seeded with SEED, scored on X_valid / y_valid.
clf = RandomForestClassifier(
    n_estimators=1600,
    max_depth=100,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=SEED,
).fit(X_train, y_train)
y_pred = clf.predict(X_valid)
evaluate_classification(y_pred, y_valid, cm=False)


#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline L2-penalised logistic regression using the liblinear solver,
# fit on X_train / y_train and scored on X_valid / y_valid.
lr = LogisticRegression(
    penalty='l2',
    solver='liblinear',
).fit(X_train, y_train)
y_pred = lr.predict(X_valid)
evaluate_classification(y_pred, y_valid, cm=False)

### Final Models

#### Naive Bayes

In [None]:
# Evaluate Complement Naive Bayes over the prepared train/validation index
# sets, then refit on the selected training subset and score on X_valid.
nb = ComplementNB()
nb_results = loops.loop_model(nb, df, train_indices, valid_indices, scaler=scaler)

# select_best presumably returns the index of the best run by recall; the
# meaning of the third argument (4) is not visible here — confirm in analyze_k.
ind = analyze_k.select_best(nb_results, 'recall', 4)

# Refit on the chosen training rows. The target is passed as a 1-D Series
# (selected by name) rather than a one-column DataFrame, which scikit-learn
# would otherwise flag with a column-vector DataConversionWarning.
best_train = df.iloc[train_indices[ind], :]
# NOTE(review): the loop above was given scaler=scaler, but this refit and
# the prediction below use unscaled features — confirm that is intended.
nb.fit(best_train.drop(columns='Target'), best_train['Target'])
y_pred = nb.predict(X_valid)
print('Final Classification')
evaluate_classification(y_pred, y_valid)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Distance-weighted 15-NN passed through the project's loop_model helper
# together with the MinMax scaler.
knn = KNeighborsClassifier(weights='distance', n_neighbors=15)
loops.loop_model(
    knn,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same random-forest configuration as the out-of-the-box cell, now run
# through loop_model without a scaler.
# oversample / var_thresh presumably enable SMOTE oversampling and a
# variance-threshold filter inside loop_model — confirm in loops.
clf = RandomForestClassifier(
    n_estimators=1600,
    max_depth=100,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=SEED,
)
loops.loop_model(
    clf,
    df,
    train_indices,
    valid_indices,
    scaler=None,
    oversample=ld.gen_SMOTE_data,
    var_thresh=True,
)

#### Logistic Regression

In [None]:
# L2-penalised liblinear logistic regression run through loop_model with
# the SMOTE data generator and var_thresh enabled; no scaler argument is
# passed, so loop_model's own default applies.
reg = LogisticRegression(penalty='l2', solver='liblinear')
loops.loop_model(
    reg,
    df,
    train_indices,
    valid_indices,
    oversample=ld.gen_SMOTE_data,
    var_thresh=True,
)


## Two-Step Model

In [None]:
# Random forest (same hyperparameters as the earlier forest cells) handed to
# the project's two_step helper. Note that `reg` holds a forest here, not a
# regression model.
reg = RandomForestClassifier(
    n_estimators=1600,
    max_depth=100,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=SEED,
)
# The last two positional arguments are the oversampling function and a
# boolean flag; their parameter names are not visible from this notebook.
ld.two_step(
    reg,
    df,
    train_indices,
    valid_indices,
    ld.gen_oversample_data,
    True,
)

## Binary Model

In [None]:
# Collapse the four target classes into a binary label:
# classes 1 and 2 -> 1, classes 3 and 4 -> 0.
binary_target = {1: 1, 2: 1, 3: 0, 4: 0}

# Training frame. This overwrites df['Target'] in place, so any multi-class
# cell re-run after this point will see the binary labels.
df['Target'] = df['Target'].replace(binary_target)

# Validation labels get the identical recoding.
y_valid['Target'] = y_valid['Target'].replace(binary_target)


### Naive Bayes

In [None]:
# Complement Naive Bayes on the binary target, run through loop_model with
# the MinMax scaler, the SMOTE data generator and var_thresh enabled.
nb = ComplementNB()
loops.loop_model(
    nb,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
    oversample=ld.gen_SMOTE_data,
    var_thresh=True,
)

### Random Forest

In [None]:
# Reuses the `clf` random forest created in an earlier cell, now on the
# binary target. Unlike the multi-class forest run (scaler=None), this call
# passes the MinMax scaler.
loops.loop_model(
    clf,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
    oversample=ld.gen_SMOTE_data,
    var_thresh=True,
)

In [None]:
# Distance-weighted 15-NN on the binary target, run through loop_model with
# the MinMax scaler (no oversampling arguments are passed).
knn = KNeighborsClassifier(weights='distance', n_neighbors=15)
loops.loop_model(
    knn,
    df,
    train_indices,
    valid_indices,
    scaler=scaler,
)

In [None]:
# L2-penalised liblinear logistic regression on the binary target, run
# through loop_model with the SMOTE data generator and var_thresh enabled.
reg = LogisticRegression(penalty='l2', solver='liblinear')
loops.loop_model(
    reg,
    df,
    train_indices,
    valid_indices,
    oversample=ld.gen_SMOTE_data,
    var_thresh=True,
)
