# Importing Libraries

In [None]:
# import basics
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
from seaborn import heatmap
from collections import Counter
import matplotlib.pyplot as plt

# import preprocessing and scoring
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from prince import MCA

# import models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# import miscellaneous
from vecstack import StackingTransformer
from pycm import ConfusionMatrix
import warnings
warnings.filterwarnings('ignore')

# Preprocessing Data 

## Redundant Feature Removal and Grouping

In [None]:
# import data
df = pd.read_csv('https://raw.githubusercontent.com/YohanJhaveri/Costa-Rica-Housing-Poverty/master/train.csv')

# drop columns with missing values
# (thresh=len(df) keeps only the columns in which every row is non-null)
df = df.dropna(axis=1, thresh=len(df))

# drop columns with ambiguous values
df = df.drop(columns=['dependency', 'edjefe', 'edjefa'])

# group instances by household id (idhogar) and take the mean of each feature.
# numeric_only=True is passed explicitly: recent pandas versions no longer skip
# non-numeric columns silently and raise instead, so any remaining text column
# (e.g. a per-row identifier) would otherwise abort the aggregation.
df = df.groupby(['idhogar']).mean(numeric_only=True)

# round the target column after the labels of a household have been averaged
df['Target'] = df['Target'].round()

## Identifying Numerical vs Categorical Features 

In [None]:
def binary(f):
    """Return True when column ``f`` of the global ``df`` has at most two distinct values."""
    distinct_values = set(df[f])
    return len(distinct_values) <= 2

def find_categorical():
    """Return the names of the categorical (binary) feature columns of ``df``.

    A column counts as categorical when ``binary`` reports at most two
    distinct values for it. The label column ``'Target'`` is never included.

    Returns:
        list: column names in the order they appear in ``df``. A list (not a
        lazy ``filter`` object) is returned so that callers can iterate over
        the result more than once.
    """
    return [f for f in df if f != 'Target' and binary(f)]

def find_numerical():
    """Return the names of the numerical (non-binary) feature columns of ``df``.

    A column counts as numerical when ``binary`` reports more than two
    distinct values for it. The label column ``'Target'`` is never included.

    Returns:
        list: column names in the order they appear in ``df``. A list (not a
        lazy ``filter`` object) is returned so that callers can iterate over
        the result more than once.
    """
    return [f for f in df if f != 'Target' and not binary(f)]

## Scaling Numerical Data 


In [None]:
# Standard Scaling and Min-Max Scaling of Numerical Variables

def standard_scale():
    """Standardise every numerical column of the global ``df`` in place.

    Each column returned by ``find_numerical`` is replaced by
    ``(column - mean) / std``. ``np.std`` is used with its default
    ``ddof=0``, i.e. the population standard deviation.
    """
    for f in find_numerical():
        column = df[f]
        mean = np.mean(column)
        std = np.std(column)
        # Whole-column arithmetic instead of a per-element ``apply``.
        df[f] = (column - mean) / std
     
    
def min_max_scale():
    """Rescale every numerical column of the global ``df`` to [0, 1] in place.

    Each column returned by ``find_numerical`` is replaced by
    ``(column - minimum) / (maximum - minimum)``.
    """
    for f in find_numerical():
        column = df[f]
        minimum = column.min()
        maximum = column.max()
        # Whole-column arithmetic instead of a per-element ``apply``.
        df[f] = (column - minimum) / (maximum - minimum)

## Feature Selection 

### Feature Selection for Categorical Variables using Gini Index 

In [None]:
# Feature Selection for Categorical Variables [GINI INDEX COEFFICIENT]
def select_categorical(n):
    """Keep only the ``n`` categorical features with the lowest Gini coefficient.

    Args:
        n: number of categorical features to keep.

    Returns:
        A copy of the global ``df`` without the categorical columns that were
        not selected. Numerical columns and ``'Target'`` are left untouched.
    """
    # Materialise the names once: they are needed both for scoring and for
    # computing the columns to drop, and find_categorical() may hand back a
    # one-shot iterator that would be empty on the second pass.
    categorical = list(find_categorical())

    def gini(array):
        """Return the Gini coefficient of ``array`` (0.0 if it sums to zero)."""
        values = np.array(array, dtype=float).flatten()
        if values.size == 0:
            return 0.0
        lowest = np.amin(values)
        if lowest < 0:
            # shift so that all values are non-negative
            values -= lowest
        values = np.sort(values)
        total = np.sum(values)
        if total == 0:
            # All values equal zero: avoid 0/0 (NaN), which would make the
            # ranking below unreliable; a constant column has coefficient 0.
            return 0.0
        count = values.shape[0]
        index = np.arange(1, count + 1)
        return np.sum((2 * index - count - 1) * values) / (count * total)

    # Ascending by coefficient; ties are broken by feature name.
    ranked = sorted((gini(df[feature]), feature) for feature in categorical)
    best = {feature for _, feature in ranked[:n]}
    drop = [feature for feature in categorical if feature not in best]
    return df.drop(columns=drop)

In [None]:
# Feature Selection for Numerical Variables [PEARSON CORRELATION MATRIX]
def select_numerical(threshold):
    """Drop numerical features until no pair is correlated at or above ``threshold``.

    The absolute Pearson correlation matrix (the ``DataFrame.corr`` default)
    of the numerical columns is searched for its largest off-diagonal entry.
    While that entry is >= ``threshold``, the column it was found in is
    removed (ties are broken by the larger column name) and the search
    repeats on the remaining columns.

    Args:
        threshold: absolute correlation at or above which a feature is dropped.

    Returns:
        A copy of the global ``df`` without the dropped columns.
    """
    numerical = list(find_numerical())

    # Pairwise correlations do not depend on which other columns are present,
    # so the matrix is computed once and shrunk instead of being recomputed.
    matrix = df[numerical].corr().abs()

    def find_max_corr_col(matrix):
        """Return (max |corr|, column) over off-diagonal entries, or None."""
        best = None
        for i, col in enumerate(matrix.columns):
            for j, value in enumerate(matrix[col]):
                # Skip the diagonal and undefined (NaN) correlations.
                if i == j or np.isnan(value):
                    continue
                candidate = (value, col)
                if best is None or candidate > best:
                    best = candidate
        return best

    drop = []
    while True:
        found = find_max_corr_col(matrix)
        # ``None`` means fewer than two comparable columns are left.
        if found is None:
            break
        max_corr, max_col = found
        if max_corr < threshold:
            break
        drop.append(max_col)
        matrix = matrix.drop(index=max_col, columns=max_col)

    return df.drop(columns=drop)

# Dimensionality Reduction

### Dimensionality Reduction for Numerical Variables using PCA

In [None]:
# DIMENSIONALITY REDUCTION for Numerical Variables
def reduce_numerical(n):
    """Replace the numerical features with their first ``n`` principal components.

    Args:
        n: number of PCA components to compute; added as ``PCA1`` .. ``PCA<n>``.

    Returns:
        A copy of the global ``df`` without the original numerical columns but
        with the new ``PCA*`` columns.

    Note:
        As in the original implementation, the ``PCA*`` columns are also
        written into the global ``df`` itself before the copy is returned.
    """
    # Materialise the names once: they are used for both the selection and the
    # final drop, and find_numerical() may hand back a one-shot iterator that
    # would already be exhausted (dropping nothing) on the second use.
    numerical = list(find_numerical())
    pca = PCA(n_components=n)

    components = pca.fit_transform(df[numerical])

    for i in range(n):
        df['PCA' + str(i + 1)] = components[:, i]

    return df.drop(columns=numerical)

### Dimensionality Reduction for Categorical Variables using MCA

In [None]:
# DIMENSIONALITY REDUCTION for Categorical Variables
def reduce_categorical(n):
    """Replace the categorical features with ``n`` MCA components.

    Args:
        n: number of MCA components to compute; added as ``MCA1`` .. ``MCA<n>``.

    Returns:
        A copy of the global ``df`` without the original categorical columns
        but with the new ``MCA*`` columns.

    Note:
        As in the original implementation, the ``MCA*`` columns are also
        written into the global ``df`` itself before the copy is returned.
    """
    # Materialise the names once: they are used for both the selection and the
    # final drop, and find_categorical() may hand back a one-shot iterator that
    # would already be exhausted (dropping nothing) on the second use.
    categorical = list(find_categorical())
    mca = MCA(n_components=n)

    # NOTE(review): assumes fit_transform returns a frame whose columns are
    # labelled 0..n-1 and whose index matches df — confirm against the
    # installed prince version.
    components = mca.fit_transform(df[categorical])

    for i in range(n):
        df['MCA' + str(i + 1)] = components[i]

    return df.drop(columns=categorical)

In [None]:
# Preprocessing:
# Settings for the pipeline below. The steps mutate or rebind the global
# ``df`` one after another, so their order matters.
LIMIT = 50  # number of categorical features kept by select_categorical
THRESHOLD = 0.8  # correlation cut-off for select_numerical (step currently disabled)
NUM_N_COMPONENTS = 10  # PCA components for reduce_numerical (step currently disabled)
NUM_C_COMPONENTS = 10  # MCA components for reduce_categorical

# Active steps: min-max scaling, categorical feature selection, MCA reduction.
# The commented-out calls are the alternative / optional steps.
# standard_scale()
min_max_scale()
df = select_categorical(LIMIT)
# df = select_numerical(THRESHOLD)
# df = reduce_numerical(NUM_N_COMPONENTS)
df = reduce_categorical(NUM_C_COMPONENTS)

# Model Selection

## Test Train Split

In [None]:
# Features: every column except the label.
x = df.copy().drop(columns=['Target'])
# Label, kept as a single-column DataFrame (double brackets) rather than a Series.
y = df[['Target']]

# NOTE(review): test_size=0.7 holds out 70% of the rows for testing and trains
# on the remaining 30% — confirm this is intended rather than 0.3.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.7, random_state=42)

## Grid Search

In [None]:
def grid_search(model, params):
    """Grid-search ``params`` for ``model`` on the training split.

    Candidates are scored with macro-averaged F1 under a shuffled 3-fold
    cross validation (fixed seed), and the best parameter combination found
    is returned as a dict.
    """
    folds = KFold(n_splits=3, shuffle=True, random_state=32)
    search = GridSearchCV(model, param_grid=params, scoring='f1_macro', cv=folds)
    search.fit(xTrain, yTrain)
    return search.best_params_

def predict(model):
    """Fit ``model`` on the training split and return flat predictions for the test split."""
    model.fit(xTrain, yTrain)
    predictions = model.predict(xTest)
    return predictions.flatten()

def test(model):
    """Return the macro-averaged F1 score of ``model`` on the test split.

    The model is fitted and evaluated through ``predict``; the resulting
    labels are compared against the global ``yTest``.
    """
    yHat = predict(model)
    return f1_score(yTest, yHat, average='macro')

def results(model, params):
    """Tune ``model`` (a classifier class) over ``params`` and score the tuned model.

    Returns a ``(best_hyperparameters, test_score)`` tuple: the grid search
    runs on a default-constructed instance, then a fresh instance built with
    the winning hyperparameters is evaluated with ``test``.
    """
    best_params = grid_search(model(), params)
    tuned_model = model(**best_params)
    return best_params, test(tuned_model)

## Finding optimum hyperparameters 

### K-Nearest Neighbors

In [None]:
# Tune k-nearest-neighbours over k = 1..9 and report the best setting together
# with its macro-F1 on the test split.
hyper, score = results(KNeighborsClassifier, {
    'n_neighbors': list(range(1, 10))
})

print('KNN -------------------')
print('Hyper:', hyper)
print('Score:', score)

### Naive Bayes


In [None]:
# Gaussian naive Bayes with default settings (empty grid: nothing is tuned).
hyper, score = results(GaussianNB, {})
print('GNB -------------------')
print('Hyper:', hyper)
print('Score:', score)

# Bernoulli naive Bayes with default settings (empty grid: nothing is tuned).
hyper, score = results(BernoulliNB, {})
print('BNB -------------------')
print('Hyper:', hyper)
print('Score:', score)

### Decision Tree

In [None]:
# Tune a decision tree over the split criterion, the maximum depth and the
# minimum leaf size (depth and leaf size both take the values 1, 6, ..., 26).
hyper, score = results(DecisionTreeClassifier, {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(1, 30, 5)),
    'min_samples_leaf': list(range(1, 30, 5))
})

print('DT -------------------')
print('Hyper:', hyper)
print('Score:', score)

### Adaboost 

In [None]:
# Tune AdaBoost over 1..49 estimators and three learning rates.
hyper, score = results(AdaBoostClassifier, {
    'n_estimators': range(1, 50),
    'learning_rate': [0.1, 1, 1.5]
})

print('ADA -------------------')
print('Hyper:', hyper)
print('Score:', score)

### XGBoost

In [None]:
# Tune XGBoost over tree depth (0..29) and two learning rates.
# NOTE(review): 'num_rounds' does not look like a documented XGBClassifier
# parameter (the scikit-learn wrapper uses n_estimators for the number of
# boosting rounds) — confirm it has any effect.
# NOTE(review): range(30) includes max_depth=0 — confirm that value is meant
# to be part of the grid.
hyper, score = results(XGBClassifier, {
    'max_depth': range(30),
    'learning_rate': [0.1, 1],
    'num_rounds': [1]
})

print('XGB -------------------')
print('Hyper:', hyper)
print('Score:', score)

### Stacking

In [None]:
# Hard-coded hyperparameters for the final models.
# NOTE(review): presumably copied from the grid-search cells above — confirm.
# NOTE(review): 'num_rounds' does not look like a documented XGBClassifier
# parameter (the scikit-learn wrapper uses n_estimators) — confirm it has any effect.
xgb_opt = {'learning_rate': 0.1, 'max_depth': 27, 'num_rounds': 10}
optXGB = XGBClassifier(**xgb_opt)

ada_opt = {'learning_rate': 1.5, 'n_estimators': 49}
optADA = AdaBoostClassifier(**ada_opt)

optBNB = BernoulliNB()

# First-level models whose outputs become the stacked features.
estimators = [('ada', optADA), ('bnb', optBNB)]

stack = StackingTransformer(estimators, regression=False, verbose=2)
stack = stack.fit(xTrain, yTrain)

# Stacked feature matrices for the train and test splits.
S_train = stack.transform(xTrain)
S_test = stack.transform(xTest)

# XGBoost is the second-level model, trained on the stacked features.
optXGB.fit(S_train, yTrain)
yHat = optXGB.predict(S_test)
# Rebinds the global yTest to a flat array; later cells see this flattened version.
yTest = np.array(yTest).flatten()
score = f1_score(yTest, yHat, average='macro')
print(score)

### Ensemble

In [None]:
# Hard-voting ensemble of the three models: each one casts a vote per sample
# and the majority label wins.
ensemble = VotingClassifier(estimators=[('xgb', optXGB), ('ada', optADA), ('bnb', optBNB)], voting='hard')
# Fit and predict through ``ensemble`` itself; the earlier ``ensemble1`` /
# ``ensemble1_prediction`` names were never defined and raised NameError.
ensemble = ensemble.fit(xTrain, yTrain)
ensemble_prediction = ensemble.predict(xTest)
score = f1_score(yTest, ensemble_prediction, average='macro')
print(score)