# Homework 3. Carlos Alvarado

### Initial Settings

In [None]:
import sys
# Add ./source to the module search path so the step1..step6 helper modules
# imported below can be resolved (presumably they live in that directory).
sys.path.append('./source')

from step1_read_data import read_data
from step2_explore import describe_column, tabular, histogram, print_explore
from step3_preprocess import fill_miss, convert_column_type
from step4_create_features import discretize, make_dummies
from step5_classifiers import make_model
from step6_evaluate import evaluate, precision_at_k

%matplotlib inline


### Config Variables

In [None]:
# Location of the input CSV file.
FILENAME = 'data/credit-data.csv'

# Full search space: model key -> {hyper-parameter name: candidate values}.
large_grid = {
    'RF': {
        'n_estimators': [1, 10, 100, 1000, 10000],
        'max_depth': [1, 5, 10, 20, 50, 100],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
    },
    'BA': {},
    'AB': {
        'algorithm': ['SAMME', 'SAMME.R'],
        'n_estimators': [1, 10, 100, 1000, 10000],
    },
    'LR': {
        'penalty': ['l1', 'l2'],
        'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
    },
    'SVM': {
        'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
        'kernel': ['linear'],
    },
    'DT': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [1, 5, 10, 20, 50, 100],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
    },
    'KNN': {
        'n_neighbors': [1, 5, 10, 25, 50, 100],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    },
}

# Minimal grid (one value per hyper-parameter) for a quick end-to-end run.
test_grid = {
    'RF': {
        'n_estimators': [1],
        'max_depth': [1],
        'max_features': ['sqrt'],
        'min_samples_split': [10],
    },
    'BA': {},
    'AB': {
        'algorithm': ['SAMME'],
        'n_estimators': [1],
    },
    'LR': {
        'penalty': ['l1'],
        'C': [0.01],
    },
    'SVM': {
        'C': [0.01],
        'kernel': ['linear'],
    },
    'DT': {
        'criterion': ['gini'],
        'max_depth': [1],
        'max_features': ['sqrt'],
        'min_samples_split': [10],
    },
    'KNN': {
        'n_neighbors': [5],
        'weights': ['uniform'],
        'algorithm': ['auto'],
    },
}

In [None]:
# Display the full hyper-parameter grid for inspection.
large_grid

### Step 1. Read Data

In [None]:
# Load the dataset at FILENAME through the project's read_data helper.
# Presumably this returns a pandas DataFrame (it is indexed with .loc and
# column labels further down) -- confirm against step1_read_data.
data = read_data(FILENAME)

### Step 2. Explore data

In [None]:
# Run the project's exploration helper on the loaded data.
print_explore(data)

### Step 3. Pre-process data

In [None]:
# Convert SeriousDlqin2yrs to booleans (this is Y, the outcome variable)
convert_column_type(data, 'SeriousDlqin2yrs', 'bool', value_if_true = 1)

# Convert NumberOfDependents to integers
# NOTE(review): this runs before the missing values of NumberOfDependents are
# filled in a later cell; confirm convert_column_type tolerates missing values.
convert_column_type(data, 'NumberOfDependents', 'int')

In [None]:
# Debt Ratio over 10000% is very unlikely, so blank those values out and let
# the missing-value step fill them in.
# The row mask and the column are passed to one .loc call on the frame itself.
# The chained form data['DebtRatio'].loc[mask] = ... assigns into an
# intermediate Series, and pandas does not guarantee that write reaches `data`
# (SettingWithCopyWarning; a no-op under copy-on-write).
data.loc[data['DebtRatio'] > 10000, 'DebtRatio'] = None

# Note to myself: create winsorize function for next assignment

In [None]:
# Variables with missing values: MonthlyIncome, NumberOfDependents, and
# DebtRatio (whose implausibly large values were blanked out earlier).
# Each call rebinds `data` to the value returned by fill_miss.
data = fill_miss(data, 'MonthlyIncome', method='mean')
data = fill_miss(data, 'NumberOfDependents', method='median')
data = fill_miss(data, 'DebtRatio', method='median')

# Note to me: add more methods in the future: nearest neighbor or other classification method

### Step 4. Create Features

In [None]:
# Bin the two continuous variables with the project's discretize helper.
# Presumably this adds '<column>_cat' columns to `data` in place (the next
# cell reads 'DebtRatio_cat' and 'MonthlyIncome_cat') -- confirm in
# step4_create_features.
discretize(data, 'MonthlyIncome', nbins=5, cut_type='quantile')
discretize(data, 'DebtRatio', nbins=5, cut_type='logspace')

In [None]:
# Create dummy (indicator) columns for the categorical variables.
# Presumably the new columns are named '<column>_<value>' and added to `data`
# in place, matching the FEATURES list used for modelling -- TODO confirm.
make_dummies(data, 'NumberOfDependents')
make_dummies(data, 'DebtRatio_cat')
make_dummies(data, 'MonthlyIncome_cat')

In [None]:
# Get column names to select features
data.columns

### Steps 5 and 6. Classify and Evaluate

In [None]:
# Import the model-selection helpers from their current location, falling back
# to the legacy modules used by old scikit-learn releases. Only ImportError is
# caught so that unrelated failures are not silently swallowed.
try:
    from sklearn.model_selection import ParameterGrid, train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import ParameterGrid

from sklearn.metrics import roc_auc_score

import pandas as pd

# Columns used as predictors.
FEATURES = ['NumberOfDependents', 'age', 'MonthlyIncome_cat_0',
       'MonthlyIncome_cat_1', 'MonthlyIncome_cat_2', 'MonthlyIncome_cat_3',
       'MonthlyIncome_cat_4', 'DebtRatio_cat_0',
       'DebtRatio_cat_1', 'DebtRatio_cat_2', 'DebtRatio_cat_3',
       'DebtRatio_cat_4', 'DebtRatio_cat_5']

# Keys into GRID; each one is also passed to make_model().
MODELS_TO_RUN = ['RF', 'BA', 'AB', 'LR', 'SVM', 'DT', 'KNN']

# Switch to large_grid for the full hyper-parameter search.
GRID = test_grid

X = data.loc[:, FEATURES]

Y = data['SeriousDlqin2yrs']

# Hold out 20% of the rows for evaluation. random_state=None means the split
# (and therefore every metric below) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=None)

# One row per (model, parameter combination).
results_df = pd.DataFrame(columns=(
        'model_type', 'clf', 'parameters', 'auc-roc', 'p_at_5', 'p_at_10', 'p_at_20'))

for model_name in MODELS_TO_RUN:
    print(model_name)
    clf = make_model(model_name)
    parameter_values = GRID[model_name]

    for p in ParameterGrid(parameter_values):
        clf.set_params(**p)
        # Probability of the positive class for every test row.
        y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
        # Rank by predicted probability only. Sorting the raw (prob, label)
        # tuples would break ties on the true label and place positives first
        # among equal scores.
        ranked = sorted(zip(y_pred_probs, y_test),
                        key=lambda pair: pair[0], reverse=True)
        y_pred_probs_sorted, y_test_sorted = zip(*ranked)
        # NOTE(review): the same `clf` object is reused for every parameter
        # combination of a model, so all of that model's rows reference one
        # instance; the 'parameters' column is what distinguishes them.
        results_df.loc[len(results_df)] = [
            model_name, clf, p,
            roc_auc_score(y_test, y_pred_probs),
            precision_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
            precision_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
            precision_at_k(y_test_sorted, y_pred_probs_sorted, 20.0)]


In [None]:
# Print the fitted coefficients of the wrapped estimator.
# NOTE(review): `model` is not defined anywhere in the visible notebook; this
# cell looks like a leftover from an earlier version -- confirm or remove.
print(model.model.coef_)

### Step 6. Evaluate

In [None]:
# Score the wrapped estimator on the held-out split. The labels produced by
# train_test_split are bound to `y_test` (lowercase); `Y_test` is never defined.
# NOTE(review): `model` is not defined anywhere in the visible notebook -- confirm.
model.model.score(X_test, y_test)

In [None]:
import pandas as pd

# Class probabilities for the held-out rows.
# NOTE(review): `model` is not defined anywhere in the visible notebook -- confirm.
probas = model.model.predict_proba(X_test)

# Cross-tabulate actual labels against predictions at a 0.2 threshold on the
# positive-class probability. The held-out labels are bound to `y_test`
# (lowercase); `Y_test` is never defined.
pd.crosstab(y_test, probas[:,1] > 0.2, rownames=["Actual"], colnames=["Predicted"])

In [None]:
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve

In [None]:
# Accuracy at the 0.2 probability threshold. Uses `y_test`, the name bound by
# train_test_split; `Y_test` is never defined.
accuracy_score(y_test, probas[:,1] > 0.2)

In [None]:
# Per-class precision/recall/F1 at the 0.2 probability threshold. Uses
# `y_test`, the name bound by train_test_split; `Y_test` is never defined.
print(classification_report(y_test, probas[:,1] > 0.2))

In [None]:
# Precision/recall pairs over all thresholds of the positive-class probability.
# Uses `y_test`, the name bound by train_test_split; `Y_test` is never defined.
precision_recall_curve(y_test, probas[:,1])

In [None]:
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix

# Confusion matrix of the estimator's default predictions on the held-out
# split. Uses `y_test`, the name bound by train_test_split; `Y_test` is never
# defined.
# NOTE(review): `model` is not defined anywhere in the visible notebook -- confirm.
confusion_matrix(y_test, model.model.predict(X_test))


In [None]:
# Rebind `probas` to the class probabilities for the TRAINING rows (earlier
# cells computed it on X_test).
# NOTE(review): `model` is not defined anywhere in the visible notebook -- confirm.
probas = model.model.predict_proba(X_train)

In [None]:
# Count the rows whose positive-class probability exceeds 0.2.
sum(probas[:,1] > 0.2)

In [None]:
# Show the predicted labels for the held-out rows.
# NOTE(review): `model` is not defined anywhere in the visible notebook -- confirm.
model.model.predict(X_test)