In [1]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.tree as tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
import hw2_pipeline



In [2]:
# get data
cred_df = pd.read_csv('data/credit_data.csv')

# predictor
predictor = 'SeriousDlqin2yrs'
# columns to ignore for some functions that include the final predictor and create the
# dataframe without them
ignore_with_predictor = ['zipcode', 'PersonID', 'SeriousDlqin2yrs']
df_sans_ignore_with_predictor = cred_df.drop(ignore_with_predictor, axis=1)

# columns to ignore for some functions without the final predictor and create the
# dataframe without them
ignore_without_predictor = ['zipcode', 'PersonID']
df_sans_ignore_without_predictor = cred_df.drop(ignore_without_predictor, axis=1)

In [3]:
# run correlations on each of the columns to see what's going on in the data
# with the final predictor
hw2_pipeline.get_correlations(df_sans_ignore_with_predictor)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
RevolvingUtilizationOfUnsecuredLines,1.0,-0.008003,-0.001999,0.02225,0.005832,-0.01459,-0.001686,0.004763,-0.001413,0.005342
age,-0.008003,1.0,-0.068696,0.038828,0.048138,0.159866,-0.069036,0.049168,-0.063622,-0.211002
NumberOfTime30-59DaysPastDueNotWorse,-0.001999,-0.068696,1.0,-0.01162,-0.015224,-0.070704,0.984465,-0.037863,0.98853,-0.00784
DebtRatio,0.02225,0.038828,-0.01162,1.0,-0.022988,0.082791,-0.01479,0.177858,-0.01329,-0.070558
MonthlyIncome,0.005832,0.048138,-0.015224,-0.022988,1.0,0.1071,-0.017954,0.127313,-0.015336,0.060528
NumberOfOpenCreditLinesAndLoans,-0.01459,0.159866,-0.070704,0.082791,0.1071,1.0,-0.098176,0.442776,-0.087154,0.060218
NumberOfTimes90DaysLate,-0.001686,-0.069036,0.984465,-0.01479,-0.017954,-0.098176,1.0,-0.054661,0.992143,-0.015737
NumberRealEstateLoansOrLines,0.004763,0.049168,-0.037863,0.177858,0.127313,0.442776,-0.054661,1.0,-0.047996,0.11488
NumberOfTime60-89DaysPastDueNotWorse,-0.001413,-0.063622,0.98853,-0.01329,-0.015336,-0.087154,0.992143,-0.047996,1.0,-0.016493
NumberOfDependents,0.005342,-0.211002,-0.00784,-0.070558,0.060528,0.060218,-0.015737,0.11488,-0.016493,1.0


In [4]:
# run correlations on each of the columns to see what's going on in the data without the final
# predictor
hw2_pipeline.get_correlations(df_sans_ignore_without_predictor)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
SeriousDlqin2yrs,1.0,-0.004586,-0.173728,0.149334,-0.013502,-0.03281,-0.039898,0.139609,-0.010641,0.121886,0.065708
RevolvingUtilizationOfUnsecuredLines,-0.004586,1.0,-0.008003,-0.001999,0.02225,0.005832,-0.01459,-0.001686,0.004763,-0.001413,0.005342
age,-0.173728,-0.008003,1.0,-0.068696,0.038828,0.048138,0.159866,-0.069036,0.049168,-0.063622,-0.211002
NumberOfTime30-59DaysPastDueNotWorse,0.149334,-0.001999,-0.068696,1.0,-0.01162,-0.015224,-0.070704,0.984465,-0.037863,0.98853,-0.00784
DebtRatio,-0.013502,0.02225,0.038828,-0.01162,1.0,-0.022988,0.082791,-0.01479,0.177858,-0.01329,-0.070558
MonthlyIncome,-0.03281,0.005832,0.048138,-0.015224,-0.022988,1.0,0.1071,-0.017954,0.127313,-0.015336,0.060528
NumberOfOpenCreditLinesAndLoans,-0.039898,-0.01459,0.159866,-0.070704,0.082791,0.1071,1.0,-0.098176,0.442776,-0.087154,0.060218
NumberOfTimes90DaysLate,0.139609,-0.001686,-0.069036,0.984465,-0.01479,-0.017954,-0.098176,1.0,-0.054661,0.992143,-0.015737
NumberRealEstateLoansOrLines,-0.010641,0.004763,0.049168,-0.037863,0.177858,0.127313,0.442776,-0.054661,1.0,-0.047996,0.11488
NumberOfTime60-89DaysPastDueNotWorse,0.121886,-0.001413,-0.063622,0.98853,-0.01329,-0.015336,-0.087154,0.992143,-0.047996,1.0,-0.016493


In [5]:
# commented this out because it takes too much time to run, but it helped me see correlations better
# hw2_pipeline.make_scatter(cred_df)

In [6]:
# make boxplots without predictor
# commented out so it's not taking up space
# I used these to see the relationship between each of the variables and the predictor
# hw2_pipeline.make_boxplots(df_sans_ignore_without_predictor, predictor)

In [7]:
# dealing with outliers: calculate z-scores for each data point,
# then determine how many datapoints in each row are outliers.
# outlier for this project is defined as having a zscore greater than 1.96.
zscore_df = hw2_pipeline.make_zscore_df(df_sans_ignore_with_predictor)
zscore_df.head()

Unnamed: 0,num_outliers,RevolvingUtilizationOfUnsecuredLines_zscore,age_zscore,NumberOfTime30-59DaysPastDueNotWorse_zscore,DebtRatio_zscore,MonthlyIncome_zscore,NumberOfOpenCreditLinesAndLoans_zscore,NumberOfTimes90DaysLate_zscore,NumberRealEstateLoansOrLines_zscore,NumberOfTime60-89DaysPastDueNotWorse_zscore,NumberOfDependents_zscore
0,,-0.024257,0.224896,-0.113192,0.133894,-0.48926,-1.229706,-0.08084,-0.87431,-0.071879,-0.689603
1,,-0.026298,1.309871,-0.113192,-0.255379,0.675773,-0.26952,-0.08084,0.859054,-0.071879,-0.689603
2,,-0.02857,-0.046348,-0.113192,-0.255722,-0.176919,-0.653594,-0.08084,-0.87431,-0.071879,-0.689603
3,,-0.024644,0.224896,0.655208,-0.25512,0.18391,0.690666,-0.08084,1.725736,-0.071879,-0.689603
4,,-0.028649,-0.453214,-0.113192,-0.255695,0.284603,-0.845631,-0.08084,-0.87431,-0.071879,1.09409


In [8]:
# tell me which rows have many outliers or any outliers in them
score = 1.96 # set the zscore or outliers to 1.96. while the normal zscore for outliers is 3, there are no data with a zscore of 3, but there are outliers int he data (the boxplots show that), so wanted to capture those.
rows_with_many_outliers, rows_with_any_outliers = hw2_pipeline.calculate_outliers_per_row(zscore_df, score, 4)

In [9]:
# create dataframe minus people with many outliers then rerun boxplots to see if anything changes
cred_df_less_outliers = cred_df.drop(rows_with_many_outliers)
# commented out so it's not taking up space
#hw2_pipeline.make_boxplots(cred_df_less_outliers, predictor)

In [10]:
# re run scatter plots to see what's up
# commented this out because it takes too much time to run, but it helped me see correlations better
# hw2_pipeline.make_scatter(cred_df_less_outliers)

In [11]:
# create dataframe minus people with man outliers then rerun boxplots to see if anything changes
cred_df_no_outliers = cred_df.drop(rows_with_any_outliers)
# commented out so it's not taking up space
#hw2_pipeline.make_boxplots(cred_df_no_outliers, predictor)

In [12]:
# re run scatter to see what's up
# commented this out because it takes too much time to run, but it helped me see correlations better
# hw2_pipeline.make_scatter(cred_df_no_outliers)

In [13]:
# tell me which columns have NAs in them
are_there_nas = hw2_pipeline.find_nas(cred_df)
#well = are_there_nas.index[are_there_nas].tolist()
print(are_there_nas)


PersonID                                False
SeriousDlqin2yrs                        False
RevolvingUtilizationOfUnsecuredLines    False
age                                     False
zipcode                                 False
NumberOfTime30-59DaysPastDueNotWorse    False
DebtRatio                               False
MonthlyIncome                            True
NumberOfOpenCreditLinesAndLoans         False
NumberOfTimes90DaysLate                 False
NumberRealEstateLoansOrLines            False
NumberOfTime60-89DaysPastDueNotWorse    False
NumberOfDependents                       True
dtype: bool


In [14]:
cols_with_nas = hw2_pipeline.cols_with_nas(are_there_nas)
#are_there_nhw2_pipeline.as.index[are_there_nas].tolist()
print(cols_with_nas)

['MonthlyIncome', 'NumberOfDependents']


In [15]:
# options for dataframes: cred_df_no_outliers, cred_df_less_outliers, cred_df
# options for stat = mean or median

#fill columns with NAs in them with a number (mean, median)

nas_filled = hw2_pipeline.fillnas_with_data(cred_df, 'median', cols_with_nas)
nas_filled_less_outliers = hw2_pipeline.fillnas_with_data(cred_df_less_outliers, 'median', cols_with_nas)
nas_filled_no_outliers = hw2_pipeline.fillnas_with_data(cred_df_no_outliers, 'median', cols_with_nas)

nas_filled.head()

# I chose to use median because it is less likely to be affected by outliers.
# Before creating this function, I check the median and mean of columns with missing data on the full dataset, 
# the dataset with less outliers, and the dataset with no outliers (outliers as defined above). 
# I chose median and the full dataset because the median wasn't as affected by outliers, and in case outliers
# wasn't someone lying, I wanted the algorithm to account for that. Those sound like opposites, but mainly I wanted to 
# choose the stable one between median and mean, while still accounting for all data/not overfitting the data.

Unnamed: 0,PersonID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,98976,0,1.0,55,60601,0,505.0,0.0,2,0,0,0,0.0
1,98991,0,0.547745,71,60601,0,0.459565,15666.0,7,0,2,0,0.0
2,99012,0,0.04428,51,60601,0,0.01452,4200.0,5,0,0,0,0.0
3,99023,0,0.914249,55,60601,4,0.794875,9052.0,12,0,3,0,0.0
4,99027,0,0.026599,45,60601,0,0.049966,10406.0,4,0,0,0,2.0


In [16]:
# check to make sure nas were filled.
hw2_pipeline.find_nas(cred_df)
hw2_pipeline.find_nas(cred_df_no_outliers)
hw2_pipeline.find_nas(cred_df_less_outliers)

PersonID                                False
SeriousDlqin2yrs                        False
RevolvingUtilizationOfUnsecuredLines    False
age                                     False
zipcode                                 False
NumberOfTime30-59DaysPastDueNotWorse    False
DebtRatio                               False
MonthlyIncome                           False
NumberOfOpenCreditLinesAndLoans         False
NumberOfTimes90DaysLate                 False
NumberRealEstateLoansOrLines            False
NumberOfTime60-89DaysPastDueNotWorse    False
NumberOfDependents                      False
dtype: bool

In [17]:
# discretize variable of choice by any buckets with equal-width bins or quantile bins
# options for dataframes: cred_df_no_outliers, cred_df_less_outliers, cred_df
#options for cut_type: cut, Cut, qcut, Qcut
# if using cut for the cut_type:
    # options for col_names = 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 
        # 'DebtRatio' (use with less or no outliers dataframes because data are too concentrated),
        # 'MonthlyIncome' (use with less or no outliers dataframes because data are too concentrated),
        # 'NumberOfOpenCreditLinesAndLoans' (get more variety with less or no outliers, but fine with all data, too)
        # 'NumberRealEstateLoansOrLines'(use with less or no outliers dataframes because data are too concentrated), 
        # 'NumberOfDependents' (use with less or no outliers dataframes or with higher number of buckets (10+)
            # because data are too concentrated)
# if using qcut for cut_type:
    # options for col_names ='age', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans'

#hw2_pipeline.discretize_variables(cred_df_less_outliers, 'age', 10, 'cut')
#cred_df.head()

# Wanted the user to have the choice of either cut or qcut. I chose cut because I didn't want each bin to have the same 
    # number of entries, and I chose age because I'm curious how the analysis changes by age range.

In [18]:
# create binary/dummy variables from categorical variable
dummy_data = hw2_pipeline.dummify_categories(cred_df_less_outliers, "zipcode") 
dummy_data.head()

Unnamed: 0,zipcode_60601,zipcode_60618,zipcode_60625,zipcode_60629,zipcode_60637,zipcode_60644
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0


In [19]:
# build classifer: For this assignment, select any classifier you feel comfortable with
X = cred_df_less_outliers.drop(ignore_with_predictor, axis=1)
Y = cred_df_less_outliers[predictor]
test_size = 0.3
n_neighbors = 10
metric = 'minkowski'
metric_params = {'p': 3}
knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric, metric_params=metric_params)

x_train, x_test, y_train, y_test = hw2_pipeline.knn_model(X, Y, test_size, knn)

  


In [20]:
# fit the model
fit = hw2_pipeline.knn_fit(x_train, y_train, knn)
print(fit)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params={'p': 3}, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')


In [21]:
# tell me the prediction probability
prediction = hw2_pipeline.predict_classifier(fit, X, knn)
print(prediction)

[[1.  0. ]
 [0.9 0.1]
 [0.9 0.1]
 ...
 [1.  0. ]
 [0.9 0.1]
 [0.9 0.1]]


In [22]:
# tell me the accuracy
accuracy = hw2_pipeline.accuracy_classifier(fit, x_test, y_test, knn)
print(accuracy)

0.8374644453474197
