# Homework 2. Carlos Alvarado

### Initial Settings

In [1]:
import sys
sys.path.append('./source')

from step1_read_data import read_data
from step2_explore import describe_column, tabular, histogram
from step3_preprocess import fill_miss, convert_column_type
from step4_create_features import discretize, make_dummies
from step5_classifiers import MyClassifier
from step6_evaluate import evaluate

### Step 1. Read Data

In [2]:
# Load the credit data set from its CSV file into a pandas object
data_path = 'data/credit-data.csv'
data = read_data(data_path)

### Step 2. Explore data

In [None]:
# Get column names and types (shown as the cell's result)
data.dtypes

In [None]:
# For every column: print a separator and a heading, then the
# describe_column report for that column.
for varname in data.columns:
    print('-----------------------------------------',
          'Describing variable: {}'.format(varname),
          sep='\n')
    describe_column(data, varname)

In [16]:
# Look at DebtRatio on its own. In the recorded output the maximum (61106.5)
# is far above the 75th percentile (~0.48), i.e. the column has a long
# right tail.
describe_column(data, 'DebtRatio')

count    120269.000000
mean         26.598777
std         424.446457
min           0.000000
25%           0.143388
50%           0.296023
75%           0.482559
max       61106.500000
Name: DebtRatio, dtype: float64

Unique values: 109658

Missing values: 0


### Step 3. Pre-process data

In [3]:
# Convert SeriousDlqin2yrs to booleans
# (value_if_true presumably names the raw value that maps to True --
# confirm in step3_preprocess).
convert_column_type(data, 'SeriousDlqin2yrs', 'bool', value_if_true = 1)

# Convert NumberOfDependents to integers
# NOTE(review): the return value is only displayed, not assigned, and the
# recorded make_dummies output later in this notebook still lists the
# NumberOfDependents values as numpy.float64 -- confirm whether this call
# actually changes `data` in place.
convert_column_type(data, 'NumberOfDependents', 'int')

0         2
1         1
2         0
3         0
4         0
5         1
6         0
7         0
9         2
10        0
11        2
12        2
13        2
14        0
15        2
16        0
17        0
18        2
19        0
20        0
21        2
22        0
23        0
24        0
25        1
26        0
27        1
28        0
29        0
30        0
         ..
149970    2
149971    0
149972    1
149973    1
149974    0
149975    0
149976    0
149977    0
149978    0
149979    0
149980    0
149981    4
149982    3
149983    0
149984    0
149985    0
149986    1
149987    0
149988    0
149989    3
149990    2
149991    0
149992    3
149993    0
149994    0
149995    0
149996    2
149997    0
149998    0
149999    0
Name: NumberOfDependents, dtype: int64


In [5]:
# Columns with missing values, paired with the fill method requested for
# each: MonthlyIncome -> 'mean', NumberOfDependents -> 'median'.
fill_methods = (
    ('MonthlyIncome', 'mean'),
    ('NumberOfDependents', 'median'),
)
for column, method in fill_methods:
    data = fill_miss(data, column, method=method)

# TODO: add more fill methods in the future (nearest neighbor or another
# classification method).

In [None]:
# TODO: winsorize some variables (to limit the influence of possible outliers)


In [None]:
# Scratch check: the non-missing NumberOfDependents values cast to int
# (displayed only, nothing is assigned back to `data`).
data['NumberOfDependents'].dropna().astype(int)

### Step 4. Create Features

In [17]:
# Bin MonthlyIncome (5 bins, cut_type='quantile') and DebtRatio (5 bins,
# cut_type='logspace'). The recorded data.columns output lists
# MonthlyIncome_cat and DebtRatio_cat, so the binned values appear to be
# stored in `<column>_cat` columns.
# NOTE(review): the recorded make_dummies output for DebtRatio_cat contains
# a nan category, and the lowest bin starts at 0.0001 while the recorded
# minimum of DebtRatio is 0 -- zero values are probably left unbinned by
# the 'logspace' cut; confirm in step4_create_features.
discretize(data, 'MonthlyIncome', nbins=5, cut_type='quantile')
discretize(data, 'DebtRatio', nbins=5, cut_type='logspace')



In [18]:
# Create dummy columns for the dependents count and for the DebtRatio bins.
# The recorded data.columns output shows the results as `<column>_<k>`
# columns; the lines printed by each call look like the mapping from k to
# the category value (presumably -- confirm in step4_create_features).
make_dummies(data, 'NumberOfDependents')
make_dummies(data, 'DebtRatio_cat')

0 <class 'numpy.float64'> 2.0
1 <class 'numpy.float64'> 1.0
2 <class 'numpy.float64'> 0.0
3 <class 'numpy.float64'> 3.0
4 <class 'numpy.float64'> 4.0
5 <class 'numpy.float64'> 5.0
6 <class 'numpy.float64'> 6.0
7 <class 'numpy.float64'> 8.0
8 <class 'numpy.float64'> 7.0
9 <class 'numpy.float64'> 20.0
10 <class 'numpy.float64'> 10.0
11 <class 'numpy.float64'> 9.0
12 <class 'numpy.float64'> 13.0
0 <class 'str'> (0.327, 18.692]
1 <class 'str'> (0.00572, 0.327]
2 <class 'str'> (18.692, 1068.735]
3 <class 'str'> (1068.735, 61106.5]
4 <class 'str'> [0.0001, 0.00572]
5 <class 'float'> nan


In [8]:
# Create dummy columns for the MonthlyIncome bins (MonthlyIncome_cat_0 ...
# MonthlyIncome_cat_4 in the recorded data.columns output).
make_dummies(data, 'MonthlyIncome_cat')

0 <class 'str'> (9083, 3008750]
1 <class 'str'> [0, 3000]
2 <class 'str'> (3000, 4544.2]
3 <class 'str'> (6300, 9083]
4 <class 'str'> (4544.2, 6300]


In [19]:
# List every column currently in `data`, including the *_cat and dummy
# columns created in this step.
data.columns

Index(['PersonID', 'SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines',
       'age', 'zipcode', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
       'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
       'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
       'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
       'MonthlyIncome_cat', 'NumberOfDependents_0', 'NumberOfDependents_1',
       'NumberOfDependents_2', 'NumberOfDependents_3', 'NumberOfDependents_4',
       'NumberOfDependents_5', 'NumberOfDependents_6', 'NumberOfDependents_7',
       'NumberOfDependents_8', 'NumberOfDependents_9', 'NumberOfDependents_10',
       'NumberOfDependents_11', 'NumberOfDependents_12', 'MonthlyIncome_cat_0',
       'MonthlyIncome_cat_1', 'MonthlyIncome_cat_2', 'MonthlyIncome_cat_3',
       'MonthlyIncome_cat_4', 'DebtRatio_cat', 'DebtRatio_cat_0',
       'DebtRatio_cat_1', 'DebtRatio_cat_2', 'DebtRatio_cat_3',
       'DebtRatio_cat_4', 'DebtRatio_cat_5'],
      dtype='object')

### Step 5. Classify

In [38]:
# The task here is to predict who will experience financial distress in
# the next two years. The outcome variable (label) in the data is
# SeriousDlqin2yrs. The other columns hold further information about each
# person (as described in the data dictionary). The assignment is to take
# this data and build a machine learning pipeline that trains *one*
# machine learning model on it.


# Feature matrix: the raw dependents count and age, plus the dummy columns
# built from the MonthlyIncome and DebtRatio bins.
# The columns are passed to .loc as a list: a list is the documented way to
# select several labels, whereas a tuple is how a single MultiIndex key is
# written.
feature_columns = [
    'NumberOfDependents', 'age',
    'MonthlyIncome_cat_0', 'MonthlyIncome_cat_1', 'MonthlyIncome_cat_2',
    'MonthlyIncome_cat_3', 'MonthlyIncome_cat_4',
    'DebtRatio_cat_0', 'DebtRatio_cat_1', 'DebtRatio_cat_2',
    'DebtRatio_cat_3', 'DebtRatio_cat_4', 'DebtRatio_cat_5',
]
X = data.loc[:, feature_columns]

# Label column.
Y = data['SeriousDlqin2yrs']

# MyClassifier comes from step5_classifiers; it exposes the fitted estimator
# as `.model` and the held-out split as `.X_test` / `.Y_test`.
# NOTE(review): test_size=0.3 and seed=42 presumably control the train/test
# split -- confirm in step5_classifiers.
model = MyClassifier(X, Y, 'logistic', test_size=0.3, seed=42)

# Score of the fitted estimator on the held-out split (the cell's result).
model.model.score(model.X_test, model.Y_test)

0.9302125772567279

### Step 6. Evaluate

In [21]:
# Score of the fitted estimator on the held-out split.
# NOTE(review): the recorded value (0.93406...) equals 11234 / (11234 + 793)
# from the recorded confusion matrix below, i.e. the share of negative rows,
# so on its own this score says little about how well positives are found.
model.model.score(model.X_test, model.Y_test)

0.93406502037083228

In [40]:
import pandas as pd

# Predicted probabilities for the held-out rows.
probas = model.model.predict_proba(model.X_test)

# Flag rows whose probability in column 1 exceeds 0.3, then tabulate the
# flags against the actual labels.
# NOTE(review): column 1 is presumably the positive (True) class -- confirm
# against the estimator's class order.
flagged = probas[:, 1] > 0.3
pd.crosstab(model.Y_test, flagged, rownames=["Actual"], colnames=["Predicted"])

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,33562,1
True,2518,0


In [14]:
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix

# Confusion matrix of the estimator's predictions on the held-out split
# (scikit-learn convention: rows are actual classes, columns are predicted
# classes). In the recorded output the second column is all zeros, i.e. no
# row was predicted as positive.
confusion_matrix(model.Y_test, model.model.predict(model.X_test))


array([[11234,     0],
       [  793,     0]])

In [23]:
# NOTE: this rebinds `probas` to the predicted probabilities for the
# *training* rows (X_train), replacing whatever `probas` held before.
probas = model.model.predict_proba(model.X_train)

In [33]:
# Count the rows whose probability in column 1 exceeds 0.2.
# The array's own vectorised .sum() is used instead of the builtin sum(),
# which would iterate over the array one element at a time in Python.
(probas[:, 1] > 0.2).sum()

107

In [34]:
# Class predictions for the held-out rows (the recorded output shows a
# boolean array).
model.model.predict(model.X_test)

array([False, False, False, ..., False, False, False], dtype=bool)