In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Each row of the CSV table is one person's data sample (feature vector).
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file location exists.
csv_path = '/Users/antoniogondim/Downloads/MachineLearning_UM/hw3s/cs_data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [3]:
# Fill the missing NumberOfDependents values with the column mean.
# The result is assigned back to the column instead of calling an inplace
# method on `df.NumberOfDependents`: that is chained assignment, which raises a
# FutureWarning in recent pandas and leaves `df` unchanged under Copy-on-Write.
# NOTE(review): the mean is computed on the full table, before the train/test split.
df['NumberOfDependents'] = df['NumberOfDependents'].fillna(df['NumberOfDependents'].mean())

In [4]:
# Fill the missing MonthlyIncome values with the column mean.
# The result is assigned back to the column instead of calling an inplace
# method on `df.MonthlyIncome`: that is chained assignment, which raises a
# FutureWarning in recent pandas and leaves `df` unchanged under Copy-on-Write.
# NOTE(review): the mean is computed on the full table, before the train/test split.
df['MonthlyIncome'] = df['MonthlyIncome'].fillna(df['MonthlyIncome'].mean())

In [5]:
# Number of missing values remaining in each column.
df.isna().sum()

id                                      0
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

In [6]:
# Build the input X by removing two columns from the table:
#   'SeriousDlqin2yrs' - the target/class label, so it must not be an input feature
#   'id'               - a person's id, which is useless for the classification task
X = df.drop(columns=['SeriousDlqin2yrs', 'id'])
X.head()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [7]:
# The target/class label column becomes Y.
Y = df.loc[:, 'SeriousDlqin2yrs']
Y.head()

0    1
1    0
2    0
3    0
4    0
Name: SeriousDlqin2yrs, dtype: int64

In [8]:
# Convert the pandas DataFrame/Series to numpy arrays
# (sklearn functions may not work well with pandas data types).
# np.asarray is used instead of `.values` so that this cell can be re-run:
# once X and Y are already numpy arrays they no longer have a `.values`
# attribute, whereas np.asarray simply returns them as they are.
X = np.asarray(X)
Y = np.asarray(Y)

In [9]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all samples as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# Step 2: split what is left into a 'pure' training set and a validation set (10%).
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=0)

# Report the shape of every split.
for split_label, X_split, Y_split in (('train:', X_train, Y_train),
                                      ('validation:', X_val, Y_val),
                                      ('test:', X_test, Y_test)):
    print(split_label, X_split.shape, Y_split.shape)

train: (108000, 10) (108000,)
validation: (12000, 10) (12000,)
test: (30000, 10) (30000,)


In [10]:
from sklearn.preprocessing import MinMaxScaler

# Feature normalization for the training, validation and test sets.
# The scaler is fitted on X_train only (think about why fit to X_train, not X?);
# the same fitted scaler is then applied to all three sets.
scaler = MinMaxScaler().fit(X_train)
X_train, X_val, X_test = (scaler.transform(split) for split in (X_train, X_val, X_test))

In [11]:
from sklearn.ensemble import RandomForestClassifier


### Perform grid search for more than one hyper-parameter
We will optimize more than one hyper-parameter of random forest <br>
To simplify this sub-task, we will consider only the following hyper-parameters: <br>
`max_depth` <br>
`min_samples_split` <br>
`min_samples_leaf` <br>
`max_features` <br>
`max_samples` <br>

In [12]:
# Candidate values for each random-forest hyper-parameter that is optimized.
list1 = [1, 10, 100]               # max_depth
list2 = [2, 5, 10]                 # min_samples_split
list3 = [1, 5, 10]                 # min_samples_leaf
list4 = ["sqrt", "log2", None]     # max_features
list5 = [0.1, 0.5, 0.9]            # max_samples
n_list = np.arange(10, 100, 10)    # n_estimators: 10, 20, ..., 90

# Grid handed to the search; class_weight is always 'balanced' to handle class-imbalance.
param_grid = dict(
    max_depth=list1,
    min_samples_split=list2,
    min_samples_leaf=list3,
    max_features=list4,
    max_samples=list5,
    class_weight=['balanced'],
    n_estimators=n_list,
)

In [13]:
from sklearn.metrics import confusion_matrix

def weighted_accuracy(confusion):
    """Return the class-weighted (balanced) classification accuracy.

    Every row of the confusion matrix is normalized by its row sum, which gives
    the per-class recall on the diagonal; the result is the mean of those
    recalls over the classes that have at least one true sample.

    The computation is done in float64 (the previous float32 arithmetic rounded
    the score to ~7 significant digits) and rows with no samples are skipped
    explicitly instead of relying on a small epsilon in the denominator.

    Parameters
    ----------
    confusion : array-like of shape (n_classes, n_classes)
        Confusion matrix; row k holds the samples whose true class is k.
        The input is not modified.

    Returns
    -------
    float
        Mean per-class recall in [0, 1], or NaN when the matrix contains no
        samples at all.
    """
    M = np.asarray(confusion, dtype=np.float64)
    support = M.sum(axis=1)          # number of true samples per class
    has_samples = support > 0
    if not has_samples.any():
        # No sample in any class: the accuracy is undefined.
        return float('nan')
    recall = M.diagonal()[has_samples] / support[has_samples]
    return float(recall.mean())

def my_scorer(model, X, Y_true):
    """Score `model` on (X, Y_true) with the weighted classification accuracy.

    The model's predictions for X are compared against Y_true in a confusion
    matrix, which is then reduced to a single number by `weighted_accuracy`.
    The (model, X, Y_true) argument order lets the function be passed as the
    `scoring` callable of the grid search.
    """
    return weighted_accuracy(confusion_matrix(Y_true, model.predict(X)))

In [14]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold cross-validated search over param_grid.
# - random_state=0 seeds the forest (same seed as the data splits), so the
#   search gives the same scores and the same best model on every run.
# - n_jobs=-1 runs the candidate fits on all CPU cores; the grid is large.
gs = GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                  param_grid=param_grid,
                  # scoring='accuracy' would calculate standard accuracy for training and validation
                  scoring=my_scorer,
                  cv=5,
                  n_jobs=-1)


In [None]:
# Run the grid search on the training set.
gs.fit(X_train, y=Y_train)

In [None]:
# Mean cross-validation score reported by the grid search for its candidates.
cv_results = gs.cv_results_
acc_val_list = cv_results['mean_test_score']


In [None]:
import matplotlib.pyplot as plt

# gs.cv_results_['mean_test_score'] has one entry per hyper-parameter
# combination, not one per value of n_estimators, so it cannot be plotted
# against n_list directly (the lengths differ and ax.plot raises a ValueError).
# Summarize the search per n first: for every n_estimators value keep the best
# mean validation score reached over all the other hyper-parameters.
cv_table = pd.DataFrame(gs.cv_results_)
best_acc_by_n = cv_table.groupby('param_n_estimators')['mean_test_score'].max()

fig, ax = plt.subplots()
ax.plot(best_acc_by_n.index.astype(int), best_acc_by_n.to_numpy(), marker='o')
ax.set_title('val accuracy vs n')
ax.set_xlabel('n')
ax.set_ylabel('accuracy')

In [None]:
# Best number of trees found by the grid search.
# The argmax over gs.cv_results_['mean_test_score'] is an index into the list
# of all hyper-parameter combinations, so it must not be used to index n_list
# (IndexError or a wrong n); the search already records the winning value.
n_best = gs.best_params_['n_estimators']

In [None]:
# Keep the best model found by the grid search.
model_best = gs.best_estimator_

In [None]:
# Classification accuracy of the best model on the training set.
acc_train = model_best.score(X=X_train, y=Y_train)

In [None]:
# Classification accuracy of the best model on the test set.
acc_test = model_best.score(X=X_test, y=Y_test)

In [None]:
# With class-imbalance the weighted accuracy has to be used, so the test set
# is also scored with the custom scorer.
acc_test_weighted = my_scorer(model=model_best, X=X_test, Y_true=Y_test)