In [2]:
# Import pandas and plotting packages
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
import sklearn.metrics as metrics

from sklearn.model_selection import train_test_split, GridSearchCV,KFold, cross_val_score
from sklearn.metrics import r2_score,f1_score, confusion_matrix, classification_report, accuracy_score, roc_auc_score, mean_squared_error, r2_score

from sklearn.preprocessing import LabelEncoder



In [3]:
# Read in the data and print out the .head()
NewLoanDF = pd.read_csv("NewLoanData.csv")
NewLoanDF.head()

Unnamed: 0,Income,Age,Experience,Risk Flag,Married/Single,Home Ownership,Car Ownership
0,1303834,23,3,0,0,0,0
1,7574516,40,10,0,0,0,0
2,3991815,66,4,0,1,0,0
3,6256451,41,2,1,0,0,1
4,5768871,47,11,1,0,0,0


## Split data into test and training

In [4]:
#define X and y
X = NewLoanDF.drop('Risk Flag',axis=1)
y = NewLoanDF['Risk Flag']

In [5]:
#Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
print("X_train shape: {}".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))

X_train shape: (201600, 6)
X_test shape: (50400, 6)


In [7]:
# Scale the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Predict using Decision Tree Classifier

In [8]:
from sklearn.tree import DecisionTreeClassifier

dct = DecisionTreeClassifier()
dct.fit(X_train,y_train)
dct_predict = dct.predict(X_test)

In [9]:
print(f'Accuracy Score: {accuracy_score (y_test, dct_predict):.2f}')
print(f'AUC Score: {roc_auc_score (y_test, dct_predict):.2f}')
print(f'F1 Score: {f1_score(y_test,dct_predict):.2f}')

Accuracy Score: 0.88
AUC Score: 0.75
F1 Score: 0.55


In [10]:
#DecisionTree gives us a 0.88 accuracy and a .75 auc score

## Predict using Log Regression

In [11]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train,y_train)
lr_predict = lr.predict(X_test)

In [12]:
print(f'Accuracy Score: {accuracy_score (y_test, lr_predict):.2f}')
print(f'AUC Score: {roc_auc_score (y_test, lr_predict):.2f}')
print(f'F1 Score: {f1_score(y_test,lr_predict):.2f}')

Accuracy Score: 0.88
AUC Score: 0.50
F1 Score: 0.00


In [13]:
#LogisticRegression give us 0.88 accuracy and 0.5 auc score

## Predict using KNN

In [14]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_predict= knn.predict(X_test)

In [15]:
print(f'Accuracy Score: {accuracy_score (y_test, knn_predict):.2f}')
print(f'AUC Score: {roc_auc_score (y_test, knn_predict):.2f}')
print(f'F1 Score: {f1_score(y_test,knn_predict):.2f}')

Accuracy Score: 0.89
AUC Score: 0.71
F1 Score: 0.51


In [16]:
#KNN give us 0.89 accuracy and 0.71 auc score

## Trying some FOLDS


In [17]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

In [18]:
folds = 5
reg = LogisticRegression()
cvAUC = cross_val_score(reg, X, y, cv=folds, scoring="roc_auc")
cvAcc = cross_val_score(reg, X, y, cv=folds, scoring="accuracy")

print(cvAUC)

[0.50513364 0.50125179 0.50485306 0.50235594 0.49995095]


In [19]:
print(cvAcc)

[0.87700397 0.87700397 0.87700397 0.87700397 0.87698413]
