In [110]:
# Import Usual Modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Import modules to make our pipeline
## Preprocessing functions
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preprocessing
import sklearn.impute as impute
from sklearn.compose import ColumnTransformer

## Pipeline functions
from sklearn.pipeline import Pipeline, FeatureUnion

## Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_validate, GridSearchCV

## Metrics
from sklearn.metrics import roc_curve, confusion_matrix, auc, precision_recall_curve, accuracy_score


#Plotting
import matplotlib.pyplot as plt

#Set output display options
np.set_printoptions(linewidth=132)
np.set_printoptions(precision=4)

In [24]:
# Check credit risk assessment.csv in listdir
os.listdir(".")

['Credit risk assessment.csv',
 '.DS_Store',
 'Untitled.ipynb',
 'config.ipynb',
 '.ipynb_checkpoints',
 'src']

In [51]:
raw_data = pd.read_csv('Credit risk assessment.csv', low_memory = False)

In [43]:
raw_data.shape

(1000, 13)

In [419]:
preds = ['age','gender', 'job_cat', 'housing','savings','checking', 'loan_amount', 'duration','purpose', 'customer_loyalty']
raw_data.dtypes
col_types = {'index':np.int64, 
             'age':np.float64,
             'gender': 'category',
             'job_cat': 'category',
             'housing': 'category',
             'savings': 'category',
             'checking': 'category',
             'loan_amount': np.float64,
             'duration': np.int64,
             'purpose': 'category',
             'risk': 'category',
             'customer_loyalty':np.float64}

In [420]:
raw_data = raw_data.astype(col_types)

In [421]:
# add check for raw_data.dtypes == col_types
raw_data.dtypes

Unnamed: 0             int64
index                  int64
age                  float64
gender              category
job_cat             category
housing             category
savings             category
checking            category
loan_amount          float64
duration               int64
purpose             category
risk                category
customer_loyalty     float64
dtype: object

In [422]:
NAN = [(c, raw_data[c].isna().mean()*100) for c in raw_data]


In [423]:
NAN = pd.DataFrame(NAN, columns = ["column_name", "percentage"])

In [424]:
NAN.sort_values("percentage", ascending = False)

Unnamed: 0,column_name,percentage
7,checking,39.4
6,savings,18.3
8,loan_amount,4.1
2,age,1.3
0,Unnamed: 0,0.0
1,index,0.0
3,gender,0.0
4,job_cat,0.0
5,housing,0.0
9,duration,0.0


In [425]:
missing_counts = object_cols.isnull().sum()

In [426]:
missing_counts

gender        0
job_cat       0
housing       0
savings     183
checking    394
purpose       0
risk          0
dtype: int64

In [446]:
#split data
X_train, X_test, y_train, y_test = train_test_split(raw_data, raw_data['risk'], test_size = 0.3, random_state = 36)

In [447]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=36)

In [448]:
print(X_train.shape)
print(X_test.shape)
print(X_valid.shape)
print(y_train.shape)
print(y_test.shape)
print(y_valid.shape)

(525, 13)
(300, 13)
(175, 13)
(525,)
(300,)
(175,)


In [449]:
X_object_cols = X_train.select_dtypes(include=['category'])
X_numeric_cols = X_train.drop(object_cols.columns, axis = 1)

In [461]:
preprocessor = ColumnTransformer(
    transformers =[
        ('impute', impute.SimpleImputer(missing_values= np.nan, strategy = 'mean') , list(X_numeric_cols.columns)),
        ('standardize', StandardScaler(), list(X_numeric_cols.columns))
        #('imp_cat', impute.SimpleImputer(strategy = 'most_frequent'), list(X_object_cols.columns)),
        #('ohe', OneHotEncoder(categories = 'auto'), list(X_object_cols.columns))
    ], 
remainder = 'drop', n_jobs = -1)

In [462]:
X_train.dtypes

Unnamed: 0             int64
index                  int64
age                  float64
gender              category
job_cat             category
housing             category
savings             category
checking            category
loan_amount          float64
duration               int64
purpose             category
risk                category
customer_loyalty     float64
dtype: object

In [463]:
preprocessor.fit_transform(X_train)

array([[ 5.5300e+02,  5.5300e+02,  2.7000e+01, ..., -4.8681e-01, -7.2443e-01, -7.3901e-01],
       [ 7.8500e+02,  7.8500e+02,  3.5000e+01, ..., -5.0685e-01, -2.2619e-01,  1.6738e-01],
       [ 3.3500e+02,  3.3500e+02,  4.4000e+01, ...,  2.8760e-02, -1.2227e+00,  7.7251e-01],
       ...,
       [ 1.7200e+02,  1.7200e+02,  3.4000e+01, ..., -4.6120e-01,  2.7206e-01, -4.6447e-02],
       [ 1.2900e+02,  1.2900e+02,  2.9000e+01, ...,  7.1446e-02, -7.2443e-01, -6.7880e-01],
       [ 4.0400e+02,  4.0400e+02,  4.3000e+01, ..., -1.5126e-01, -2.2619e-01,  6.6615e-01]])

In [477]:
cat_imputed = X_object_cols.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [478]:
onehot = OneHotEncoder(sparse = False)

In [479]:
X_cat_1 = onehot.fit_transform(cat_imputed)

In [None]:
X_train

In [412]:
def rmse(x, y): return math.sqrt(((x-y)**2).mean())

In [413]:
def print_score(mdl):
    res = [rmse(mdl.predict(X_train), y_train),
           rmse(mdl.predict(X_valid), y_valid),
           mdl.score(X_train, y_train), mdl.score(X_valid, y_valid)]
    if hasattr(mdl, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [415]:
X_train

Unnamed: 0.1,Unnamed: 0,index,age,loan_amount,duration,customer_loyalty,gender,job_cat,housing,savings,checking,purpose,risk
553,553,553,27.0,1995.0,12,0.410886,male,2,own,moderate,moderate,car,good
785,785,785,35.0,1941.0,18,0.552889,male,1,own,high,moderate,business,good
335,335,335,44.0,3384.0,6,0.647695,male,3,rent,low,low,furniture/equipment,bad
634,634,634,25.0,1355.0,24,0.363950,female,1,own,low,moderate,car,bad
95,95,95,58.0,15945.0,54,0.844425,male,2,rent,low,moderate,business,bad
301,301,301,42.0,3804.0,36,0.566329,female,2,own,low,moderate,radio/TV,bad
881,881,881,48.0,9277.0,24,0.669453,male,2,free,low,low,car,good
986,986,986,33.0,6289.0,42,0.509578,male,2,own,low,high,business,good
119,119,119,36.0,2366.0,12,0.504766,male,3,own,very_high,moderate,car,good
456,456,456,36.0,3905.0,11,0.497251,male,2,rent,low,low,car,good


In [414]:
mdl = RandomForestClassifier(n_jobs = -1)
mdl.fit(X_train, y_train)
#print_score(mdl)



ValueError: could not convert string to float: 'male'

In [176]:
X_train.dtypes

Unnamed: 0            int64
index                 int64
age                 float64
gender               object
job_cat              object
housing              object
savings              object
checking             object
loan_amount         float64
duration              int64
purpose              object
risk                 object
customer_loyalty    float64
dtype: object

In [178]:
X_train.gender.value_counts()

male       261
female     125
unknown      7
Name: gender, dtype: int64

In [None]:
#pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

#X_digits, y_digits = datasets.load_digits(return_X_y=True)

# Parameters of pipelines can be set using ‘__’ separated parameter names:
#param_grid = {
    #'pca__n_components': [5, 15, 30, 45, 64],
#    'clf__C': np.logspace(-4, 4, 4)
#}
#search = GridSearchCV(lasso_pipeline, param_grid, n_jobs=-1)
#search.fit(X_train, y_train)