In [1]:
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
import warnings
warnings.simplefilter("ignore")
%matplotlib inline

matplotlib.rcParams['figure.figsize'] = [12, 12]
import seaborn as sns


In [2]:
data = pd.read_csv('credit_risk_features.csv')

In [3]:
data.head(10)

Unnamed: 0,SK_ID_CURR,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,BASEMENTAREA_AVG,ENTRANCES_AVG,FLAG_DOCUMENT_20,FONDKAPREMONT_MODE,CNT_CHILDREN,FLAG_DOCUMENT_15,FLAG_EMAIL,OWN_CAR_AGE
0,SK_ID_CURR,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,BASEMENTAREA_AVG,ENTRANCES_AVG,FLAG_DOCUMENT_20,FONDKAPREMONT_MODE,CNT_CHILDREN,FLAG_DOCUMENT_15,FLAG_EMAIL,OWN_CAR_AGE
1,100004,0,-815,,,0,,0,0,0,26
2,100006,0,-617,,,0,,0,0,0,
3,100012,0,-1673,,,0,,0,0,0,
4,100014,0,-844,,,0,,1,0,0,
5,100015,0,-2396,,,0,,0,0,0,
6,100017,0,-4,0.0973,0.1379,0,reg oper account,1,0,0,23
7,100018,0,-188,0.1335,0.1724,0,reg oper account,0,0,0,
8,100019,0,-925,,,0,,0,0,0,17
9,100030,0,0,0.0147,0.1379,0,reg oper account,0,0,0,


In [4]:
data.shape

(100001, 11)

In [5]:
data.describe()

Unnamed: 0,SK_ID_CURR,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,BASEMENTAREA_AVG,ENTRANCES_AVG,FLAG_DOCUMENT_20,FONDKAPREMONT_MODE,CNT_CHILDREN,FLAG_DOCUMENT_15,FLAG_EMAIL,OWN_CAR_AGE
count,100001,99654,100001,41556,49897.0,100001,31746,100001,100001,100001,34064
unique,100001,15,6774,5504,362.0,5,5,23,5,5,105
top,393214,0,0,0,0.1379,0,reg oper account,0,0,0,7
freq,1,59750,8073,3097,7214.0,65507,24003,45889,65451,61798,1592


In [6]:
data.dtypes

SK_ID_CURR                  object
DEF_60_CNT_SOCIAL_CIRCLE    object
DAYS_LAST_PHONE_CHANGE      object
BASEMENTAREA_AVG            object
ENTRANCES_AVG               object
FLAG_DOCUMENT_20            object
FONDKAPREMONT_MODE          object
CNT_CHILDREN                object
FLAG_DOCUMENT_15            object
FLAG_EMAIL                  object
OWN_CAR_AGE                 object
dtype: object

In [7]:
null_counts = data.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

FONDKAPREMONT_MODE          68255
OWN_CAR_AGE                 65937
BASEMENTAREA_AVG            58445
ENTRANCES_AVG               50104
DEF_60_CNT_SOCIAL_CIRCLE      347
dtype: int64

going to drop categorical column, and change objects to numerical dtype

In [8]:
## remove b/c only non numeric variable, makes model more simple 
data = data.drop('FONDKAPREMONT_MODE', 1)

In [9]:
## remove b/c dummy variable with missing values, makes model more simple 
data = data.drop('DEF_60_CNT_SOCIAL_CIRCLE', 1)

In [10]:
data.dtypes

SK_ID_CURR                object
DAYS_LAST_PHONE_CHANGE    object
BASEMENTAREA_AVG          object
ENTRANCES_AVG             object
FLAG_DOCUMENT_20          object
CNT_CHILDREN              object
FLAG_DOCUMENT_15          object
FLAG_EMAIL                object
OWN_CAR_AGE               object
dtype: object

In [11]:
data.DAYS_LAST_PHONE_CHANGE = pd.to_numeric(data.DAYS_LAST_PHONE_CHANGE, errors = "coerce")

In [12]:
data.BASEMENTAREA_AVG = pd.to_numeric(data.BASEMENTAREA_AVG, errors = "coerce")

In [13]:
data.ENTRANCES_AVG = pd.to_numeric(data.ENTRANCES_AVG, errors = "coerce")

In [14]:
data.FLAG_DOCUMENT_20 = pd.to_numeric(data.FLAG_DOCUMENT_20, errors = "coerce")

In [15]:
data.CNT_CHILDREN = pd.to_numeric(data.CNT_CHILDREN, errors = "coerce")

In [16]:
data.FLAG_DOCUMENT_15 = pd.to_numeric(data.FLAG_DOCUMENT_15, errors = "coerce")

In [17]:
data.FLAG_EMAIL = pd.to_numeric(data.FLAG_EMAIL, errors = "coerce")

In [18]:
data.OWN_CAR_AGE = pd.to_numeric(data.OWN_CAR_AGE, errors = "coerce")

In [19]:
data.dtypes

SK_ID_CURR                 object
DAYS_LAST_PHONE_CHANGE    float64
BASEMENTAREA_AVG          float64
ENTRANCES_AVG             float64
FLAG_DOCUMENT_20          float64
CNT_CHILDREN              float64
FLAG_DOCUMENT_15          float64
FLAG_EMAIL                float64
OWN_CAR_AGE               float64
dtype: object

In [20]:
data.head()

Unnamed: 0,SK_ID_CURR,DAYS_LAST_PHONE_CHANGE,BASEMENTAREA_AVG,ENTRANCES_AVG,FLAG_DOCUMENT_20,CNT_CHILDREN,FLAG_DOCUMENT_15,FLAG_EMAIL,OWN_CAR_AGE
0,SK_ID_CURR,,,,,,,,
1,100004,-815.0,,,0.0,0.0,0.0,0.0,26.0
2,100006,-617.0,,,0.0,0.0,0.0,0.0,
3,100012,-1673.0,,,0.0,0.0,0.0,0.0,
4,100014,-844.0,,,0.0,1.0,0.0,0.0,


In [21]:
data = data.iloc[1:]

data imputation

In [22]:
from sklearn import preprocessing

In [23]:
imputer = preprocessing.Imputer(strategy="mean")

In [24]:
numerical_imputed = imputer.fit_transform(data)

In [25]:
numerical_imputed

array([[ 1.00004000e+05, -8.15000000e+02,  8.86481506e-02, ...,
         0.00000000e+00,  0.00000000e+00,  2.60000000e+01],
       [ 1.00006000e+05, -6.17000000e+02,  8.86481506e-02, ...,
         0.00000000e+00,  0.00000000e+00,  1.20298564e+01],
       [ 1.00012000e+05, -1.67300000e+03,  8.86481506e-02, ...,
         0.00000000e+00,  0.00000000e+00,  1.20298564e+01],
       ...,
       [ 4.56249000e+05,  0.00000000e+00,  1.63800000e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.20298564e+01],
       [ 4.56252000e+05,  0.00000000e+00,  4.35000000e-02, ...,
         0.00000000e+00,  0.00000000e+00,  1.20298564e+01],
       [ 4.56253000e+05, -1.90900000e+03,  8.62000000e-02, ...,
         0.00000000e+00,  1.00000000e+00,  1.20298564e+01]])

In [26]:
numerical_imputed.shape

(100000, 9)

In [27]:
numerical_imputed_df = pd.DataFrame(numerical_imputed,
                                    index=data.index,
                                    columns=data.columns)
numerical_imputed_df['SK_ID_CURR'] = numerical_imputed_df.SK_ID_CURR.astype(int)

numerical_imputed_df.head(10)

Unnamed: 0,SK_ID_CURR,DAYS_LAST_PHONE_CHANGE,BASEMENTAREA_AVG,ENTRANCES_AVG,FLAG_DOCUMENT_20,CNT_CHILDREN,FLAG_DOCUMENT_15,FLAG_EMAIL,OWN_CAR_AGE
1,100004,-815.0,0.088648,0.14993,0.0,0.0,0.0,0.0,26.0
2,100006,-617.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856
3,100012,-1673.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856
4,100014,-844.0,0.088648,0.14993,0.0,1.0,0.0,0.0,12.029856
5,100015,-2396.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856
6,100017,-4.0,0.0973,0.1379,0.0,1.0,0.0,0.0,23.0
7,100018,-188.0,0.1335,0.1724,0.0,0.0,0.0,0.0,12.029856
8,100019,-925.0,0.088648,0.14993,0.0,0.0,0.0,0.0,17.0
9,100030,0.0,0.0147,0.1379,0.0,0.0,0.0,0.0,12.029856
10,100032,-2.0,0.088648,0.14993,0.0,1.0,0.0,0.0,12.029856


In [28]:
numerical_imputed_df[numerical_imputed_df.isnull().any(axis=1)].shape

(0, 9)

In [29]:
scaler = preprocessing.StandardScaler()
numerical_imputed_scaled_df = scaler.fit_transform(numerical_imputed_df)

In [30]:
scaler.mean_

array([ 2.78525175e+05, -9.64469220e+02,  8.86481506e-02,  1.49929706e-01,
        4.60000000e-04,  4.16790000e-01,  1.27000000e-03,  5.71100000e-02,
        1.20298564e+01])

In [31]:
numerical_imputed_scaled_df.mean(axis=0)

array([ 3.63797881e-17, -6.45883347e-17,  1.88791205e-16, -5.45696821e-17,
        4.12114787e-18,  4.65405492e-17, -1.31450406e-18, -2.85638180e-17,
        5.08961762e-16])

In [32]:
numerical_imputed_scaled_df.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [33]:
##load target variable
target_data = pd.read_csv('credit_risk_target.csv')

In [34]:
target_data.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,SK_ID_CURR,TARGET
1,100004,
2,100006,0
3,100012,
4,100014,0


In [35]:
numerical_imputed_df['TARGET'] = target_data['TARGET']

In [36]:
numerical_imputed_df = numerical_imputed_df.set_index('SK_ID_CURR')

In [37]:
numerical_imputed_df.head()

Unnamed: 0_level_0,DAYS_LAST_PHONE_CHANGE,BASEMENTAREA_AVG,ENTRANCES_AVG,FLAG_DOCUMENT_20,CNT_CHILDREN,FLAG_DOCUMENT_15,FLAG_EMAIL,OWN_CAR_AGE,TARGET
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100004,-815.0,0.088648,0.14993,0.0,0.0,0.0,0.0,26.0,
100006,-617.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,0.0
100012,-1673.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,
100014,-844.0,0.088648,0.14993,0.0,1.0,0.0,0.0,12.029856,0.0
100015,-2396.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,0.0


In [38]:
test_ids = target_data[target_data.TARGET.isnull()].SK_ID_CURR.astype(int).values

In [39]:
train_data = numerical_imputed_df[numerical_imputed_df.TARGET.notnull()]

In [40]:
test_data = numerical_imputed_df.loc[test_ids]

In [41]:
train_data.head()

Unnamed: 0_level_0,DAYS_LAST_PHONE_CHANGE,BASEMENTAREA_AVG,ENTRANCES_AVG,FLAG_DOCUMENT_20,CNT_CHILDREN,FLAG_DOCUMENT_15,FLAG_EMAIL,OWN_CAR_AGE,TARGET
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100006,-617.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,0
100014,-844.0,0.088648,0.14993,0.0,1.0,0.0,0.0,12.029856,0
100015,-2396.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,0
100017,-4.0,0.0973,0.1379,0.0,1.0,0.0,0.0,23.0,0
100018,-188.0,0.1335,0.1724,0.0,0.0,0.0,0.0,12.029856,0


In [43]:
test_data.head(38)

Unnamed: 0_level_0,DAYS_LAST_PHONE_CHANGE,BASEMENTAREA_AVG,ENTRANCES_AVG,FLAG_DOCUMENT_20,CNT_CHILDREN,FLAG_DOCUMENT_15,FLAG_EMAIL,OWN_CAR_AGE,TARGET
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100004,-815.0,0.088648,0.14993,0.0,0.0,0.0,0.0,26.0,
100012,-1673.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,
100034,-599.0,0.0838,0.1379,0.0,0.0,0.0,0.0,12.029856,
100043,-2411.0,0.088648,0.14993,0.0,2.0,0.0,0.0,12.029856,
100055,-784.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,
100058,0.0,0.0442,0.0552,0.0,0.0,0.0,0.0,12.029856,
100063,-1547.0,0.066,0.0345,0.0,0.0,0.0,1.0,12.029856,
100088,-1537.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,
100112,-591.0,0.088648,0.14993,0.0,0.0,0.0,0.0,2.0,
100126,-1091.0,0.0309,0.069,0.0,2.0,0.0,0.0,12.029856,


In [44]:
train_data.TARGET.value_counts(True)

0    0.92012
1    0.07988
Name: TARGET, dtype: float64

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
independent_variables = ['DAYS_LAST_PHONE_CHANGE', 'BASEMENTAREA_AVG', 'ENTRANCES_AVG', 'FLAG_DOCUMENT_20', 'CNT_CHILDREN', 'FLAG_DOCUMENT_15', 'FLAG_EMAIL', 'OWN_CAR_AGE']

In [47]:
target = ['TARGET']

In [48]:
X = train_data[independent_variables]
y = train_data[target]

In [49]:
## creating model with training data 
clf = LogisticRegression()
clf.fit(X,y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
test_data['TARGET_PRED'] = clf.predict(test_data[independent_variables])
test_data.head()

Unnamed: 0_level_0,DAYS_LAST_PHONE_CHANGE,BASEMENTAREA_AVG,ENTRANCES_AVG,FLAG_DOCUMENT_20,CNT_CHILDREN,FLAG_DOCUMENT_15,FLAG_EMAIL,OWN_CAR_AGE,TARGET,TARGET_PRED
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100004,-815.0,0.088648,0.14993,0.0,0.0,0.0,0.0,26.0,,0
100012,-1673.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,,0
100034,-599.0,0.0838,0.1379,0.0,0.0,0.0,0.0,12.029856,,0
100043,-2411.0,0.088648,0.14993,0.0,2.0,0.0,0.0,12.029856,,0
100055,-784.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,,0


In [51]:
prediction_probabilities = clf.predict_proba(test_data[independent_variables])
prediction_probabilities[:10]

array([[0.91231362, 0.08768638],
       [0.92807136, 0.07192864],
       [0.91887552, 0.08112448],
       [0.92619633, 0.07380367],
       [0.92084745, 0.07915255],
       [0.91054514, 0.08945486],
       [0.92936257, 0.07063743],
       [0.92700703, 0.07299297],
       [0.92513345, 0.07486655],
       [0.91179452, 0.08820548]])

In [52]:
prediction_probabilities[:,0]

array([0.91231362, 0.92807136, 0.91887552, ..., 0.92020175, 0.92289217,
       0.91189819])

In [53]:
test_data['pred_proba_default'] = prediction_probabilities[:,0]

In [54]:
test_data.head()

Unnamed: 0_level_0,DAYS_LAST_PHONE_CHANGE,BASEMENTAREA_AVG,ENTRANCES_AVG,FLAG_DOCUMENT_20,CNT_CHILDREN,FLAG_DOCUMENT_15,FLAG_EMAIL,OWN_CAR_AGE,TARGET,TARGET_PRED,pred_proba_default
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100004,-815.0,0.088648,0.14993,0.0,0.0,0.0,0.0,26.0,,0,0.912314
100012,-1673.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,,0,0.928071
100034,-599.0,0.0838,0.1379,0.0,0.0,0.0,0.0,12.029856,,0,0.918876
100043,-2411.0,0.088648,0.14993,0.0,2.0,0.0,0.0,12.029856,,0,0.926196
100055,-784.0,0.088648,0.14993,0.0,0.0,0.0,0.0,12.029856,,0,0.920847


In [55]:
submission_1 = test_data[['SK_ID_CURR', 'pred_proba_default']]
submission_1 = submission_1.rename(columns={'pred_proba_default': 'TARGET'})
submission_1.head()

KeyError: "['SK_ID_CURR'] not in index"

In [None]:
submission_1.to_csv('submission_1.csv', index = False)

In [None]:
submission_1.set_index('SK_ID_CURR', inplace=True)
submission_1.to_csv('test1.csv', index = True)

In [None]:
submission_1.isna().any()

In [None]:
work = pd.read_csv('submission_2.csv')
nowork = pd.read_csv('submission_1.csv')

In [None]:
nowork.SK_ID_CURR == work.SK_ID_CURR

In [None]:
good = target_data[target_data.TARGET.isnull()].SK_ID_CURR.astype(int)

In [None]:
set(good) - set(work.SK_ID_CURR)