In [167]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [168]:
DATA_DIR = '/kaggle/input/sf-dst-scoring/sf-dst-scoring (3)/'
# strptime pattern of the `app_date` column: day, abbreviated month name, four-digit year.
APP_DATE_FORMAT = "%d%b%Y"


def custom_date_parser(x):
    """Parse a single `app_date` string into a `datetime` using APP_DATE_FORMAT."""
    return datetime.strptime(x, APP_DATE_FORMAT)


def read_scoring_csv(file_name):
    """Read `file_name` from DATA_DIR and convert its `app_date` column to datetime.

    The conversion is done after loading with `pd.to_datetime(..., format=...)`
    instead of `read_csv(parse_dates=..., date_parser=...)`: the `date_parser`
    argument is deprecated since pandas 2.0, while `to_datetime` with an explicit
    format behaves the same on old and new pandas versions.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside DATA_DIR.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with `app_date` parsed according to APP_DATE_FORMAT.
    """
    # os.path.join avoids the doubled '/' that DATA_DIR + '/name.csv' produced.
    frame = pd.read_csv(os.path.join(DATA_DIR, file_name))
    frame['app_date'] = pd.to_datetime(frame['app_date'], format=APP_DATE_FORMAT)
    return frame


train = read_scoring_csv('train.csv')
test = read_scoring_csv('test.csv')
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [169]:
# Print the column names, dtypes and non-null counts of the sample submission frame.
sample_submission.info()

In [170]:
def clean_data(df):
    """Clean and enrich a scoring frame in place.

    Steps, applied directly to `df`:
      * missing `education` values become the label 'Unknown';
      * the count / address / income columns listed below are cast to int;
      * `month` is added from the month of `app_date`;
      * `age<30` is added as a boolean flag for ages below 30;
      * `default` is cast to int when the column is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; `app_date` must already be a datetime column.

    Returns
    -------
    None
        The frame is mutated; nothing is returned.
    """
    df['education'] = df['education'].fillna('Unknown')

    integer_columns = (
        'age',
        'decline_app_cnt',
        'good_work',
        'bki_request_cnt',
        'home_address',
        'work_address',
        'income',
    )
    for name in integer_columns:
        df[name] = df[name].astype(int)

    # Derived features: new columns are appended in this order.
    df['month'] = df['app_date'].dt.month
    df['age<30'] = df['age'].lt(30)

    # Only frames that carry the target column get it converted.
    if 'default' in df.columns:
        df['default'] = df['default'].astype(int)

# Apply the same in-place cleaning to the training frame and then the test frame.
for frame in (train, test):
    clean_data(frame)

In [171]:
# Column groups used throughout the notebook.
bin_cols = ['sex', 'car', 'car_type', 'foreign_passport', 'good_work']
cat_cols = ['education', 'home_address', 'work_address', 'month']
# Numeric feature columns only.
num_cols_2 = ['age', 'decline_app_cnt', 'bki_request_cnt', 'income', 'sna', 'first_time', 'score_bki', 'region_rating', 'age<30']
# Same numeric columns followed by the target column 'default'.
num_cols = num_cols_2 + ['default']

# One histogram per column of num_cols; only strictly positive, non-null values are drawn.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version.
for column_name in num_cols:
    positive_values = train.loc[train[column_name] > 0, column_name].dropna()
    plt.figure()
    sns.distplot(positive_values, kde=False, rug=False)
    plt.title(column_name)
    plt.show()

In [172]:
# Absolute pairwise correlations of the num_cols columns, drawn on a fixed 0..1 colour scale.
abs_corr = train[num_cols].corr().abs()
sns.heatmap(abs_corr, vmin=0, vmax=1)

`sna` and `first_time` are strongly correlated.

In [173]:
# ANOVA F-statistic of every numeric feature against the target 'default'.
# The features come from num_cols_2, which leaves the target itself out: scoring
# 'default' against 'default' has zero within-class variance, so its F-value is
# degenerate and would swamp every real feature in the chart.
f_scores = f_classif(train[num_cols_2], train['default'])[0]
# Sorted ascending so the strongest feature ends up at the top of the horizontal bar chart.
imp_num = pd.Series(f_scores, index=num_cols_2).sort_values()
imp_num.plot(kind = 'barh')

In [174]:
# A single encoder instance that is refit for every column it is applied to.
label_encoder = LabelEncoder()

# Replace each binary column of the training frame with its integer codes.
for bin_col in bin_cols:
    encoded_values = label_encoder.fit_transform(train[bin_col])
    train[bin_col] = encoded_values

In [175]:
# Integer-encode the binary columns, then the categorical columns, of the test frame.
# NOTE(review): the encoder is refit on the test values for each column, so the codes
# agree with the training frame only if both frames hold the same set of values - confirm.
for column in bin_cols + cat_cols:
    test[column] = label_encoder.fit_transform(test[column])

In [176]:
# Integer-encode the categorical columns of the training frame, refitting the encoder per column.
for cat_col in cat_cols:
    encoded_values = label_encoder.fit_transform(train[cat_col])
    train[cat_col] = encoded_values

In [177]:
# Preview the first rows of the training frame after cleaning and label encoding.
train.head()

In [178]:
# Mutual information between each binary/categorical feature and the target;
# every feature is declared discrete.
discrete_cols = bin_cols + cat_cols
mi_scores = mutual_info_classif(
    train[discrete_cols],
    train['default'],
    discrete_features=True,
)
# Ascending order puts the most informative feature at the top of the bar chart.
imp_cat = pd.Series(mi_scores, index=discrete_cols).sort_values()
imp_cat.plot(kind = 'barh')

In [179]:
# One-hot encode the categorical columns with a single encoder fitted on the
# training frame and reused for the test frame. Fitting a second encoder on the
# test frame could produce a different number or order of columns whenever the two
# frames do not contain exactly the same category codes, which would silently
# misalign the model's features.
# handle_unknown='ignore': a code seen only in the test frame becomes an all-zero
# group instead of raising.
# The default sparse output is densified with .toarray() rather than passing
# sparse=False, because that keyword was renamed to sparse_output in newer
# scikit-learn releases.
cat_encoder = OneHotEncoder(handle_unknown='ignore').fit(train[cat_cols].values)
X_cat = cat_encoder.transform(train[cat_cols].values).toarray()
X_cat_test = cat_encoder.transform(test[cat_cols].values).toarray()

In [180]:
# Standardise the num_cols columns of the training frame.
# NOTE: num_cols contains 'default', so the scaled target is one of the columns
# of X_num and therefore of X.
numeric_scaler = StandardScaler()
X_num = numeric_scaler.fit_transform(train[num_cols].values)

# Column layout of X: scaled numeric | encoded binary | one-hot categorical.
X = np.hstack((X_num, train[bin_cols].values, X_cat))
# Target vector.
Y = train['default'].values

In [181]:
# Standardise the numeric test features with the mean/std of the TRAINING frame.
# Fitting a separate scaler on the test frame would put the test features on a
# different scale from the one the model is trained on. StandardScaler works
# column by column, so fitting on train[num_cols_2] gives the same statistics for
# these columns as any scaler fitted on a wider selection of training columns.
test_scaler = StandardScaler().fit(train[num_cols_2].values)
X_num_test = test_scaler.transform(test[num_cols_2].values)
# Column layout: scaled numeric | encoded binary | one-hot categorical.
X_test_1 = np.hstack([X_num_test, test[bin_cols].values, X_cat_test])

In [182]:
# X starts with the scaled num_cols columns, and num_cols includes the target
# 'default'. Slicing with X[:, :-1] removed the last one-hot column instead, which
# left the scaled target inside the features (target leakage) and shifted the
# training columns relative to X_test_1. Remove the target column by its position.
target_idx = num_cols.index('default')
X_features = np.delete(X, target_idx, axis=1)

# The model is later applied to X_test_1, so both matrices need the same width.
assert X_features.shape[1] == X_test_1.shape[1], (
    "train and test feature matrices have different numbers of columns"
)

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_features, Y, test_size=0.20, random_state=42)
clf = LogisticRegression().fit(X_train, y_train)

In [183]:
# Number of feature columns in the training split.
X_train.shape[1]

In [184]:
# Number of feature columns in the test feature matrix.
X_test_1.shape[1]

In [185]:
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, roc_curve, roc_auc_score, confusion_matrix
# Fit a logistic regression on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Probability of the positive class (second column of predict_proba) on the hold-out split.
probs = model.predict_proba(X_test)[:, 1]

# ROC curve points and the area under the curve.
fpr, tpr, threshold = roc_curve(y_test, probs)
roc_auc = roc_auc_score(y_test, probs)

# ROC curve against a dashed diagonal baseline.
fig, ax = plt.subplots()
ax.plot([0, 1], label='Baseline', linestyle='--')
ax.plot(fpr, tpr, label='Regression')
ax.set_title('Logistic Regression ROC AUC = %0.3f' % roc_auc)
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.legend(loc='lower right')
plt.show()

In [186]:
# Preview the test frame without its application-date column (the frame itself is not modified).
test.drop(columns=['app_date']).head()

In [187]:
# Positive-class probability for every row of the test feature matrix.
test_probs = model.predict_proba(X_test_1)[:, 1]
pd.Series(test_probs)

In [188]:
# Number of client ids in the test frame.
test['client_id'].size

In [189]:
# Build the submission frame: one row per test client with the predicted
# probability of the positive class.
res = test[['client_id']].copy()
# Assign the raw array so values are matched to rows by position; wrapping it in
# a pd.Series would align on index labels and yield NaN for any label that does
# not match.
res['default'] = model.predict_proba(X_test_1)[:, 1]
print(res.head())
# index=False: by default to_csv writes the row index as an extra unnamed first
# column. The file should contain only client_id and default - presumably the
# layout of sample_submission.csv; confirm against that file.
res.to_csv('res.csv', index=False)