In [1]:

import os

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
# `plt` is the conventional alias for the pyplot interface. Aliasing the
# top-level `matplotlib` package as `plt` leaves `plt.figure(...)` and
# `plt.title(...)` unusable as plotting calls.
import matplotlib.pyplot as plt


# Print the full path of every file found under the Kaggle input directory,
# so the available data files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [2]:
# Load the application table into the frame every later cell works on.
app_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/credit-card/application_data.csv"
)

In [3]:
# Fraction of missing values per column, ordered from most to least missing.
missing_fractions = (
    app_df.isnull()
    .mean()
    .sort_values(ascending=False)
)
# Show the ten columns with the largest share of missing values.
missing_fractions.head(n=10)

# 1. Drop features

# Limit the Feature Space
The full dataset has 122 features for each loan. We'll select features in two steps:

1. Drop features with more than 30% of their data missing.
2. Of the remaining features, choose only those that would be available to an investor before deciding to fund the loan.

In [4]:
# Plotting stack used by the figure cells further down.
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Order matters here: the matplotlib 'ggplot' style is applied first and
# sns.set(...) runs afterwards, so wherever both touch the same setting the
# seaborn 'whitegrid' value is the one left in effect.
mpl.style.use('ggplot')
sns.set(style='whitegrid')

# 1.1 Drop features missing more than 30% of their data

In [5]:
# Names of the columns whose missing-value fraction exceeds 30%, in
# alphabetical order.
too_sparse = missing_fractions > 0.3
drop_list = sorted(missing_fractions.index[too_sparse])
# Remove those columns from the shared frame in place.
# NOTE: not re-runnable as-is -- a second run raises KeyError because the
# columns listed in drop_list are already gone from app_df.
app_df.drop(columns=drop_list, inplace=True)

# Columns of choice

In [6]:
# Column the author marks as useless for modelling (presumably just a row
# identifier -- confirm against the data dictionary). Dropped in place.
app_df.drop(columns=["SK_ID_CURR"], inplace=True)

# Pearson correlation matrix

In [7]:
# Indicator (dummy-variable) columns recording whether the applicant
# provided ...: six contact-related flags followed by FLAG_DOCUMENT_2
# through FLAG_DOCUMENT_21.
Flag = [
    'FLAG_MOBIL',
    'FLAG_EMP_PHONE',
    'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE',
    'FLAG_PHONE',
    'FLAG_EMAIL',
    *(f'FLAG_DOCUMENT_{doc_number}' for doc_number in range(2, 22)),
]

In [8]:
# Copy of the shared frame without the indicator columns listed in Flag;
# app_df itself is left untouched.
app_df_2 = app_df.drop(columns=Flag)

In [9]:
# Heatmap of the pairwise Pearson correlations between the columns of app_df_2.
sns.set(style="whitegrid", font_scale=1)
plt.figure(figsize=(12,12))
plt.title('Pearson Correlation Matrix',fontsize=25)
# app_df_2 still contains string-valued columns. Since pandas 2.0,
# DataFrame.corr() no longer skips them silently and raises instead, so the
# numeric/boolean columns are selected explicitly (this works on old and new
# pandas alike).
numeric_features = app_df_2.select_dtypes(include=["number", "bool"])
# Trailing semicolon keeps the Axes repr out of the cell output.
sns.heatmap(numeric_features.corr(),linewidths=0.25,vmax=0.7,square=True,cmap="GnBu",linecolor='w',
            annot=False, cbar_kws={"shrink": .7});

In [10]:
# Columns selected for removal after inspecting the correlation heatmap.
drop_list_2 = [
    'AMT_ANNUITY',
    'AMT_GOODS_PRICE',
    'REGION_RATING_CLIENT',
]

In [11]:
# Remove the columns named in drop_list_2 from the shared frame, in place.
app_df.drop(columns=drop_list_2, inplace=True)

# For the linear model only. Other team members: please delete this part (2. Multicollinearity) from your code!
# 2. Multicollinearity 
Although highly correlated features (multicollinearity) are not a problem for machine learning models based on decision trees (as used here), such features dilute each other's importance scores and can make feature analysis more difficult. Therefore, I calculate feature correlations and remove the features with very high correlation coefficients before applying machine learning.

In [12]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [13]:
# Positional indices of the columns of app_df that hold numeric (or boolean)
# data, i.e. the columns that are not string/object input.
#
# The decision is made from each column's dtype rather than from the Python
# type of one sampled cell: a string column whose sampled cell happens to be
# NaN (a float) would otherwise be misclassified as numeric, and a frame with
# fewer than two rows would raise IndexError.
value_column = [
    position
    for position, column_dtype in enumerate(app_df.dtypes)
    if pd.api.types.is_numeric_dtype(column_dtype)
]

In [14]:
# Frame for the linear-model branch: only the columns whose positions were
# collected in value_column.
app_df_3 = app_df.iloc[:, value_column]

In [15]:
# Build a table listing every feature variable next to its variance
# inflation factor (VIF), highest VIF first.

# Predictors only: TARGET is removed before the VIFs are computed.
X = app_df_3.drop(labels=['TARGET'], axis=1)
# Author's alternative -- use the following line if you don't want to see the warning
#X=app_df_3.drop(labels=['FLAG_DOCUMENT_2'],axis=1).drop(labels=['TARGET'],axis=1)
# Rows containing any missing value are discarded.
X = X.dropna()

# NOTE(review): no intercept/constant column is added to X before the VIFs
# are computed -- confirm that this is intended.
design_matrix = X.values
vif_scores = [
    variance_inflation_factor(design_matrix, column_position)
    for column_position in range(design_matrix.shape[1])
]

vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_scores})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [16]:
# Remove three columns from the linear-model frame.
# NOTE: re-running this cell raises KeyError, because app_df_3 is replaced by
# a frame in which these columns no longer exist.
app_df_3 = app_df_3.drop(
    columns=['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'OBS_60_CNT_SOCIAL_CIRCLE']
)

# 3. Your code

In [17]:
# One-hot encode the shared frame. Please be aware: one-hot encoding is not
# necessary for some models!
app_df = pd.get_dummies(data=app_df)

In [18]:
# app_df (one-hot encoded above) is the shared frame for everyone;
# app_df_3 (the non-string columns, minus the ones dropped after the VIF
# check) is for the linear model only.

# Cross-validation and parameter tuning
# Metrics needed
`precision_score`, `recall_score`, `f1_score`, `roc_auc_score`, `accuracy_score`, `classification_report` (all available in `sklearn.metrics`)