# import library

In [332]:
import pandas as pd
import numpy as np
import sklearn

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
import plotly.offline as py 
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot



from sklearn.model_selection import train_test_split
init_notebook_mode(connected=True)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [333]:
# Load the application training data from the project's input directory.
train_csv_path = '../input/project/application_train2.csv'
train_df = pd.read_csv(train_csv_path)

In [334]:
# Show the freshly loaded DataFrame as the cell output.
train_df

# INSIGHT

In [335]:
# Donut chart of the TARGET class balance (one slice per distinct TARGET value).
cf.set_config_file(theme='polar')
contract_val = train_df['TARGET'].value_counts()
contract_df = pd.DataFrame({'labels': contract_val.index,
                            'values': contract_val.values})
contract_df.iplot(kind='pie', labels='labels', values='values',
                  title='LOAN REPAID OR NOT', hole=0.6)

## Distribution of total credit amount (AMT_CREDIT)

In [336]:
# Histogram of AMT_CREDIT, restricted to applications with credit below 2,000,000.
below_cap = train_df['AMT_CREDIT'] < 2000000
train_df.loc[below_cap, 'AMT_CREDIT'].iplot(
    kind='histogram',
    bins=100,
    xTitle='Total Credit',
    yTitle='Count of applicants',
    title='Distribution of AMT_CREDIT',
)

In [337]:
# Percentage of each TARGET value among applications with AMT_CREDIT above 1,000,000.
high_credit = train_df[train_df['AMT_CREDIT'] > 1000000]
high_credit['TARGET'].value_counts() / len(high_credit) * 100

## Distribution of loan types

In [338]:
# Donut chart of how many applications fall under each NAME_CONTRACT_TYPE value.
cf.set_config_file(theme='polar')
contract_val = train_df['NAME_CONTRACT_TYPE'].value_counts()
contract_df = pd.DataFrame(dict(labels=contract_val.index, values=contract_val.values))
contract_df.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='Types of Loan',
    hole=0.6,
)

## Distribution of type of suite, by whether the loan was repaid or not

In [339]:
# Grouped bar chart: share of applicants per NAME_TYPE_SUITE category, split by TARGET.
suite_col = train_df['NAME_TYPE_SUITE']
target_col = train_df['TARGET']
suite_val = suite_col.value_counts()  # missing categories are not counted
suite_total = suite_val.sum()

# Rows per category with TARGET == 1 / TARGET == 0, expressed as a percentage of
# all applicants that have a known NAME_TYPE_SUITE.
suite_pct_y1 = np.array(
    [((suite_col == val) & (target_col == 1)).sum() for val in suite_val.index]
) / suite_total * 100
suite_pct_y0 = np.array(
    [((suite_col == val) & (target_col == 0)).sum() for val in suite_val.index]
) / suite_total * 100

# NOTE(review): the legend maps TARGET == 1 to 'Yes' and TARGET == 0 to 'No' —
# confirm this matches the "repayed or not" wording of the title.
data = [go.Bar(x=suite_val.index, y=suite_pct_y1, name='Yes'),
        go.Bar(x=suite_val.index, y=suite_pct_y0, name='No')]
layout = go.Layout(
    title="Type of suite of Applicants in terms of loan is repayed or not  in %",
    xaxis=dict(title='Type of suite'),
    yaxis=dict(title='Count of applicants in %'),
)
fig = go.Figure(data=data, layout=layout)
fig.layout.template = 'plotly_dark'
py.iplot(fig)

## Distribution of income type

In [340]:
# Grouped bar chart: share of applicants per NAME_INCOME_TYPE category, split by TARGET.
income_val = train_df['NAME_INCOME_TYPE'].value_counts()

# Per-category row counts for TARGET == 1 and TARGET == 0, in value_counts() order.
income_val_y1 = [
    np.sum(train_df['TARGET'][train_df['NAME_INCOME_TYPE'] == kind] == 1)
    for kind in income_val.index
]
income_val_y0 = [
    np.sum(train_df['TARGET'][train_df['NAME_INCOME_TYPE'] == kind] == 0)
    for kind in income_val.index
]

# Dividing the lists by the numpy total yields arrays of percentages.
income_total = income_val.sum()
yes_bar = go.Bar(x=income_val.index, y=((income_val_y1 / income_total) * 100), name='Yes')
no_bar = go.Bar(x=income_val.index, y=((income_val_y0 / income_total) * 100), name='No')
data = [yes_bar, no_bar]

layout = go.Layout(
    title="Income sources of Applicants in terms of loan is repayed or not  in %",
    xaxis={'title': 'Income source'},
    yaxis={'title': 'Count of applicants in %'},
)
fig = go.Figure(data=data, layout=layout)
fig.layout.template = 'plotly_dark'
py.iplot(fig)

## distribusi tingkat pendidikan 

In [341]:
# Grouped bar chart: share of applicants per NAME_EDUCATION_TYPE category, split by TARGET.
education_val = train_df['NAME_EDUCATION_TYPE'].value_counts()

# Per-category row counts for TARGET == 1 and TARGET == 0, in value_counts() order.
education_val_y1 = [
    np.sum(train_df['TARGET'][train_df['NAME_EDUCATION_TYPE'] == level] == 1)
    for level in education_val.index
]
education_val_y0 = [
    np.sum(train_df['TARGET'][train_df['NAME_EDUCATION_TYPE'] == level] == 0)
    for level in education_val.index
]

# Dividing the lists by the numpy total yields arrays of percentages.
education_total = education_val.sum()
yes_bar = go.Bar(x=education_val.index, y=((education_val_y1 / education_total) * 100), name='Yes')
no_bar = go.Bar(x=education_val.index, y=((education_val_y0 / education_total) * 100), name='No')
data = [yes_bar, no_bar]

layout = go.Layout(
    title="Education sources of Applicants in terms of loan is repayed or not  in %",
    xaxis={'title': 'Education of Applicants'},
    yaxis={'title': 'Count of applicants in %'},
)
fig = go.Figure(data=data, layout=layout)
fig.layout.template = 'plotly_dark'
py.iplot(fig)

## distribusi status pernikahan 

In [342]:
# Grouped bar chart: share of applicants per NAME_FAMILY_STATUS category, split by TARGET.
family_val = train_df['NAME_FAMILY_STATUS'].value_counts()

# Per-category row counts for TARGET == 1 and TARGET == 0, in value_counts() order.
family_val_y1 = [
    np.sum(train_df['TARGET'][train_df['NAME_FAMILY_STATUS'] == status] == 1)
    for status in family_val.index
]
family_val_y0 = [
    np.sum(train_df['TARGET'][train_df['NAME_FAMILY_STATUS'] == status] == 0)
    for status in family_val.index
]

# Dividing the lists by the numpy total yields arrays of percentages.
family_total = family_val.sum()
yes_bar = go.Bar(x=family_val.index, y=((family_val_y1 / family_total) * 100), name='Yes')
no_bar = go.Bar(x=family_val.index, y=((family_val_y0 / family_total) * 100), name='No')
data = [yes_bar, no_bar]

layout = go.Layout(
    title="family sources of Applicants in terms of loan is repayed or not  in %",
    xaxis={'title': 'family of Applicants'},
    yaxis={'title': 'Count of applicants in %'},
)
fig = go.Figure(data=data, layout=layout)
fig.layout.template = 'plotly_dark'
py.iplot(fig)

# PREPROCESSING

## menghitung persen kolom kosong

In [344]:
# Per-column missing-value table: absolute count and percentage of rows, largest first.
null_counts = train_df.isnull().sum()
count = null_counts.sort_values(ascending=False)
percentage = (null_counts / len(train_df) * 100).sort_values(ascending=False)
missing_application = pd.concat(
    [count, percentage],
    axis=1,
    keys=['Count', 'Percentage'],
)
print('Count and percentage of missing values for top 20 columns:')
missing_application.head(20)

## menghapus kolom kosong diatas 60 persen

In [345]:
def missing_values(df, percentage):
    """Return ``df`` without the columns that are too sparse.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    percentage : float
        Threshold in percent. A column is dropped when its share of missing
        values is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the columns at or below the threshold.
    """
    null_share = df.isnull().sum() * 100 / len(df)
    too_sparse = [name for name, share in null_share.items() if share > percentage]
    return df.drop(too_sparse, axis=1)

In [346]:
# Drop every column whose share of missing values exceeds 60 %, then show the result.
missing_threshold_pct = 60
train_df = missing_values(train_df, missing_threshold_pct)
train_df

## mengisi kolom kosong dengan 0

In [347]:
# Replace every remaining NaN with 0 across all columns.
# NOTE(review): this also writes 0 into any non-numeric column that has gaps —
# confirm that is intended for the categorical features.
train_df = train_df.replace(np.nan, 0)
# Sanity check: per-column count of values that are still null.
train_df.isnull().sum()

## mengecek duplikasi data


In [348]:
# Look for rows that are identical in everything except the SK_ID_CURR identifier.
columns_without_id = [col for col in train_df.columns if col != 'SK_ID_CURR']
# keep=False marks every member of a duplicate group, not only the repeats.
# The mask is computed once and reused for the printed count.
duplicate_mask = train_df.duplicated(subset=columns_without_id, keep=False)
print('The no of duplicates in the data:', int(duplicate_mask.sum()))

# DATA PREPARATION

## melihat korelasi

In [363]:
# Find correlation with the target and sort
correlations = train_df.corr()['TARGET'].sort_values()

# Display correlations
print('Most Postive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

## feature selection 

In [349]:
# Target plus the features kept for modelling. Each column is listed exactly once
# so no feature ends up duplicated in the model matrix.
selected_columns = [
    'TARGET',
    'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2',
    'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'REG_CITY_NOT_WORK_CITY',
    'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE',
    'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
    'CODE_GENDER', 'NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE',
]
# .copy() makes `train` an independent frame, so the column assignments in the
# encoding cells do not operate on a view of train_df (SettingWithCopyWarning).
train = train_df[selected_columns].copy()

## ordinal encoder

In [350]:
# Encode NAME_EDUCATION_TYPE as ordinal codes 0-4; the position in this list is the code.
education_levels = [
    'Lower secondary',
    'Secondary / secondary special',
    'Incomplete higher',
    'Higher education',
    'Academic degree',
]
scale_mapper = {level: rank for rank, level in enumerate(education_levels)}

train['NAME_EDUCATION_TYPE'] = train['NAME_EDUCATION_TYPE'].replace(scale_mapper)

In [364]:
# List the distinct values left in NAME_EDUCATION_TYPE after the ordinal encoding.
education_codes = train['NAME_EDUCATION_TYPE']
education_codes.unique()

## one hot encoding

In [351]:
# One indicator frame per nominal feature; each column name is used as the dummy prefix.
CODE_GENDER_dummies, NAME_INCOME_TYPE_dummies = (
    pd.get_dummies(train[feature], prefix=feature)
    for feature in ('CODE_GENDER', 'NAME_INCOME_TYPE')
)

In [352]:
# Append the indicator columns, then remove the original categorical columns they replace.
encoded_parts = [train, CODE_GENDER_dummies, NAME_INCOME_TYPE_dummies]
train = pd.concat(encoded_parts, axis=1).drop(['CODE_GENDER', 'NAME_INCOME_TYPE'], axis=1)

In [353]:
# Preview the first rows of the encoded feature frame.
train.head()

In [354]:
# Print the column names, non-null counts and dtypes of the encoded feature frame.
train.info()

## scaler , train test split

In [365]:
# Split the frame into the label vector and the feature matrix.
# Columns are selected by name instead of by a fixed positional range, so every
# feature is kept no matter how many indicator columns one-hot encoding produced.
y = train['TARGET'].values
# An explicit float matrix is requested so mixed int/float/bool columns do not
# come back as an object array.
x = train.drop(columns='TARGET').to_numpy(dtype=float)

In [366]:
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(x, y)

In [367]:
sc = StandardScaler()
x_smote = sc.fit_transform(x_smote)

In [368]:
x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size=0.3 , random_state=10)

# MODELING

## logistic Regression

In [369]:
# Fit a default logistic regression on the training data and predict test labels.
# fit() returns the estimator itself, so construction and fitting can be chained.
logreg = LogisticRegression().fit(x_train, y_train)
prediction = logreg.predict(x_test)

In [370]:
# Fraction of test samples whose predicted label matches the true label.
akurasi = accuracy_score(y_true=y_test, y_pred=prediction)
akurasi

In [379]:
# ROC AUC measures how well the model ranks samples, so it needs a continuous
# score. The predicted probability of the positive class (column 1) is used
# instead of the hard 0/1 labels.
logreg_scores = logreg.predict_proba(x_test)[:, 1]
auch = roc_auc_score(y_test, logreg_scores)
print(auch)

In [384]:
# Text report of per-class precision, recall and F1 for the logistic regression.
klas = classification_report(y_true=y_test, y_pred=prediction)
print(klas)

In [385]:
# Summary of the logistic-regression metrics computed in the cells above.
for label, value in (('accuracy_score \t\t:', akurasi),
                     ('roc_auc_score \t\t:', auch)):
    print(label, value)
print('classification_report \t:\n', klas)


## Random Forest classifier

In [373]:
# Random forest with 100 trees. random_state fixes the bootstrap and feature
# sampling so the reported metrics are reproducible between runs; n_jobs=-1
# trains the trees in parallel without changing the result.
rfc = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs=-1)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

In [374]:
# Fraction of test samples the random forest labels correctly.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)

In [375]:
# ROC AUC needs a continuous score, so the forest's predicted probability of the
# positive class (column 1) is used instead of the hard 0/1 labels.
rfc_scores = rfc.predict_proba(x_test)[:, 1]
auc = roc_auc_score(y_test, rfc_scores)
print(auc)

In [386]:
# Text report of per-class precision, recall and F1 for the random forest.
klasifikasi = classification_report(y_true=y_test, y_pred=y_pred)
print(klasifikasi)

In [387]:
# Summary of the random-forest metrics computed in the cells above.
for label, value in (('accuracy_score \t\t:', score),
                     ('roc_auc_score \t\t:', auc)):
    print(label, value)
print('classification_report \t:\n', klasifikasi)