In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from numpy import mean
from numpy import std
from sklearn import metrics 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [2]:
def _encode_shared_categories(df_train, df_test, columns):
    """Replace each listed column, in both frames, by integer category codes.

    The category list of a column is built from the values observed in the
    train AND test frames together, so a given label always maps to the same
    code in both frames. Encoding each frame on its own (as was done before)
    yields different codes for the same label as soon as one frame contains a
    value the other lacks.

    Categories keep pandas' default ordering from ``astype("category")``.
    Missing values get the code -1 (the ``cat.codes`` convention), so they no
    longer show up as nulls afterwards.

    Both frames are modified in place.
    """
    for column in columns:
        observed = pd.concat([df_train[column], df_test[column]], ignore_index=True)
        shared_dtype = pd.CategoricalDtype(observed.astype("category").cat.categories)
        df_train[column] = df_train[column].astype(shared_dtype).cat.codes
        df_test[column] = df_test[column].astype(shared_dtype).cat.codes


def _print_null_counts(df):
    """Print ``<column> <null count>`` for every column of ``df`` holding a null."""
    null_counts = df.isnull().sum()
    for column, count in null_counts[null_counts > 0].items():
        print(column, count)


def init():
    """Load the application train/test CSV files and encode categorical columns.

    Reads ``application_train.csv`` and ``application_test.csv`` from the
    current working directory, integer-encodes a fixed list of categorical
    columns consistently across both frames, then prints the per-column null
    counts (test frame first, then train frame).

    Returns:
        tuple: ``(df_train, df_test)`` as pandas DataFrames with the listed
        categorical columns replaced by integer codes (-1 for missing values).
    """
    #load file
    df_train = pd.read_csv("application_train.csv")
    df_test = pd.read_csv("application_test.csv")

    #set categories
    categorical_columns = [
        "CODE_GENDER",
        "NAME_CONTRACT_TYPE",
        "FLAG_OWN_CAR",
        "FLAG_OWN_REALTY",
        "NAME_TYPE_SUITE",
        "NAME_EDUCATION_TYPE",
        "NAME_FAMILY_STATUS",
        "NAME_HOUSING_TYPE",
        "OCCUPATION_TYPE",
    ]
    _encode_shared_categories(df_train, df_test, categorical_columns)

    #display nan / none in test / train
    # The report is computed after encoding, so the encoded columns (whose
    # missing values became -1) are not listed here.
    print("Number of null / column :")
    _print_null_counts(df_test)
    _print_null_counts(df_train)

    return df_train, df_test

In [3]:
def matrice_corr(df_train,df_test):
    """Print the correlations with ``TARGET`` and draw a correlation heatmap.

    First prints the 10 highest and 5 lowest correlations between ``TARGET``
    and every numeric column of ``df_train``. Then computes the pairwise
    correlation matrix of a fixed subset of columns and shows it as an
    annotated seaborn heatmap.

    Args:
        df_train: training DataFrame; must contain ``TARGET`` and the heatmap
            columns listed below, all of them already numeric.
        df_test: not used; kept so existing callers keep working.

    Returns:
        None. Output is printed and plotted.
    """
    #prepare correlation matrix
    # Restrict to numeric/bool columns explicitly: since pandas 2.0,
    # DataFrame.corr() no longer drops non-numeric columns silently and raises
    # on string columns instead. select_dtypes also works on older releases.
    numeric_train = df_train.select_dtypes(include=["number", "bool"])
    correlations = numeric_train.corr()['TARGET'].sort_values()
    print('\nMost Positive Correlations: \n', correlations.tail(10))
    print('\nMost Negative Correlations: \n', correlations.head(5),'\n')

    heatmap_columns = [
        'TARGET', 'DAYS_BIRTH', 'REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT',
        'DAYS_LAST_PHONE_CHANGE', 'FLOORSMAX_AVG', 'DAYS_EMPLOYED', 'EXT_SOURCE_1',
        'EXT_SOURCE_2', 'EXT_SOURCE_3', "NAME_EDUCATION_TYPE", "CODE_GENDER", "DAYS_ID_PUBLISH",
    ]
    # Calculate correlations
    corr = df_train[heatmap_columns].corr()
    # Heatmap
    plt.figure(figsize=(15,8))
    sns.heatmap(corr, annot=True, linewidths=.2, cmap="YlGnBu")
    print("\nCorrelation matrix :")
    plt.show()

We keep the features that are most correlated with the target and drop the others.

In [4]:
def setup_train(df_train, df_test, max_rows=300000):
    """Build the aligned feature matrix and label vector used for training.

    Selects a fixed list of feature columns from ``df_train``, drops every row
    that has a missing value in any of them, and keeps at most ``max_rows``
    rows. The labels are filtered with the same row mask as the features, so
    row *i* of ``X_train`` always corresponds to row *i* of ``Y_train``.
    (Dropping rows from the features only, as was done before, leaves the two
    objects with different lengths and shifted rows.)

    Args:
        df_train: training DataFrame containing the feature columns and
            ``TARGET``.
        df_test: not used; kept so existing callers keep working.
        max_rows: maximum number of complete rows to keep (default 300000).

    Returns:
        tuple: ``(X_train, Y_train)`` — a DataFrame of features and the
        matching ``TARGET`` Series, sharing the same index.
    """
    feature_columns = [
        'DAYS_BIRTH', 'REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT',
        'DAYS_LAST_PHONE_CHANGE', "NAME_EDUCATION_TYPE", "CODE_GENDER", "DAYS_ID_PUBLISH",
    ]

    # One boolean mask drives both selections so features and labels cannot drift apart.
    complete_rows = df_train[feature_columns].notna().all(axis=1)
    X_train = df_train.loc[complete_rows, feature_columns].iloc[:max_rows]
    Y_train = df_train.loc[complete_rows, "TARGET"].iloc[:max_rows]
    return X_train, Y_train

In [5]:
def RF_model(X_train,Y_train):
    """Fit a random forest and print its accuracy on the data it was fitted on.

    Args:
        X_train: feature matrix passed to ``fit`` and ``predict``.
        Y_train: labels passed to ``fit`` and used as the reference for the score.

    Returns:
        None. The score is printed.

    Note:
        The score is measured on the same rows the model was fitted on; this
        function does not use any held-out data.
    """
    # 100 trees, progress logging on, all available cores
    forest = RandomForestClassifier(n_estimators=100, verbose=1, n_jobs=-1)
    forest.fit(X_train, Y_train)

    # Predict on the training rows themselves, then score against the labels
    fitted_predictions = forest.predict(X_train)
    train_accuracy = metrics.accuracy_score(Y_train, fitted_predictions)

    print("\nRF accuracy score:\n")
    print(train_accuracy)

In [6]:
def GB_model(X_train,Y_train):
    """Fit a gradient-boosting classifier and print its accuracy on the fit data.

    Args:
        X_train: feature matrix passed to ``fit`` and ``predict``.
        Y_train: labels passed to ``fit`` and used as the reference for the score.

    Returns:
        None. The score is printed.

    Note:
        The score is measured on the same rows the model was fitted on; this
        function does not use any held-out data.
    """
    # 1000 boosting stages with progress logging
    booster = GradientBoostingClassifier(n_estimators=1000, verbose=1)
    booster.fit(X_train, Y_train)

    # Predict on the training rows themselves, then score against the labels
    fitted_predictions = booster.predict(X_train)
    train_accuracy = metrics.accuracy_score(Y_train, fitted_predictions)

    print("\nGB accuracy score:\n")
    print(train_accuracy)

In [27]:
def XGBC_model(X_train,Y_train):
    """Fit an XGBoost classifier and print its accuracy on the data it was fitted on.

    Args:
        X_train: feature matrix passed to ``fit`` and ``predict``.
        Y_train: labels passed to ``fit`` and used as the reference for the score.

    Returns:
        None. The score is printed.

    Note:
        The score is measured on the same rows the model was fitted on; this
        function does not use any held-out data.
    """
    # NOTE(review): 'mlogloss' is XGBoost's multiclass log-loss metric; if
    # TARGET is binary, 'logloss' may be what was intended — confirm.
    booster_params = {
        "learning_rate": 0.15,
        "max_depth": 30,
        "scale_pos_weight": 1.5,
        "eval_metric": 'mlogloss',
        "n_jobs": -1,
        "use_label_encoder": False,
    }
    booster = XGBClassifier(**booster_params)
    booster.fit(X_train, Y_train)

    # Predict on the training rows themselves, then score against the labels
    fitted_predictions = booster.predict(X_train)
    train_accuracy = metrics.accuracy_score(Y_train, fitted_predictions)

    print("\nXGBC accuracy score:\n")
    print(train_accuracy)

In [None]:
# Driver: runs the whole pipeline defined above, in order.
# The names assigned here (df_train, df_test, X_train, Y_train) are created at
# module level, so they stay available in the namespace after this cell runs.
if __name__ == "__main__":
    # Load both CSV files, encode the categorical columns, print null counts.
    df_train,df_test = init()
    # Print the correlations with TARGET and show the heatmap.
    matrice_corr(df_train,df_test)
    # Select the feature columns and the TARGET labels used for fitting.
    X_train,Y_train = setup_train(df_train,df_test)
    # Fit each model in turn; each call prints an accuracy score computed on
    # the same data it was fitted on.
    RF_model(X_train,Y_train)
    GB_model(X_train,Y_train)
    XGBC_model(X_train,Y_train)

Number of null / column :
AMT_ANNUITY 24
OWN_CAR_AGE 32312
EXT_SOURCE_1 20532
EXT_SOURCE_2 8
EXT_SOURCE_3 8668
APARTMENTS_AVG 23887
BASEMENTAREA_AVG 27641
YEARS_BEGINEXPLUATATION_AVG 22856
YEARS_BUILD_AVG 31818
COMMONAREA_AVG 33495
ELEVATORS_AVG 25189
ENTRANCES_AVG 23579
FLOORSMAX_AVG 23321
FLOORSMIN_AVG 32466
LANDAREA_AVG 28254
LIVINGAPARTMENTS_AVG 32780
LIVINGAREA_AVG 23552
NONLIVINGAPARTMENTS_AVG 33347
NONLIVINGAREA_AVG 26084
APARTMENTS_MODE 23887
BASEMENTAREA_MODE 27641
YEARS_BEGINEXPLUATATION_MODE 22856
YEARS_BUILD_MODE 31818
COMMONAREA_MODE 33495
ELEVATORS_MODE 25189
ENTRANCES_MODE 23579
FLOORSMAX_MODE 23321
FLOORSMIN_MODE 32466
LANDAREA_MODE 28254
LIVINGAPARTMENTS_MODE 32780
LIVINGAREA_MODE 23552
NONLIVINGAPARTMENTS_MODE 33347
NONLIVINGAREA_MODE 26084
APARTMENTS_MEDI 23887
BASEMENTAREA_MEDI 27641
YEARS_BEGINEXPLUATATION_MEDI 22856
YEARS_BUILD_MEDI 31818
COMMONAREA_MEDI 33495
ELEVATORS_MEDI 25189
ENTRANCES_MEDI 23579
FLOORSMAX_MEDI 23321
FLOORSMIN_MEDI 32466
LANDAREA_MEDI 28254
L