In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import gc

In [2]:
# Load the aggregated feature table into `df`.
# NOTE(review): the file name suggests columns with more than 60% missing
# values were already removed upstream — confirm against the producing notebook.
df = pd.read_csv('../Input/Aggregated_Removed_Empty_60.csv')

In [3]:
# Show the (rows, columns) dimensions of the frame that was just loaded.
df.shape

(356244, 859)

In [4]:
# Fraction of missing values in every column, largest first
# (the mean of the boolean null mask equals null count / row count).
empty_columns = df.isnull().mean().sort_values(ascending = False)

# Keep only the labels of the columns that are more than 60% empty
empty_columns = empty_columns.index[empty_columns > .60]

print('There are %d columns with more than 60%% missing values' % len(empty_columns))

# Keep separate references to the TARGET and SK_ID_CURR columns before the
# frame is reassigned below
target_labels = df['TARGET']
SK_IDS = df['SK_ID_CURR']

# Drop the mostly-empty columns, then one-hot encode the remaining
# object/category columns
df = pd.get_dummies(df.drop(columns = empty_columns))

# Last expression of the cell: display the resulting (rows, columns)
df.shape

There are 659 columns with more than 60% mising values


(356244, 858)

In [5]:
# Reload `df` from the cleaned feature file. This rebinds `df`, so the frame
# built by the cells above is no longer reachable through this name.
df = pd.read_csv('../Input/Cleaned_Aggregated_Features.csv')

In [6]:
# Drop the columns listed in `empty_columns`, then one-hot encode the remaining
# object/category columns and show the resulting (rows, columns).
# NOTE(review): `empty_columns` was computed in an earlier cell from a different
# CSV; `DataFrame.drop` raises KeyError if any of those labels is absent from
# the frame loaded here — confirm both files share these columns.
df = pd.get_dummies(df.drop(columns = empty_columns))
df.shape

(356244, 857)

In [None]:
# Threshold for removing correlated variables: a column is flagged when its
# absolute correlation with another column exceeds this value
threshold = 0.8

# Absolute value correlation matrix.
# It is deliberately NOT deleted in this cell: the following cells read
# `corr_matrix` (preview and upper-triangle selection), so deleting it here
# makes the notebook fail with NameError when run top to bottom.
corr_matrix = df.corr().abs()

In [5]:
# Preview the first rows of the absolute correlation matrix.
corr_matrix.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,APARTMENTS_AVG,APARTMENTS_MEDI,...,INS_ins AMT_INSTALMENT - AMT_PAYMENT_MIN,INS_ins AMT_INSTALMENT - AMT_PAYMENT_MAX,INS_ins AMT_INSTALMENT - AMT_PAYMENT_MEAN,INS_ins AMT_INSTALMENT - AMT_PAYMENT_VAR,INS_ins AMT_INSTALMENT - AMT_PAYMENT_SUM,INS_ins AMT_PAYMENT / AMT_INSTALMENT_MIN,INS_ins AMT_PAYMENT / AMT_INSTALMENT_MAX,INS_ins AMT_PAYMENT / AMT_INSTALMENT_MEAN,INS_ins AMT_PAYMENT / AMT_INSTALMENT_VAR,INS_ins AMT_PAYMENT / AMT_INSTALMENT_SUM
Unnamed: 0,1.0,1.0,0.0424,0.042463,0.020216,0.061768,0.092033,0.012808,0.010543,0.010817,...,0.009414,0.006551,0.0091,0.000899,0.011408,0.00119,0.003166,0.004924,0.003779,0.002254
Unnamed: 0.1,1.0,1.0,0.0424,0.042463,0.020216,0.061768,0.092033,0.012808,0.010543,0.010817,...,0.009414,0.006551,0.0091,0.000899,0.011408,0.00119,0.003166,0.004924,0.003779,0.002254
AMT_CREDIT,0.0424,0.0424,1.0,0.987161,0.345288,0.058058,0.003924,0.047655,0.060684,0.058884,...,0.07402,0.053792,0.064027,0.054479,0.062338,0.006213,0.003033,0.002241,0.001578,0.005999
AMT_GOODS_PRICE,0.042463,0.042463,0.987161,1.0,0.351689,0.05986,0.003988,0.050087,0.065053,0.063297,...,0.076175,0.053135,0.067531,0.056794,0.065364,0.008877,0.002739,0.002246,0.001257,0.005473
AMT_INCOME_TOTAL,0.020216,0.020216,0.345288,0.351689,1.0,0.053109,0.01848,0.027367,0.104055,0.10193,...,0.079044,0.10432,0.051465,0.054266,0.046261,0.024672,0.004959,0.006469,0.004302,0.007939


In [6]:
# Upper triangle of correlations: k=1 excludes the diagonal, so every pair of
# columns appears once and self-correlations are masked out (set to NaN).
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype = bool), k = 1))

# Select columns with at least one correlation above threshold.
# NaN entries (the masked lower triangle) compare as False, so they never
# trigger a drop; of each correlated pair, the later column is the one listed.
to_drop = upper.columns[(upper > threshold).any()].tolist()

print('There are %d columns to remove.' % (len(to_drop)))

# Drop correlated columns
df = df.drop(columns = to_drop)
print('Dataframe shape: ', df.shape)

There are 533 columns to remove.
Dataframe shape:  (356244, 326)


In [None]:
# Preview the first rows of the frame after the correlated columns were dropped.
df.head()

In [None]:
# Persist the reduced feature table.
# index = False keeps the positional row index out of the file; otherwise it is
# written as an unnamed first column and comes back from `pd.read_csv` as a
# spurious 'Unnamed: 0' feature (visible in the correlation matrix preview).
# NOTE(review): any reader that relies on `index_col=0` must be updated.
df.to_csv('../Input/Cleaned_Aggregated_Features_Removed_Correlated.csv', index = False)

In [7]:
# Persist the reduced feature table.
# index = False keeps the positional row index out of the file; otherwise it is
# written as an unnamed first column and comes back from `pd.read_csv` as a
# spurious 'Unnamed: 0' feature (visible in the correlation matrix preview).
# NOTE(review): any reader that relies on `index_col=0` must be updated.
df.to_csv('../Input/Aggregated_Removed_Empty_60_Corr.csv', index = False)