In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pickle

In [2]:
# Read the cleaned train and test application tables in one pass.
df_train, df_test = map(
    pd.read_csv,
    ('data/application_train_clean.csv', 'data/application_test_clean.csv'),
)

# Preview the first rows of the training table.
df_train.head()

Unnamed: 0,sk_id_curr,target,is_revolving_loan,is_male,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,...,flag_document_12,flag_document_13,flag_document_14,flag_document_15,flag_document_16,flag_document_17,flag_document_18,flag_document_19,flag_document_20,flag_document_21
0,100002,1,0,1,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0,0,0,0,0,0
1,100003,0,0,0,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0,0,0,0,0,0
2,100004,0,1,1,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0,0,0,0,0,0
3,100006,0,0,0,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,0,1,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0,0,0,0,0,0


# Separate Target

In [3]:
# Columns that must never reach the scaler/PCA: the row index that was written
# into the CSV ('Unnamed: 0', visible in the head() preview), the applicant id,
# and the label.
non_feature_cols = ['Unnamed: 0', 'sk_id_curr', 'target']

train_cols = [col for col in df_train.columns if col not in non_feature_cols]
# Reuse the training column list so both matrices share the exact same column
# order; a column missing from the test table raises KeyError instead of
# silently misaligning features.
test_cols = list(train_cols)

# Plain NumPy feature matrices for scikit-learn.
train = df_train[train_cols].values
test = df_test[test_cols].values

# Both shapes must report the same number of columns.
print(train.shape)
print(test.shape)

(307511, 58)
(48744, 58)


# Scaling

In [4]:
# Fit the scaler on the training matrix only, then apply the same fitted
# transform to both matrices.
scaler = StandardScaler().fit(train)
train, test = scaler.transform(train), scaler.transform(test)

# PCA

In [5]:
# Fit a 38-component PCA on the scaled training matrix (seeded via random_state).
pca = PCA(n_components=38, random_state=42).fit(train)

# Project both matrices with the projection fitted on train.
train, test = pca.transform(train), pca.transform(test)

# Total fraction of variance captured by the retained components.
sum(pca.explained_variance_ratio_)

0.8535961846118231

85% variance explained, shaved an additional 20 columns off.

# Pickle & Save

In [6]:
# Rebuild each table as its identifier columns (plus the label for train)
# followed by the PCA components, joined side by side.
df_train = pd.concat(
    [df_train[['sk_id_curr', 'target']], pd.DataFrame(data=train)],
    axis=1,
)
df_test = pd.concat(
    [df_test['sk_id_curr'], pd.DataFrame(data=test)],
    axis=1,
)

# Write the reduced tables without the DataFrame index.
df_train.to_csv('data/application_train_pca.csv', index=False)
df_test.to_csv('data/application_test_pca.csv', index=False)

# Persist the fitted PCA object.
# NOTE(review): only `pca` is pickled here; the fitted `scaler` is not saved by this cell.
with open('pca.pkl', 'wb') as pkl_out:
    pickle.dump(pca, pkl_out)

In [7]:
# Round-trip check: load the PCA object back from disk.
# pickle.load executes arbitrary code, so only use it on trusted files;
# 'pca.pkl' is the file written by the save cell of this notebook.
with open('pca.pkl', 'rb') as pkl_in:
    model = pickle.load(pkl_in)

# Should match the explained-variance total reported when the PCA was fitted.
sum(model.explained_variance_ratio_)

0.8535961846118231

Pickle successful: the reloaded model reports the same explained variance as the fitted PCA.