In [1]:
# Import required modules

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the combined train/test table from the csv file.
# Layout of the file: 981 rows in total -- the first 614 are the train set,
# the last 367 are the test set.

DATA_FILE = 'alldata.csv'
df = pd.read_csv(DATA_FILE)
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [3]:
# Work on an independent copy so the raw frame `df` stays untouched by the
# pre-processing below (DataFrame.copy() makes a deep copy by default).
df1 = df.copy()

In [4]:
# Pre-processing: Convert Categorical variables to numbers and make adjustments for null values
#
# Every encoded column is derived from the untouched `df`, and df1 is rebuilt from
# scratch here, so re-running this cell always yields the same df1.

df1 = df.copy(deep=True)

# Binary categoricals: 1 for the listed label, 0 for anything else.
# A missing value never equals the label, so it is encoded as 0 as well.
positive_labels = {
    'Gender': 'Male',
    'Married': 'Yes',
    'Education': 'Graduate',
    'Self_Employed': 'Yes',
}
for column, label in positive_labels.items():
    df1[column] = df[column].eq(label).astype('int64')

# Dependents: '3+' is capped at 3, every other value is parsed as a number,
# and missing values are replaced by the mean of the parsed column.
dependents = df['Dependents'].map(lambda x: 3 if x == '3+' else float(x))
df1['Dependents'] = dependents.fillna(dependents.mean())

# Numeric columns: missing values are replaced by the column mean.
for column in ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']:
    df1[column] = df[column].fillna(df[column].mean())

# Property_Area: Urban -> 2, Semiurban -> 1, anything else (including missing) -> 0.
df1['Property_Area'] = (
    df['Property_Area']
    .map({'Urban': 2, 'Semiurban': 1})
    .fillna(0)
    .astype('int64')
)

df1.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
976,LP002971,1,1,3.0,0,1,4009,1777.0,113.0,360.0,1.0,2
977,LP002975,1,1,0.0,1,0,4158,709.0,115.0,360.0,1.0,2
978,LP002980,1,0,0.0,1,0,3250,1993.0,126.0,360.0,0.83592,1
979,LP002986,1,1,0.0,1,0,5000,2393.0,158.0,360.0,1.0,0
980,LP002989,1,0,0.0,1,1,9200,0.0,98.0,180.0,1.0,0


In [5]:
# Scale the data using MinMaxScaler() and perform PCA
#
# NOTE(review): the scaler and the PCA are both fitted on every row of df1. If df1
# holds train and test rows together, the test rows influence the fit -- confirm
# that this is intended.

N_COMPONENTS = 6     # Specify the number of Principal Components
PCA_SEED = 42        # Only used when PCA selects a randomized SVD solver; keeps re-runs reproducible

scaler = MinMaxScaler()
pca = PCA(n_components=N_COMPONENTS, random_state=PCA_SEED)

features = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']

# Separating out the features and rescaling each one to the [0, 1] range
X = scaler.fit_transform(df1.loc[:, features].values)

principalComponents = pca.fit_transform(X)

# Column names (pc1, pc2, ...) follow the actual number of components returned,
# so changing N_COMPONENTS does not require editing the name list.
pc_names = ['pc{}'.format(i) for i in range(1, principalComponents.shape[1] + 1)]
principalDf = pd.DataFrame(data=principalComponents, columns=pc_names)
principalDf.head()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6
0,0.532196,-0.372043,0.305666,-0.54659,0.135447,0.00973
1,-0.389258,-0.045656,-0.5736,-0.017898,-0.057655,-0.232852
2,-0.293952,-0.467042,0.134255,-0.177374,-0.261998,0.795718
3,-0.378499,0.394433,0.677732,-0.340884,-0.22023,-0.150057
4,0.532253,-0.371999,0.305718,-0.546631,0.135432,0.009699


In [6]:
# Separate the train set Principal Components (PCs) from Test Set PCs

N_TRAIN_ROWS = 614     # The first 614 rows are the train set; every row after that is the test set

# Positional slices, so the two parts cover principalDf exactly once with no
# empty trailing piece.
test_train_dfs = [
    principalDf.iloc[:N_TRAIN_ROWS],     # [0] train set PCs
    principalDf.iloc[N_TRAIN_ROWS:],     # [1] test set PCs
]

In [7]:
# Inspect the last rows of the train set PCs (first element of the split)
train_pcs = test_train_dfs[0]
train_pcs.tail()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6
609,1.006864,0.07107,-0.481291,0.302156,-0.26324,-0.262554
610,-0.607257,-0.006084,-0.49238,0.194913,-0.091029,-0.130987
611,-0.394248,-0.473673,0.30912,-0.099693,-0.045803,-0.085052
612,-0.498234,-0.452489,0.347638,0.007699,-0.061313,-0.039808
613,0.9992,0.061467,-0.108885,0.714614,0.367396,0.924789


In [8]:
# Inspect the last rows of the test set PCs (second element of the split)
test_pcs = test_train_dfs[1]
test_pcs.tail()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6
976,-0.716815,0.45808,0.633832,0.03051,-0.496405,0.939447
977,-0.2799,-0.482073,0.281182,-0.216168,-0.031075,-0.143275
978,0.534166,-0.129052,-0.124858,-0.436347,0.270465,-0.033875
979,-0.283648,-0.064893,-0.61049,-0.126851,-0.04215,-0.280191
980,0.508034,0.061178,-0.725197,-0.423431,-0.109456,0.819669


In [9]:
# Write the train and test PCs to local csv files (row index omitted) for Predictive Analytics

for pcs, csv_name in zip(test_train_dfs, ['pca_train.csv', 'pca_test.csv']):
    pcs.to_csv(csv_name, index=False)