In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import xgboost as xgb
from sklearn import cross_validation, metrics
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from datetime import date

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training CSV. engine='python' selects pandas' Python parsing engine
# (why this file needs it is not shown here -- TODO confirm it is still required).
train = pd.read_csv('../input/Train.csv', engine='python')
train.head()

In [None]:
# Read the test CSV with the default parser engine.
test = pd.read_csv('../input/Test.csv')
test.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Summary statistics with the default column selection.
train.describe()

In [None]:
# Summary restricted to object-dtype columns.
train.describe(include='O')

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Column labels of the training frame.
train.columns

In [None]:
# Column labels of the test frame.
test.columns

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Keep the label and the row counts aside before the two frames are stacked,
# then remove the two outcome columns from the training features in place.
IDcol = 'ID'
target = train['Disbursed']
n_train, n_test = len(train), len(test)
train.drop(labels=['LoggedIn', 'Disbursed'], axis='columns', inplace=True)

In [None]:
data = pd.concat((train, test)).reset_index(drop=True)
data.shape

In [None]:
data.drop(['City'], axis=1, inplace=True)

In [None]:
data['DOB'] = pd.to_datetime(data['DOB'])

In [None]:
data['DOB'][0]

In [None]:
def age(born, today=None):
    """Return the age in completed years of someone born on ``born``.

    Parameters
    ----------
    born : object exposing ``year``, ``month`` and ``day``
        Date of birth (for example ``datetime.date`` or ``pandas.Timestamp``).
    today : object exposing ``year``, ``month`` and ``day``, optional
        Reference date the age is measured at. Defaults to ``date.today()``;
        pass a fixed date to make the result reproducible.

    Returns
    -------
    number
        ``today.year - born.year``, reduced by one when the birthday has not
        yet been reached in the reference year.
    """
    if today is None:
        today = date.today()
    # The tuple comparison is True (counts as 1) while this year's birthday is still ahead.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_pending

In [None]:
data['Age'] = data.apply(lambda row: age(row['DOB']), axis=1)

In [None]:
data.drop(['DOB'], axis=1, inplace=True)

In [None]:
data.head()

In [None]:
import math
# Probe: is the entry labelled 0 of EMI_Loan_Submitted a NaN?
math.isnan(data['EMI_Loan_Submitted'][0])

In [None]:
def missing_values(col, new_col):
    """Build a one-column 0/1 indicator frame marking the missing entries of ``col``.

    Parameters
    ----------
    col : pandas.Series or array-like
        Values to inspect.
    new_col : str
        Name given to the single indicator column of the result.

    Returns
    -------
    pandas.DataFrame
        One int64 column named ``new_col`` holding 1 where ``col`` is missing
        and 0 elsewhere. The result has one row per entry of ``col`` and keeps
        the index of ``col``, so it lines up in a column-wise concat.
    """
    # Size and index come from the argument itself rather than a global frame,
    # and the null test is vectorised instead of a per-element math.isnan loop.
    flags = pd.Series(col).isna().astype('int64')
    # to_frame names the column directly; a set of column labels is unordered
    # and newer pandas releases reject it in the DataFrame constructor.
    return flags.to_frame(name=new_col)
EMI_Loan_Submitted_Missing = missing_values(data['EMI_Loan_Submitted'], 'EMI_Loan_Submitted_Missing')
Interest_Rate_Missing = missing_values(data['Interest_Rate'], 'Interest_Rate_Missing')
Loan_Amount_Submitted_Missing = missing_values(data['Loan_Amount_Submitted'], 'Loan_Amount_Submitted_Missing')
Loan_Tenure_Submitted_Missing = missing_values(data['Loan_Tenure_Submitted'], 'Loan_Tenure_Submitted_Missing')
Processing_Fee_Missing = missing_values(data['Processing_Fee'], 'Processing_Fee_Missing')

In [None]:
data = pd.concat((data, EMI_Loan_Submitted_Missing, Interest_Rate_Missing, Loan_Amount_Submitted_Missing, Loan_Tenure_Submitted_Missing, Processing_Fee_Missing), axis=1)

In [None]:
data.drop(['EMI_Loan_Submitted', 'Interest_Rate', 'Loan_Amount_Submitted', 'Loan_Tenure_Submitted', 'Processing_Fee',
           'Employer_Name', 'Lead_Creation_Date', 'Salary_Account'], axis=1, inplace=True)

In [None]:
data.head()

In [None]:
sns.countplot('Device_Type', data=data)

In [None]:
plt.figure(figsize=(16, 9))
sns.countplot('Source', data=data)
plt.xticks(rotation=90)
plt.show()

In [None]:
data.isnull().sum().sort_values(ascending=False)

In [None]:
data['Loan_Amount_Applied'].median()

In [None]:
cols = ['Loan_Amount_Applied', 'Loan_Tenure_Applied', 'Existing_EMI']
for col in cols:
    data[col] = data[col].fillna(data[col].median())

In [None]:
data.isnull().sum().max()

In [None]:
# Keep only the 'S122' and 'S133' codes; every other value (missing ones included) becomes 'others'.
data['Source'] = data['Source'].where(data['Source'].isin(['S122', 'S133']), 'others')

In [None]:
# Row counts per Var1. x= is required because seaborn >= 0.12 takes `data` as
# its first positional parameter; the keyword form works on older releases too.
sns.countplot(x='Var1', data=data)

In [None]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
cols = ['Gender', 'Mobile_Verified', 'Filled_Form', 'Device_Type', 'Var2', 'Var1', 'Source']
for col in cols:
    data[col] = label.fit(data[col]).transform(data[col])

In [None]:
data = pd.get_dummies(data, columns=cols)
data.columns

In [None]:
data.head()

In [None]:
train = data[:n_train]
test = data[n_train:]

In [None]:
train = pd.concat((train, target), axis=1)

In [None]:
train.head()

In [None]:
# Write both processed frames to the working directory, without the index column.
train.to_csv('train_modified.csv', index=False)
test.to_csv('test_modified.csv', index=False)