# Citibank Housing ML

In [0]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [0]:
# Extract the dataset archive into the working directory.
# -q keeps unzip quiet; -n never overwrites files that already exist, so re-running is harmless.
!unzip -qn home_credit_data.zip

In [0]:
# Load the training applications table (presumably extracted from home_credit_data.zip).
# NOTE(review): display.width is set to 0 here — confirm this is the intended pandas setting.
pd.set_option('display.width', 0)
data = pd.read_csv('application_train.csv')
print('Training data shape: ', data.shape)
# Preview the first rows as the cell output.
data.head()

In [0]:
# Column names kept as model inputs; `data` is subset to these (plus TARGET) below.
# The selection criteria are not shown in this notebook.
# Entries that are commented out are currently excluded from the model.
best_features = ['DAYS_BIRTH', 
                 'DAYS_EMPLOYED', 
                 'AMT_ANNUITY', 
                 'DAYS_LAST_PHONE_CHANGE',
                 'AMT_CREDIT', 
                 'AMT_INCOME_TOTAL',
                 'AMT_GOODS_PRICE',
                 'HOUR_APPR_PROCESS_START', 
                 'OWN_CAR_AGE',
                 'CNT_FAM_MEMBERS',
                 'CNT_CHILDREN',
                 'FLAG_OWN_REALTY',
                 'FLAG_PHONE', 
                 'NAME_FAMILY_STATUS',
                 'FLAG_WORK_PHONE', 
                 'ORGANIZATION_TYPE',
                 'OCCUPATION_TYPE',
                 'NAME_INCOME_TYPE',
                 'NAME_EDUCATION_TYPE', 
                 'CODE_GENDER', 
#                  'NAME_HOUSING_TYPE', 
                 'FLAG_EMAIL', 
#                  'NAME_CONTRACT_TYPE'
                ]

In [0]:
# Keep only the selected feature columns plus the TARGET label column.
data = data[best_features + ['TARGET']]
print(data.shape)
# Encode a separate copy so `data` keeps the original, un-encoded values.
app_train = data.copy()

In [0]:
# Integer-encode text columns that have at most two distinct values.

def encode_labels(x):
    """Return a copy of ``x`` with its binary text columns replaced by integer codes.

    A column is encoded when its dtype is ``object`` and it has at most two
    distinct values (missing values count as a distinct value for this check).
    Codes follow the sorted order of the values present in ``x``: the smallest
    value becomes 0, the next becomes 1. Missing entries receive the
    ``pd.factorize`` sentinel -1. All other columns are returned unchanged.

    The codes are derived from the frame passed in, so two frames that contain
    different value sets for the same column can receive different codes.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``x``.
    """
    # Work on a copy so callers that pass a slice of another frame are not
    # mutated (and do not trigger SettingWithCopyWarning).
    encoded = x.copy()
    for col in encoded.columns:
        column = encoded[col]
        if column.dtype == 'object' and column.nunique(dropna=False) <= 2:
            # sort=True makes the code assignment deterministic (sorted labels).
            codes, _ = pd.factorize(column, sort=True)
            encoded[col] = codes
    return encoded
                
# Integer-encode the binary text columns of the working training frame.
app_train = encode_labels(app_train)

In [0]:
# Count columns per dtype to see how many text (object) columns are still left.
app_train.dtypes.value_counts()

In [0]:
# one-hot encoding of categorical variables
app_train = pd.get_dummies(app_train)
print('Training Features shape: ', app_train.shape)

In [0]:
# Keep the labels as their own Series. TARGET is still a column of app_train at this
# point, so the shape printed below includes the label column.
train_labels = app_train['TARGET']
print('Training Features shape: ', app_train.shape)

In [0]:
from sklearn.preprocessing import MinMaxScaler

# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# `SimpleImputer` is its replacement. Fall back to the old class on old installs.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer as SimpleImputer

# Feature matrix: everything except the label column
if 'TARGET' in app_train:
    train = app_train.drop(columns = ['TARGET'])
else:
    train = app_train.copy()

# Feature names, in the column order the imputer and scaler are fitted on
features = list(train.columns)

# NOTE(review): both transformers are fitted on every row before the
# train/validation split performed later in this notebook, so statistics from
# the validation rows leak into preprocessing. Fit on the training part only
# if an unbiased validation score is required.

# Median imputation of missing values
imputer = SimpleImputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit each transformer and apply it; `train` becomes a plain array from here on,
# while `imputer` and `scaler` stay available for transforming new samples.
train = imputer.fit_transform(train)
train = scaler.fit_transform(train)

print('Training data shape: ', train.shape)

In [0]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for validation. Stratifying on the labels keeps the class
# ratio the same in both parts; the fixed random_state makes the split repeatable.
x_trn, x_val, y_trn, y_val = train_test_split(train, train_labels, random_state=42, test_size=0.1, stratify=train_labels)

In [0]:
from sklearn.linear_model import *

In [0]:
%%time
# (the %%time cell magic has to stay on the first line of the cell)
# Ridge classifier; class_weight='balanced' weights each class inversely to its frequency in y_trn.
model = RidgeClassifier(class_weight='balanced')
# Train on the training data
model.fit(x_trn, y_trn)

In [0]:
# Score the validation rows.
# RidgeClassifier does not provide predict_proba, so the signed decision_function
# values are used as the ranking score instead of class probabilities.
pred = model.decision_function(x_val)

In [0]:
from sklearn.metrics import roc_auc_score
# ROC AUC on the held-out validation rows, computed from the decision scores.
roc_auc_score(y_val, pred)

In [0]:
# One-row sample frame taken from the top of `data`. It is copied so that later
# in-place edits to the sample cannot write into (or warn about) a slice of `data`.
test = data.iloc[:1].copy()

In [0]:
def process_sample(df):
    """Convert raw application rows into the scaled feature matrix used by the model.

    The rows go through ``encode_labels`` and one-hot encoding, are lined up
    with the training feature columns (``app_train`` without ``TARGET``), and
    are then passed through the fitted ``imputer`` and ``scaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the layout of ``data``; a ``TARGET`` column is optional.
        The frame is not modified.

    Returns
    -------
    The output of ``scaler.transform`` for the aligned rows, with columns in
    the same order as the training feature frame.
    """
    # Copy first so the encoding steps never write into the caller's frame.
    df = pd.get_dummies(encode_labels(df.copy()))
    template = app_train.drop(columns='TARGET', errors='ignore')
    # Reindex to the exact training columns: this drops TARGET and any column
    # the model never saw, restores the fitted column order (an outer-join
    # align does not guarantee it), and sets one-hot columns for categories
    # absent from this sample to 0 instead of leaving NaN for median imputation.
    df = df.reindex(columns=template.columns, fill_value=0)
    # NOTE(review): encode_labels decides per frame which text columns are
    # binary and derives the codes from the values present in that frame. For
    # small samples (e.g. a single row) the resulting codes and columns can
    # differ from those produced on the training data; text columns that end up
    # without a matching training column are dropped by the reindex above.
    # TODO: persist the training-time encoding and reuse it here.
    return scaler.transform(imputer.transform(df))

In [0]:
def demo(data):
    """Predict the class of one randomly chosen row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Application rows; a ``TARGET`` column is optional. Must contain at
        least one row.

    Returns
    -------
    The output of ``model.predict`` for the chosen row (one prediction).
    """
    # choice(n) draws from 0..n-1, so every row -- including the last one -- is eligible.
    i = np.random.choice(len(data))
    sample = data.iloc[i:i+1]
    # process_sample drops TARGET and applies the encoding, column alignment,
    # imputation and scaling; text columns cannot be fed to the imputer directly.
    return model.predict(process_sample(sample))

In [0]:
def tune_price(sample, step=10000, max_steps=1000):
    """Move AMT_GOODS_PRICE as far as possible without changing the model's prediction.

    The prediction for the original sample fixes the direction: a truthy
    prediction means the price is raised, a falsy one means it is lowered.
    The price moves in increments of ``step`` for as long as the prediction
    stays the same as the original one; each accepted price is printed.

    Parameters
    ----------
    sample : pandas.DataFrame
        Single-row frame accepted by ``preprocess`` that contains an
        ``AMT_GOODS_PRICE`` column. It is not modified.
    step : number, optional
        Price increment per iteration (default 10000).
    max_steps : int, optional
        Upper bound on the number of price changes tried, so the search always
        terminates even if the prediction never flips (default 1000).

    Returns
    -------
    The last price for which the prediction still equals the original
    prediction (the original price if the very first change flips it).
    """
    price_col = sample.columns.get_loc('AMT_GOODS_PRICE')
    price = sample['AMT_GOODS_PRICE'].values[0]
    print('Original price', price)

    def predict_at(candidate):
        # Score a modified copy so the caller's sample stays untouched.
        trial = sample.copy()
        trial.iloc[0, price_col] = candidate
        return bool(model.predict(preprocess(trial))[0])

    initial = bool(model.predict(preprocess(sample))[0])
    if initial:
        print('True, increasing price')
        direction = 1
    else:
        print('False, decreasing price')
        direction = -1

    for _ in range(max_steps):
        candidate = price + direction * step
        # Stop at the first flipped prediction, and never search below a price of zero.
        if candidate < 0 or predict_at(candidate) != initial:
            break
        price = candidate
        print(price)
    return price

In [0]:
def preprocess(sample):
    """Impute and scale feature rows with the fitted ``imputer`` and ``scaler``.

    Parameters
    ----------
    sample : pandas.DataFrame
        Rows to transform; expected to already be in the numeric column layout
        the imputer was fitted on. A ``TARGET`` column is dropped when present
        and its absence is not an error.

    Returns
    -------
    The output of ``scaler.transform`` for the imputed rows.
    """
    features_only = sample.drop(columns='TARGET', errors='ignore')
    return scaler.transform(imputer.transform(features_only))

In [0]:
def load_eligibility(tenure, outstanding_loans):
    """Look up the eligibility ratio for a tenure band and a count of outstanding loans.

    Bands: ``tenure <= 25`` and ``tenure > 25`` (units are not stated in this
    notebook). Within a band the ratio depends on whether
    ``outstanding_loans`` is 0, exactly 1, or greater than 1.

    Returns the ratio as a float, or ``None`` when the inputs match none of
    the cases (for example a negative or fractional loan count, or a NaN tenure).
    """
    # Ratios per band, ordered as (no loans, exactly one loan, more than one loan).
    if tenure <= 25:
        no_loans, one_loan, several_loans = 0.75, 0.45, 0.35
    elif tenure > 25:
        no_loans, one_loan, several_loans = 0.55, 0.25, 0.15
    else:
        return None

    if outstanding_loans == 0:
        return no_loans
    if outstanding_loans == 1:
        return one_loan
    if outstanding_loans > 1:
        return several_loans
    return None