In [1]:
# Name of this model; the final cell formats it into the output CSV filenames.
model_name = "knn_on_selected_pca"

In [2]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.decomposition import PCA
import os
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [3]:
# Wall-clock checkpoints; the first entry is the moment this cell was run.
timesheet = [time()]


def timer(statement):
    """Print the seconds elapsed since the previous checkpoint.

    Appends the current time to the module-level ``timesheet`` list and
    prints ``"<statement> : <elapsed> seconds"``, where ``<elapsed>`` is the
    difference between the two most recent entries of ``timesheet``.

    Parameters
    ----------
    statement : str
        Label printed in front of the elapsed time.

    Returns
    -------
    None
    """
    # The list is mutated in place and never rebound, so no ``global`` is needed.
    timesheet.append(time())
    elapsed = timesheet[-1] - timesheet[-2]
    # A single-argument print(...) call behaves the same on Python 2 and 3.
    print("{} : {} seconds".format(statement, elapsed))

## Get Application Train/Test Data

In [4]:
# Feature columns requested from both application files.
raw_feature_cols = ['AMT_INCOME_TOTAL', 'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
                    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# Only the train file is read with the TARGET label column.
train = pd.read_csv('../data/application_train.csv',
                    usecols=["SK_ID_CURR", 'TARGET'] + raw_feature_cols)
test = pd.read_csv('../data/application_test.csv',
                   usecols=["SK_ID_CURR"] + raw_feature_cols)

target = train.TARGET
# Explicit copies: a TARGET column is assigned onto these id frames later, and
# writing into a bare slice of train/test would be a chained assignment.
train_id = train[["SK_ID_CURR"]].copy()
test_id = test[["SK_ID_CURR"]].copy()

# Stack train on top of test (fresh 0..n-1 index) so the derived feature is
# computed once for both sets.
data = pd.concat([train, test], axis=0).reset_index(drop=True)
data['credit_to_annuity_ratio'] = data['AMT_CREDIT'] / data['AMT_ANNUITY']

In [5]:
# Columns fed to the PCA: the derived ratio plus the raw columns loaded above.
pca_cols = [
    "credit_to_annuity_ratio",
    'AMT_INCOME_TOTAL',
    'AMT_ANNUITY',
    'AMT_CREDIT',
    'AMT_GOODS_PRICE',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
]

# Missing values are replaced by -1 and the columns are passed through
# StandardScaler before the PCA is fitted on train and test rows together.
pca_input = StandardScaler().fit_transform(data[pca_cols].fillna(-1))
pca_values = PCA(random_state=0).fit_transform(pca_input)

# Wrap the projected array in a DataFrame with columns pca_0, pca_1, ...
pca_names = ["pca_{}".format(i) for i in range(pca_values.shape[1])]
pca = pd.DataFrame(pca_values, columns=pca_names)

# Generate Data For Training

In [6]:
# `pca` was built from train rows stacked on top of test rows, so the number
# of training rows is the split point between the two sets.
n_train = train.shape[0]
train = pca.iloc[:n_train].reset_index(drop=True)
test = pca.iloc[n_train:].reset_index(drop=True)

# Defining Model

In [7]:
from sklearn.neighbors import KNeighborsClassifier

def model_knn(x_train, x_test, y_train, y_test, test, meta_train, meta_test, train_index, test_index, fold_id, n_neighbors=150):
    """Fit a k-nearest-neighbours classifier on one fold and record its predictions.

    Parameters
    ----------
    x_train, y_train
        Features and labels the classifier is fitted on.
    x_test, y_test
        Held-out features and labels of this fold; used for the out-of-fold
        predictions and the printed ROC AUC.
    test
        Features of the final test set; predicted with this fold's classifier.
    meta_train
        Array that is written in place at positions ``test_index`` with the
        held-out predictions.
    meta_test : list
        List that gets this fold's test-set predictions appended.
    train_index, fold_id
        Not used by this function; accepted so the call signature stays the same.
    test_index
        Positions in ``meta_train`` that belong to the held-out rows.
    n_neighbors : int, optional
        Number of neighbours passed to ``KNeighborsClassifier`` (default 150).

    Returns
    -------
    None
        Results are delivered through ``meta_train`` and ``meta_test``; the
        fold's ROC AUC is printed.
    """
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(x_train, y_train)

    # Column 1 of predict_proba is kept as the prediction score.
    meta_train[test_index] = clf.predict_proba(x_test)[:, 1]
    meta_test.append(clf.predict_proba(test)[:, 1])

    # A single-argument print(...) call behaves the same on Python 2 and 3.
    print(roc_auc_score(y_test, meta_train[test_index]))

# Training Classifier

In [8]:
# Out-of-fold predictions for the training rows, filled in one fold at a time.
meta_train = np.zeros(train.shape[0])
# One array of test-set predictions per fold.
meta_test = []

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=47)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    # kf.split yields positional indices, so features and labels are both
    # selected positionally with .iloc.
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    model_knn(x_train, x_test, y_train, y_test, test, meta_train, meta_test, train_index, test_index, fold_id)

# Average the per-fold test predictions into one score per test row.
test_id["TARGET"] = np.array(meta_test).T.mean(axis=1)
train_id["TARGET"] = meta_train

0.7088865433200032
0.7266345530020852
0.718211647045963
0.7318904708229481
0.7170925724834424
0.7090863335546161
0.725133733505265
0.7164610097185372
0.72179060966709
0.7181111388440811


In [10]:
# Create the output directory first; to_csv does not create missing directories.
output_dir = "csv"
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# Write the out-of-fold train predictions and the fold-averaged test predictions.
train_id.to_csv("csv/{}_train.csv".format(model_name), index=False)
test_id.to_csv("csv/{}_test.csv".format(model_name), index=False)