In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression


%matplotlib inline

In [2]:
# Load the raw train and test CSVs with the same reader settings.
# low_memory=False makes pandas parse each file in a single pass so dtype
# inference sees whole columns instead of per-chunk guesses.
df_train, df_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('train_v2.csv', 'test_v2.csv')
)

In [3]:
# Handle missing values in df_train: for every numeric column that contains
# NaNs, remember where they were in a boolean "<col>_ismissing" column and
# then fill the gaps with that column's mean.
# The column list is taken once, up front, so the indicator columns added
# inside the loop are not themselves visited.
numeric_cols = df_train.select_dtypes(include=[np.number]).columns.values

for col in numeric_cols:
    is_null = df_train[col].isnull()
    if not is_null.any():
        continue  # nothing to impute in this column
    df_train[f'{col}_ismissing'] = is_null
    df_train[col] = df_train[col].fillna(df_train[col].mean())

In [4]:
# Handle missing values in the test frame: flag the NaN positions of each
# numeric column in a boolean "<col>_ismissing" column, then mean-fill it.
# NOTE(review): the fill value is the mean of the test column itself, not the
# value used for df_train — confirm this is intended.
test_numeric_cols = df_test.select_dtypes(include=[np.number]).columns.values

for col in test_numeric_cols:
    nan_mask = df_test[col].isnull()
    if nan_mask.sum() == 0:
        continue  # column is complete
    df_test['{}_ismissing'.format(col)] = nan_mask
    df_test[col] = df_test[col].fillna(df_test[col].mean())

In [5]:
# Dropping irrelevant columns, selected by position: columns 771..1283
# (the stop bound is exclusive) are removed from both frames.
# NOTE(review): presumably these are the "<col>_ismissing" indicator columns
# appended by the imputation cells — confirm. The same positions are used for
# both frames, so they only refer to the same columns if the two frames share
# a column layout.
DROP_START, DROP_STOP = 771, 1284

# Pass the column *labels* to drop() instead of a DataFrame slice: this avoids
# materialising a throwaway sub-frame and does not rely on DataFrame iteration
# yielding column names.

# for df_train
df_train = df_train.drop(columns=df_train.columns[DROP_START:DROP_STOP])

# for df_test
df_test = df_test.drop(columns=df_test.columns[DROP_START:DROP_STOP])

In [6]:
# Drop one column out of every pair whose absolute correlation exceeds 0.95.
# Correlations are computed on df_train only; the same columns are then
# removed from df_test.
#
# Restrict the frame to numeric/bool columns explicitly: older pandas skipped
# other dtypes silently inside DataFrame.corr(), while pandas >= 2.0 raises
# on them.
corr_matrix = df_train.select_dtypes(include=[np.number, 'bool']).corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# pair is inspected once. The builtin `bool` replaces the `np.bool` alias,
# which was removed in NumPy 1.24 and raises AttributeError there.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
df_train.drop(to_drop, axis=1, inplace=True)
# also dropping these columns in the test df
df_test.drop(to_drop, axis=1, inplace=True)

In [7]:
# Making a column stating whether a loss happened or not:
# 0 where loss == 0, 1 for every other value.
#
# Series.where keeps the entries for which the condition holds (loss == 0)
# and writes 1 everywhere else, in a single vectorized pass that keeps the
# dtype of the 'loss' column. This replaces a Python loop that ran
# Series.replace over the whole column once per non-zero row.
loss_fact = df_train['loss'].where(df_train['loss'] == 0, 1)

df_train['loss_fact'] = loss_fact

In [8]:
# Keep only the float64 columns of each frame. For df_train the binary
# target 'loss_fact' is appended back as an extra column.
train_float_cols = df_train.select_dtypes(include=[np.float64])
train_target = df_train['loss_fact']
df_train = pd.concat([train_float_cols, train_target], axis=1)

df_test = df_test.select_dtypes(include=[np.float64])

In [9]:
# Feature frame used for the final predictions: df_test without column 'f5'.
# drop() returns a new frame, so df_test itself is left unchanged.
# NOTE(review): presumably 'f5' is not among the training features — confirm.
test_float_train = df_test.drop(columns='f5')


In [10]:
# Keep only the training rows that have no missing value in any column.
df_train = df_train[df_train.notna().all(axis=1)]

In [11]:
# Separate the target vector y ('loss_fact') from the feature matrix X
# (every remaining column of df_train).
y = df_train['loss_fact']
X = df_train.drop(columns=['loss_fact'])

## KNN

In [12]:
# Hold out 35% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=35,
    test_size=0.35,
)
# Show the shapes of both feature partitions as the cell output.
(X_train.shape, X_test.shape)

((68556, 323), (36915, 323))

In [13]:
# k-nearest-neighbours classifier using the 15 nearest neighbours.
N_NEIGHBORS = 15
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

In [14]:
# Fit the KNN model on the training partition. scikit-learn's fit() returns
# the estimator itself, so `classifier` and `knn` name the same fitted model.
classifier = knn.fit(X=X_train, y=y_train)

In [15]:
#X_train.describe()

In [16]:
#X_train.dtypes

In [17]:
# Predict labels for the held-out partition.
predictions = knn.predict(X=X_test)

In [18]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

In [19]:
# Display the hold-out accuracy stored in `accuracy` as the cell output.
accuracy 

0.9076256264391169

In [20]:
def write_to_submission_file(predicted_labels, out_file, train_num=105471,
                    target='loss', index_label="id"):
    """Save predictions as a single-column CSV submission file.

    Row ids continue after ``train_num``: the first prediction gets id
    ``train_num + 1`` and ids increase by one per row.

    Parameters
    ----------
    predicted_labels : array-like
        Predictions, one per row. Lists, NumPy arrays and pandas Series are
        accepted; only the values are used (a Series' own index is ignored).
    out_file : str, path object or file-like
        Destination handed to ``DataFrame.to_csv``.
    train_num : int, default 105471
        Id offset; written ids start at ``train_num + 1``.
    target : str, default 'loss'
        Header of the prediction column.
    index_label : str, default "id"
        Header of the id column.
    """
    # Coerce to an ndarray first: plain lists have no ``.shape``, and a
    # pandas Series would otherwise be re-aligned to the new id index by the
    # DataFrame constructor, turning the predictions into NaN.
    labels = np.asarray(predicted_labels)
    first_id = train_num + 1
    row_ids = np.arange(first_id, first_id + labels.shape[0])

    # turning predictions into a data frame and saving them as a csv file
    predicted_df = pd.DataFrame(labels, index=row_ids, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [21]:
# Preview the predicted labels for the prepared test features.
classifier.predict(X=test_float_train)

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
# Predict on the prepared test features, then write the submission CSV.
test_predictions = classifier.predict(test_float_train)
write_to_submission_file(test_predictions, out_file="knn_loan_prediction.csv")