# Assignment 1 — Loan Default Prediction using KNN



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
# Use the 'seaborn-v0_8-whitegrid' style sheet for every figure drawn after this point.
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Read the loan dataset from a CSV file located relative to the current
# working directory. Later cells read the target from the 'loan' column.
df = pd.read_csv('loan_default_dataset.csv')
# Bare expression: as the last statement of a notebook cell this renders the frame.
df

In [None]:
# Quick look at the data: dimensions, target balance, and descriptive
# statistics for the numeric columns.
summary_columns = [
    'Age',
    'Annual Income (lakhs)',
    'Credit Score',
    'Loan Amount (lakhs)',
    'Loan Term (years)',
]
loan_counts = df['loan'].value_counts()
numeric_summary = df[summary_columns].describe()

print('Shape:', df.shape)
print('Class distribution (loan):', loan_counts)
print('Numeric summary:')
print(numeric_summary)

In [None]:
# Tune a KNN classifier with a grid search and report cross-validated metrics.
# Everything except the target column is offered to the pipeline; the
# ColumnTransformer below only picks up the columns it names.
X = df.drop(columns=['loan'])
y = df['loan']

numeric_features = ['Age','Annual Income (lakhs)','Credit Score','Loan Amount (lakhs)','Loan Term (years)']
categorical_features = ['Employment Type']

# KNN is distance based, so numeric columns are standardised and the
# categorical column is one-hot encoded. Because the preprocessing lives
# inside the pipeline it is re-fitted on the training part of every fold.
preprocess = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

knn = KNeighborsClassifier()
pipe_knn = Pipeline([('prep', preprocess), ('model', knn)])

# Shuffled, seeded stratified folds: reproducible and class-balanced per fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
param_grid = {
    'model__n_neighbors': [1,3,5],
    'model__weights': ['uniform','distance'],
    'model__p': [1,2]  # Minkowski power: 1 = Manhattan, 2 = Euclidean
}
search = GridSearchCV(pipe_knn, param_grid, cv=cv, scoring='accuracy')
search.fit(X, y)
print('Best params:', search.best_params_)
print('Best CV accuracy:', search.best_score_)

best_knn = search.best_estimator_
# NOTE: these out-of-fold predictions reuse the same folds that selected the
# hyperparameters, so the metrics below are somewhat optimistic. Wrap the
# search in an outer CV loop if an unbiased estimate is required.
pred = cross_val_predict(best_knn, X, y, cv=cv)
# The report and the matrix are multi-line; print them under their labels so
# the first row is not shifted out of alignment by the label text.
print('Classification report (CV):')
print(classification_report(y, pred))
print('Confusion matrix (CV):')
print(confusion_matrix(y, pred))

In [None]:
# Rank applicants by predicted default probability.
#
# Scoring the training rows with a model fitted on those very rows is
# degenerate for KNN: each row is its own nearest neighbour at distance 0, so
# the "probability" largely echoes the row's own label (exactly so for k=1 or
# distance weighting). Out-of-fold probabilities instead score every row with
# a model that never saw it.
best_knn.fit(X, y)  # keep the final model fitted on all rows for later cells
oof_proba = cross_val_predict(best_knn, X, y, cv=cv, method='predict_proba')
# Column 1 is the second class in sorted label order.
# NOTE(review): assumes that class means "default" (e.g. 0/1 labels) — confirm.
proba = oof_proba[:, 1]
risk_df = df.copy()
risk_df['pred_default_proba'] = proba
# Bare expression so the notebook renders the table, riskiest applicants first.
risk_df.sort_values('pred_default_proba', ascending=False)

In [None]:
# Permutation importance of each input column for the fitted KNN pipeline.
r = permutation_importance(best_knn, X, y, n_repeats=30, random_state=42, scoring='accuracy')
# The permutation is applied to the columns of the X passed in (the raw frame,
# before the pipeline's one-hot encoding), so there is exactly one importance
# per column of X. Labelling with the one-hot-expanded names would give a
# length mismatch and make the DataFrame constructor raise.
feature_names = list(X.columns)
importances = (pd.DataFrame({'feature': feature_names,
                             'importance_mean': r.importances_mean,
                             'importance_std': r.importances_std})
               .sort_values('importance_mean', ascending=False))
# Bare expression so the notebook renders the table.
importances

In [None]:
# Credit Score vs Income decision surface
# All other inputs are pinned (numeric columns at their median, employment
# type at 'Salaried') so the plot isolates the effect of the two varied axes.
fixed = {
    'Age': df['Age'].median(),
    'Loan Amount (lakhs)': df['Loan Amount (lakhs)'].median(),
    'Loan Term (years)': df['Loan Term (years)'].median(),
    'Employment Type': 'Salaried'
}
cs_grid = np.linspace(df['Credit Score'].min(), df['Credit Score'].max(), 80)
inc_grid = np.linspace(df['Annual Income (lakhs)'].min(), df['Annual Income (lakhs)'].max(), 80)
CS, INC = np.meshgrid(cs_grid, inc_grid)

# Build the 80x80 grid of query rows in one shot: the flattened meshes supply
# the varying columns and the scalar entries are broadcast to every row.
# ravel() walks the mesh in row-major order, which is the order reshape()
# below expects.
Xg = pd.DataFrame({
    'Age': fixed['Age'],
    'Annual Income (lakhs)': INC.ravel(),
    'Credit Score': CS.ravel(),
    'Loan Amount (lakhs)': fixed['Loan Amount (lakhs)'],
    'Loan Term (years)': fixed['Loan Term (years)'],
    'Employment Type': fixed['Employment Type'],
})
# Probability of the second class, folded back onto the mesh shape.
Z = best_knn.predict_proba(Xg)[:,1].reshape(CS.shape)

plt.figure(figsize=(7,5))
cp = plt.contourf(CS, INC, Z, levels=15, cmap='RdYlBu_r')
plt.colorbar(cp, label='Predicted default probability')
plt.xlabel('Credit Score')
plt.ylabel('Annual Income (lakhs)')
plt.title('Model sensitivity: Credit Score vs Annual Income')
plt.show()

In [None]:
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler

# Part 1: average pairwise distance in the pipeline's transformed feature
# space versus the untouched numeric columns.
Xt = best_knn.named_steps['prep'].transform(X)
raw_num = X[['Age','Annual Income (lakhs)','Credit Score','Loan Amount (lakhs)','Loan Term (years)']].values

scaled_matrix = pairwise_distances(Xt, Xt)
raw_matrix = pairwise_distances(raw_num, raw_num)
scaled_dist = scaled_matrix.mean()
raw_dist = raw_matrix.mean()
print('Mean pairwise distance (scaled):', scaled_dist)
print('Mean pairwise distance (raw numeric):', raw_dist)

# Part 2: standardise the numeric columns, then inflate a single column by a
# factor of 50 to show how one large-scale feature dominates the distances.
num_scaled = StandardScaler().fit_transform(raw_num)
loan_idx = 3  # position of 'Loan Amount (lakhs)' in the column list above
amp = num_scaled.copy()
amp[:, loan_idx] = amp[:, loan_idx] * 50

properly_scaled_matrix = pairwise_distances(num_scaled)
amplified_matrix = pairwise_distances(amp)
print('Mean pairwise distance (properly scaled):', properly_scaled_matrix.mean())
print('Mean pairwise distance (loan amount x50):', amplified_matrix.mean())

In [None]:
# Compare the tuned KNN pipeline against a shallow decision tree baseline.
# The tree reuses the same preprocessing so both models see identical inputs.
dt = DecisionTreeClassifier(random_state=42, max_depth=3)
pipe_dt = Pipeline([('prep', preprocess), ('model', dt)])

# Score both models on the notebook's shared splitter (`cv`: shuffled, seeded,
# stratified) rather than cv=5, which would build different, unshuffled folds
# and make these numbers incomparable with the CV accuracy reported earlier.
acc_knn = cross_val_score(best_knn, X, y, cv=cv, scoring='accuracy').mean()
acc_dt = cross_val_score(pipe_dt, X, y, cv=cv, scoring='accuracy').mean()
print(f'KNN CV accuracy: {acc_knn:.3f}')
print(f'Decision Tree CV accuracy: {acc_dt:.3f}')