In [1]:
# Import required packages
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import matplotlib.pylab as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

!pip install dmba

import dmba

%matplotlib inline

Colab environment detected.


In [2]:
# Load the Universal Bank dataset and name it as "bank_df"
universal_bank_df = pd.read_csv("UniversalBank.csv")
universal_bank_df.dropna(inplace=True)
universal_bank_df.head(9)
universal_bank_df = universal_bank_df.drop(columns=['ID', 'ZIP Code'])

# One-hot encode the 'Education' column
edu = pd.get_dummies(universal_bank_df['Education'], prefix='Education')
universal_bank_df = pd.concat([universal_bank_df, edu], axis=1)
universal_bank_df.drop(['Education'], axis=1, inplace=True)
print(universal_bank_df)

      Age  Experience  Income  Family  CCAvg  Mortgage  Personal Loan  \
0      25           1      49       4    1.6         0              0   
1      45          19      34       3    1.5         0              0   
2      39          15      11       1    1.0         0              0   
3      35           9     100       1    2.7         0              0   
4      35           8      45       4    1.0         0              0   
...   ...         ...     ...     ...    ...       ...            ...   
4995   29           3      40       1    1.9         0              0   
4996   30           4      15       4    0.4        85              0   
4997   63          39      24       2    0.3         0              0   
4998   65          40      49       3    0.5         0              0   
4999   28           4      83       3    0.8         0              0   

      Securities Account  CD Account  Online  CreditCard  Education_1  \
0                      1           0       0      

##Partition the data
1. Partition the data into training (75%) and validation (25%) sets.

In [3]:
# Partition the data into training (75%) and validation (25%) sets

X = universal_bank_df.drop(['Personal Loan'], axis=1)
y = universal_bank_df['Personal Loan']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=123)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)
print(X_train)

(3750, 13) (1250, 13) (3750,) (1250,)
      Age  Experience  Income  Family  CCAvg  Mortgage  Securities Account  \
2413   60          34      31       2   1.00         0                   0   
1471   52          26     180       1   1.00         0                   0   
1196   37          13      71       2   2.70        94                   0   
1509   56          26      92       2   4.50         0                   1   
4110   66          41      59       3   2.40         0                   0   
...   ...         ...     ...     ...    ...       ...                 ...   
1593   63          38      83       3   1.80         0                   0   
4060   31           6     174       2   6.70         0                   0   
1346   44          20      50       3   2.33       200                   0   
3454   47          21     132       1   0.30         0                   0   
3582   49          25      65       1   0.00         0                   0   

      CD Account  Online 

##Record to be classified
2. Consider the following customer for classification: Age = 40, Experience = 10,
Income = 84, Family = 2, CCAvg = 2, Education_1 = 0, Education_2 = 1,
Education_3 = 0, Mortgage = 0, Securities Account = 1, CD Account = 1, Online = 1,
and Credit Card = 1.

In [4]:
new_customer = pd.DataFrame({
    'Age': [40],
    'Experience': [10],
    'Income': [84],
    'Family': [2],
    'CCAvg': [2],
    'Mortgage': [0],
    'Securities Account': [1],
    'CD Account': [1],
    'Online': [1],
    'CreditCard': [1],
    'Education_1': [0],
    'Education_2': [1],
    'Education_3': [0]
})

print(new_customer)


   Age  Experience  Income  Family  CCAvg  Mortgage  Securities Account  \
0   40          10      84       2      2         0                   1   

   CD Account  Online  CreditCard  Education_1  Education_2  Education_3  
0           1       1           1            0            1            0  


##Preprocess the data


3. Standardize all the data sets using mean and standard deviations.

In [5]:
## Standardize the data using mean and standard deviations
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
new_customer = scaler.transform(new_customer)

4. Perform a k-NN classification with all predictors except ID and ZIP code using k = 1.
How would this customer be classified?

In [6]:
# Perform K-NN classification with k = 1
knn = NearestNeighbors(n_neighbors=1)
knn.fit(X_train)

distances, indices = knn.kneighbors(X_valid)
nearest_neighbors = universal_bank_df.iloc[indices[0], :]
print(nearest_neighbors["Personal Loan"])

3497    0
Name: Personal Loan, dtype: int64


Customer with ID 3497 is the nearest neighbour and they did not take a personal loan. So, the prediction is that the new customer will also not take a personal loan.


5. Now find the optimal value of k using the validation data set. What is the optimal k?

In [7]:
# Find the optimal value of k using the validation data set
results = []
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    results.append({
        'k': k,
        'accuracy': accuracy_score(y_valid, knn.predict(X_valid))
    })

# Convert results to a pandas data frame
results = pd.DataFrame(results)
print(results)

     k  accuracy
0    1    0.9640
1    2    0.9536
2    3    0.9616
3    4    0.9568
4    5    0.9624
5    6    0.9568
6    7    0.9608
7    8    0.9536
8    9    0.9576
9   10    0.9536
10  11    0.9576
11  12    0.9528
12  13    0.9536
13  14    0.9520
14  15    0.9536
15  16    0.9496
16  17    0.9520
17  18    0.9496
18  19    0.9528


K=1 gives the most accuracy and hence it is the optimal **k**.
Using K = 1 might cause overfitting of the model.


6. Print the confusion matrix for the validation data that results from using the optimal
k.

In [8]:
knn_optimal = KNeighborsClassifier(n_neighbors=1)
knn_optimal.fit(X_train, y_train)

# Make predictions on the validation data
y_pred = knn_optimal.predict(X_valid)

# Compute the confusion matrix
confusion = confusion_matrix(y_valid, y_pred)

print("Confusion Matrix:")
print(confusion)

Confusion Matrix:
[[1108   17]
 [  28   97]]


7. Classify the customer specified in Question 2 using the best k.

In [9]:
knn_best_classifier = NearestNeighbors(n_neighbors=1)
knn_best_classifier.fit(X_train)
distances, indices = knn_best_classifier.kneighbors(new_customer)
nearest_neighbors = universal_bank_df.iloc[indices[0], :]
print(nearest_neighbors["Personal Loan"])


2004    0
Name: Personal Loan, dtype: int64


Customer with ID 2004 is the nearest neighbour to the new_customer and they did not take a personal loan. So, the prediction is that the new customer will also not take a personal loan.

8. Now repartition the data into three parts: training, validation, and test sets (50%,
30%, and 20%).

In [10]:
# Repartition the data into training (50%), validation (30%), and test (20%) sets
X_train2, X_temp, y_train2, y_temp = train_test_split(X, y, test_size=0.5, random_state=123)
X_valid2, X_test2, y_valid2, y_test2 = train_test_split(X_temp, y_temp, test_size=0.4, random_state=123)
print(X_train2.shape, X_valid2.shape,X_test2.shape)



(2500, 13) (1500, 13) (1000, 13)


In [11]:
# Standardize the data for the new partitions
scaler2 = StandardScaler()
scaler2.fit(X_train2)
X_train2 = scaler2.transform(X_train2)
X_valid2 = scaler2.transform(X_valid2)
X_test2 = scaler2.transform(X_test2)



9. Apply the k-NN method with the optimal k chosen above.

In [12]:
# Re-run K-NN with the best k on the new partitions
knn_train2_classifier = KNeighborsClassifier(n_neighbors=1)
knn_train2_classifier.fit(X_train2, y_train2)

knn_valid2_classifier = KNeighborsClassifier(n_neighbors=1)
knn_valid2_classifier.fit(X_valid2, y_valid2)

knn_test2_classifier = KNeighborsClassifier(n_neighbors=1)
knn_test2_classifier.fit(X_test2, y_test2)

knn_train2_prediction = knn_train2_classifier.predict(X_train2)
knn_valid2_prediction = knn_valid2_classifier.predict(X_valid2)
knn_test2_prediction = knn_test2_classifier.predict(X_test2)


10. Compare the confusion matrix of the test set with that of the training and validation
sets. Comment on the differences and their reason.


In [13]:
# Calculate confusion matrices for different sets
confusion_matrix_train2 = confusion_matrix(y_train2, knn_train2_prediction)
confusion_matrix_valid2 = confusion_matrix(y_valid2, knn_valid2_prediction)
confusion_matrix_test2 = confusion_matrix(y_test2, knn_test2_prediction)

print(confusion_matrix_train2)
print(confusion_matrix_valid2)
print(confusion_matrix_test2)

[[2258    0]
 [   0  242]]
[[1353    0]
 [   0  147]]
[[909   0]
 [  0  91]]


In [14]:
report_train2 = classification_report(y_train2, knn_train2_prediction)
print(report_train2)

report_valid2 = classification_report(y_valid2, knn_valid2_prediction)
print(report_valid2)

report_test2 = classification_report(y_test2, knn_test2_prediction)
print(report_test2)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2258
           1       1.00      1.00      1.00       242

    accuracy                           1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1353
           1       1.00      1.00      1.00       147

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       909
           1       1.00      1.00      1.00        91

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



Since all the accuracies are 1 we can assume that the model is overfitting. So, we can try computing the the accuracies with the next best k value which is 5
