In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
#Loading Data and Encoding categorical data

df = pd.read_csv("../data/bank.csv")

# List of categorical columns to encode
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

# Label Encoding each categorical column
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])
df.head()

In [None]:
# Dropping the feature 'anamoly'


features = df.drop(columns=['anamoly'])
# Scaling the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, df['anamoly'], test_size=0.3, random_state=42)


In [5]:
# Initialize KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# %% Train the KNN model
knn.fit(X_train, y_train)

# Predict on the test set
y_pred = knn.predict(X_test)

In [6]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[10732   236]
 [ 1133   256]]

Classification Report:
              precision    recall  f1-score   support

          no       0.90      0.98      0.94     10968
         yes       0.52      0.18      0.27      1389

    accuracy                           0.89     12357
   macro avg       0.71      0.58      0.61     12357
weighted avg       0.86      0.89      0.86     12357



In [None]:
# Save the predictions in the original DataFrame
df['predicted_anamoly_knn'] = knn.predict(features_scaled)

# Map 1 as 'yes' (anomaly) and 0 as 'no' (normal)
df['predicted_anamoly_knn'] = df['predicted_anamoly_knn'].map({1: 'yes', 0: 'no'})

In [8]:
anomaly_distribution = df.groupby(['anamoly', 'predicted_anamoly_knn']).size().reset_index(name='count')

# Print the results
print(anomaly_distribution)

Empty DataFrame
Columns: [anamoly, predicted_anamoly_knn, count]
Index: []
