<a href="https://colab.research.google.com/github/anshika0601/ml-learn/blob/main/KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

working on imbalanced data

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score


# Load dataset
df = pd.read_csv("creditcard.csv")
print(df.head())
print(df['Class'].value_counts(normalize=True))

# Drop rows with NaN values in the 'Class' column
df.dropna(subset=['Class'], inplace=True)

X = df.drop("Class", axis=1)
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#knn model
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(X_train, y_train)

#predict
y_pred = knn.predict(X_test)

#evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))

# Probabilities for ROC-AUC
y_proba = knn.predict_proba(X_test)[:,1]
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

from sklearn.model_selection import cross_val_score

for k in [3, 5, 7, 11, 15, 21]:
    model = KNeighborsClassifier(n_neighbors=k, weights='distance')
    scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    print(f"K={k}, Mean CV ROC-AUC={scores.mean():.4f}")

   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

working on balanced data using SMOTE

In [6]:
from imblearn.over_sampling import SMOTE
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_resampled.value_counts())


knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(X_train_resampled, y_train_resampled)

y_pred = knn.predict(X_test)


print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))

# Probabilities for ROC-AUC
y_proba = knn.predict_proba(X_test)[:,1]
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

from sklearn.model_selection import cross_val_score

for k in [3, 5, 7, 11, 15, 21]:
    model = KNeighborsClassifier(n_neighbors=k, weights='distance')
    scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    print(f"K={k}, Mean CV ROC-AUC={scores.mean():.4f}")

Before SMOTE: Class
0.0    12690
1.0       58
Name: count, dtype: int64
After SMOTE: Class
0.0    12690
1.0    12690
Name: count, dtype: int64
Confusion Matrix:
 [[3169    3]
 [   3   12]]
Classification Report:
               precision    recall  f1-score   support

         0.0     0.9991    0.9991    0.9991      3172
         1.0     0.8000    0.8000    0.8000        15

    accuracy                         0.9981      3187
   macro avg     0.8995    0.8995    0.8995      3187
weighted avg     0.9981    0.9981    0.9981      3187

ROC-AUC: 0.9328709541824296
K=3, Mean CV ROC-AUC=0.3929
K=5, Mean CV ROC-AUC=0.3679
K=7, Mean CV ROC-AUC=0.3645
K=11, Mean CV ROC-AUC=0.3639
K=15, Mean CV ROC-AUC=0.3634
K=21, Mean CV ROC-AUC=0.3627


In [1]:
!pip install imbalanced-learn


