In [3]:
# ===============================
# 1. Import Required Libraries
# ===============================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV


# ===============================
# 2. Load Dataset
# ===============================
# CHANGE filename if needed
df = pd.read_csv("KNNAlgorithmDataset.csv")

print("Dataset Loaded Successfully")
print(df.head())
print(df.info())


# ===============================
# 3. Separate Features & Target
# ===============================
X = df.drop("outcome", axis=1)
y = df["outcome"]


# ===============================
# 4. Handle Missing Values
# ===============================
X = X.fillna(X.mean())


# ===============================
# 5. Train-Test Split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)


# ===============================
# 6. Feature Scaling
# ===============================
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# ===============================
# 7. Find Best K (Optional but Recommended)
# ===============================
param_grid = {'n_neighbors': range(1, 21)}

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=5,
    scoring='accuracy'
)

grid.fit(X_train, y_train)

best_k = grid.best_params_['n_neighbors']
print("Best K value:", best_k)


# ===============================
# 8. Train KNN Classifier
# ===============================
knn = KNeighborsClassifier(
    n_neighbors=best_k,
    metric='minkowski',
    p=2
)

knn.fit(X_train, y_train)


# ===============================
# 9. Predictions
# ===============================
y_pred = knn.predict(X_test)


# ===============================
# 10. Model Evaluation
# ===============================
print("\nModel Accuracy:", accuracy_score(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Dataset Loaded Successfully
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_w

KeyError: "['outcome'] not found in axis"

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# ===============================
# Load Dataset
# ===============================
df = pd.read_csv("KNNAlgorithmDataset.csv")

# Normalize column names
df.columns = df.columns.str.strip().str.lower()

print("Columns:", df.columns)


# ===============================
# Separate Features & Target
# ===============================
target_column = "outcome"   # now guaranteed to work

X = df.drop(target_column, axis=1)
y = df[target_column]


# ===============================
# Handle Missing Values
# ===============================
X = X.fillna(X.mean(numeric_only=True))


# ===============================
# Train-Test Split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)


# ===============================
# Feature Scaling
# ===============================
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# ===============================
# Find Best K
# ===============================
param_grid = {'n_neighbors': range(1, 21)}

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=5,
    scoring='accuracy'
)

grid.fit(X_train, y_train)
best_k = grid.best_params_['n_neighbors']

print("Best K:", best_k)


# ===============================
# Train KNN
# ===============================
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)


# ===============================
# Evaluation
# ===============================
y_pred = knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Columns: Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'unnamed: 32'],
      dtype='object')


KeyError: "['outcome'] not found in axis"

In [5]:
# ============================================================
# KNN CLASSIFICATION USING SCIKIT-LEARN (FULL WORKING CODE)
# ============================================================

# 1. Import required libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# ============================================================
# 2. Load Dataset
# ============================================================
# Replace with your actual file name
df = pd.read_csv("KNNAlgorithemDataset.csv")

print("Dataset loaded successfully\n")
print(df.head())


# ============================================================
# 3. FIX COLUMN NAME ISSUE (IMPORTANT)
# ============================================================
# Convert column names to lowercase and remove spaces
df.columns = df.columns.str.strip().str.lower()

print("\nColumn names after fixing:")
print(df.columns)


# ============================================================
# 4. Separate Features and Target
# ============================================================
# Target column
target_column = "outcome"

X = df.drop(columns=[target_column])
y = df[target_column]


# ============================================================
# 5. Handle Missing Values (Numeric Columns)
# ============================================================
X = X.fillna(X.mean(numeric_only=True))


# ============================================================
# 6. Train-Test Split
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y
)


# ============================================================
# 7. Feature Scaling (MANDATORY FOR KNN)
# ============================================================
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# ============================================================
# 8. Find Best K using Cross-Validation
# ============================================================
param_grid = {'n_neighbors': range(1, 21)}

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=5,
    scoring='accuracy'
)

grid.fit(X_train, y_train)

best_k = grid.best_params_['n_neighbors']
print("\nBest K value found:", best_k)


# ============================================================
# 9. Train Final KNN Model
# ============================================================
knn = KNeighborsClassifier(
    n_neighbors=best_k,
    metric='minkowski',
    p=2   # Euclidean distance
)

knn.fit(X_train, y_train)


# ============================================================
# 10. Predictions
# ============================================================
y_pred = knn.predict(X_test)


# ============================================================
# 11. Model Evaluation
# ============================================================
print("\nModel Accuracy:", accuracy_score(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


FileNotFoundError: [Errno 2] No such file or directory: 'KNNAlgorithemDataset.csv'