In [None]:
# Fetch the project repository; the CSV loaded in the next cell is read from the Project_ML checkout.
!git clone https://github.com/bankira-rahul-is-iitian/Project_ML.git

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Read the cleaned dataset from the cloned repository and preview its first rows.
df = pd.read_csv(filepath_or_buffer="/content/Project_ML/cleaned_last.csv")

print(df.head(n=5))

In [None]:
# Summary of the raw frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Row count per IS_FRAUD value in the raw frame (before any resampling).
print(df["IS_FRAUD"].value_counts())

In [None]:
# NOTE(review): prints the same IS_FRAUD counts as the previous cell; candidate for removal.
print(df['IS_FRAUD'].value_counts())


In [None]:
# Partition the rows by label: IS_FRAUD == 0 goes to `majority`,
# IS_FRAUD == 1 goes to `minority`.
majority, minority = (df.loc[df['IS_FRAUD'] == label] for label in (0, 1))

print("Majority class:", majority.shape[0])
print("Minority class:", minority.shape[0])


In [None]:
# Random oversampling: draw minority rows with replacement until there are as
# many as majority rows, stack both groups, then shuffle the combined frame.
# NOTE(review): this balances the whole frame; any evaluation that splits
# df_oversampled into train/test afterwards can place copies of the same
# minority row on both sides of the split.
n_majority, n_minority = len(majority), len(minority)
minority_oversampled = minority.sample(n=n_majority, replace=True, random_state=42)
df_oversampled = (
    pd.concat([majority, minority_oversampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(df_oversampled['IS_FRAUD'].value_counts())


In [None]:
# Summary of the oversampled frame: columns, dtypes and non-null counts.
df_oversampled.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the IS_FRAUD label counts in the oversampled frame.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x='IS_FRAUD', data=df_oversampled, palette='Set2', ax=ax)
ax.set_title("Class Distribution After Oversampling")
ax.set_xlabel("IS_FRAUD (0=No, 1=Yes)")
ax.set_ylabel("Count")
plt.show()


In [None]:
# `sk` is a second name for the oversampled frame (same object, not a copy).
sk=df_oversampled

In [None]:
class KNNClassifier:
    """k-NN classifier with a selectable distance metric and vote weighting.

    ``fit`` only stores the training data (converted to NumPy arrays); all
    distance work happens at query time.  Unweighted voting relies on
    ``np.bincount``, so class labels must be non-negative integers.

    Parameters
    ----------
    distance_metric : str
        ``'euclidean'`` or ``'manhattan'``.  Any other value raises
        ``ValueError`` the first time a distance is computed.
    weighted : bool
        If True, each neighbour votes with weight ``1 / (distance + 1e-5)``;
        otherwise every neighbour casts one vote.
    """

    def __init__(self, distance_metric='euclidean', weighted=False):
        self.X_train = None
        self.y_train = None
        self.distance_metric = distance_metric
        self.weighted = weighted

    def fit(self, X_train, y_train):
        """Store the training data.

        Inputs are converted with ``np.asarray`` so that positional indexing
        in ``predict`` behaves the same for lists, arrays and pandas objects
        (a pandas Series would otherwise be indexed by label, not position).

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` have different numbers of rows.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError("X_train and y_train must have the same number of rows")
        self.X_train = X_train
        self.y_train = y_train

    def _euclidean_distance(self, x1, x2):
        """Compute Euclidean distance between two points."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def _manhattan_distance(self, x1, x2):
        """Compute Manhattan distance between two points."""
        return np.sum(np.abs(x1 - x2))

    def _compute_distance(self, x1, x2):
        """Compute the distance between two points with the selected metric.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not a supported name.
        """
        if self.distance_metric == 'euclidean':
            return self._euclidean_distance(x1, x2)
        elif self.distance_metric == 'manhattan':
            return self._manhattan_distance(x1, x2)
        else:
            raise ValueError("Unknown distance metric")

    def _check_query(self, k):
        """Validate that the model is fitted, non-empty and ``k`` is usable."""
        if self.X_train is None or self.y_train is None:
            raise ValueError("KNNClassifier is not fitted; call fit() first")
        if len(self.X_train) == 0:
            raise ValueError("Cannot search an empty training set")
        if k < 1:
            raise ValueError("k must be at least 1")

    def _distances_to_train(self, x):
        """Return the distance from query point ``x`` to every training row.

        Vectorised equivalent of calling ``_compute_distance`` once per
        training row; each row is flattened so 1-D and N-D features both work.
        """
        train = np.asarray(self.X_train)
        diff = (train - x).reshape(train.shape[0], -1)
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=1))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(diff), axis=1)
        raise ValueError("Unknown distance metric")

    def _vote(self, neighbor_labels, neighbor_distances):
        """Turn the k nearest labels/distances into one predicted label."""
        if self.weighted:
            # Distance-weighted voting; epsilon avoids division by zero when a
            # query coincides with a training point.
            epsilon = 1e-5
            weights = 1 / (neighbor_distances + epsilon)
            votes = {}
            for w, lbl in zip(weights, neighbor_labels):
                lbl = int(lbl)
                votes[lbl] = votes.get(lbl, 0) + w
            return max(votes, key=votes.get)

        # Majority voting
        counts = np.bincount(neighbor_labels.astype(int))
        if len(np.unique(neighbor_labels)) == 2 and counts[0] == counts[1]:
            # Binary tie: choose the class whose neighbours are closer on
            # average (class 1 wins if the mean distances are equal).
            mean_dist_0 = np.mean(neighbor_distances[neighbor_labels == 0])
            mean_dist_1 = np.mean(neighbor_distances[neighbor_labels == 1])
            return 0 if mean_dist_0 < mean_dist_1 else 1
        return int(np.argmax(counts))

    def predict(self, X_test, k=3):
        """Predict class labels for test samples.

        Parameters
        ----------
        X_test : array-like
            One query point per row.
        k : int
            Number of neighbours to consult.  If ``k`` exceeds the number of
            training rows, all training rows are used.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``X_test``.

        Raises
        ------
        ValueError
            If the classifier is not fitted, the training set is empty,
            ``k < 1`` or the distance metric is unknown.
        """
        self._check_query(k)
        labels = np.asarray(self.y_train)
        predictions = []
        for x in np.asarray(X_test):
            distances = self._distances_to_train(x)
            neighbors_idx = np.argsort(distances)[:k]
            predictions.append(
                self._vote(labels[neighbors_idx], distances[neighbors_idx])
            )
        return np.array(predictions)

    def get_neighbors(self, x, k=3):
        """Get the k nearest training neighbours of a single point.

        Returns
        -------
        tuple of numpy.ndarray
            ``(indices, distances)`` of the nearest training rows, ordered
            from closest to farthest.

        Raises
        ------
        ValueError
            Under the same conditions as ``predict``.
        """
        self._check_query(k)
        distances = self._distances_to_train(x)
        neighbors_idx = np.argsort(distances)[:k]
        return neighbors_idx, distances[neighbors_idx]

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

target_col = 'IS_FRAUD'

# Split the original, un-resampled rows first.  Splitting the oversampled
# frame would let exact copies of one minority row fall on both sides of the
# split, so the test metrics would partly measure recall of training rows.
X = df.drop(columns=[target_col])
y = df[target_col].values

X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size=0.3, random_state=42, stratify=y
)

# Balance the training portion only: keep every training row and add
# minority-class (IS_FRAUD == 1) rows drawn with replacement until both
# classes have the same count.  The test set keeps the original class ratio.
resample_rng = np.random.RandomState(42)
minority_rows = np.flatnonzero(y_train == 1)
majority_rows = np.flatnonzero(y_train == 0)
n_extra = len(majority_rows) - len(minority_rows)
if n_extra > 0 and len(minority_rows) > 0:
    extra_rows = resample_rng.choice(minority_rows, size=n_extra, replace=True)
    shuffled = resample_rng.permutation(len(y_train) + n_extra)
    X_train = np.concatenate([X_train, X_train[extra_rows]])[shuffled]
    y_train = np.concatenate([y_train, y_train[extra_rows]])[shuffled]

# Fit the scaler on training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

knn = KNNClassifier(distance_metric='euclidean', weighted=False)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test, k=5)

#Evaluate
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("KNN Classifier Results:")
print(f"Accuracy:  {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall:    {rec:.3f}")
print(f"F1 Score:  {f1:.3f}")


In [None]:
# Shape of the label array `y` built in the previous cell.
y.shape

K-FOLD Validation

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
import matplotlib.pyplot as plt

def find_optimal_k(X_train, y_train, k_range=range(1, 21), cv_folds=5):
    """Pick the k with the best mean F1 under stratified k-fold cross-validation.

    Every candidate k is scored on the same folds with an unweighted,
    Euclidean ``KNNClassifier``; a results table is printed as it goes.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Training features and labels; both are indexed with integer arrays.
    k_range : iterable of int
        Candidate neighbour counts.
    cv_folds : int
        Number of stratified folds.

    Returns
    -------
    tuple
        ``(best_k, cv_scores, k_range)`` where ``cv_scores[i]`` is the mean F1
        of the i-th candidate in ``k_range``.  On ties the earliest candidate
        wins; ``best_k`` is always a member of ``k_range``.

    Raises
    ------
    ValueError
        If ``k_range`` is empty.

    Notes
    -----
    If ``X_train`` contains duplicated rows (for example from oversampling),
    identical samples can appear in both the training and validation part of
    a fold, which makes the scores optimistic.
    """
    k_values = list(k_range)
    if not k_values:
        raise ValueError("k_range must contain at least one value")

    # The folds depend only on the data and the fixed seed, so build them once
    # instead of re-splitting for every candidate k.
    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    folds = list(skf.split(X_train, y_train))

    cv_scores = []
    best_score = -np.inf
    best_k = k_values[0]

    print("Performing Cross-Validation for Optimal k...")
    print("k\tMean F1-Score\tStd Dev")
    print("-" * 30)

    for k in k_values:
        knn_cv = KNNClassifier(distance_metric='euclidean', weighted=False)
        fold_scores = []

        for train_idx, val_idx in folds:
            X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
            y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]
            knn_cv.fit(X_fold_train, y_fold_train)
            y_fold_pred = knn_cv.predict(X_fold_val, k=k)
            fold_scores.append(f1_score(y_fold_val, y_fold_pred))

        mean_score = np.mean(fold_scores)
        std_score = np.std(fold_scores)
        cv_scores.append(mean_score)

        print(f"{k}\t{mean_score:.4f}\t\t{std_score:.4f}")
        # Strict '>' keeps the earliest k when several candidates tie.
        if mean_score > best_score:
            best_score = mean_score
            best_k = k

    return best_k, cv_scores, k_range
best_k, cv_scores, k_range = find_optimal_k(X_train, y_train, k_range=range(1, 26), cv_folds=5)

# cv_scores is ordered like k_range, so look the score up by best_k's position
# rather than assuming the range starts at 1 with step 1.
best_cv_f1 = cv_scores[list(k_range).index(best_k)]
print(f"\nOptimal k: {best_k} with F1-Score: {best_cv_f1:.4f}")

In [None]:
# Mean cross-validated F1 for each candidate k, with the selected k marked.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, cv_scores, marker='o', linestyle='-', color='b', label='CV F1-Score')
ax.axvline(x=best_k, color='r', linestyle='--', alpha=0.7, label=f'Optimal k = {best_k}')
ax.set_xlabel('k Value')
ax.set_ylabel('Cross-Validation F1-Score')
ax.set_title('Finding Optimal k for KNN Classifier')
ax.grid(True, alpha=0.3)
ax.legend()
plt.show()

In [None]:
def compare_knn_configurations(X_train, y_train, X_test, y_test, k_values=(3, 5, 7, 9)):
    """Score every metric/weighting combination of KNNClassifier for several k.

    Four configurations (Euclidean or Manhattan distance, majority or
    distance-weighted voting) are each evaluated at every value in
    ``k_values``; one table row is printed per (configuration, k) pair.

    Parameters
    ----------
    X_train, y_train : array-like
        Data the classifiers are fitted on.
    X_test, y_test : array-like
        Data the metrics are computed on.
    k_values : iterable of int
        Neighbour counts to evaluate (immutable default).

    Returns
    -------
    pandas.DataFrame
        Columns ``config``, ``k``, ``accuracy``, ``precision``, ``recall``
        and ``f1_score``, one row per evaluated pair.

    Notes
    -----
    NOTE(review): the scores are computed on the data passed as ``X_test``;
    if a configuration is then chosen from this table, those same scores are
    an optimistic estimate of its performance.
    """
    configurations = [
        {'distance_metric': 'euclidean', 'weighted': False, 'name': 'Euclidean + Majority'},
        {'distance_metric': 'euclidean', 'weighted': True, 'name': 'Euclidean + Weighted'},
        {'distance_metric': 'manhattan', 'weighted': False, 'name': 'Manhattan + Majority'},
        {'distance_metric': 'manhattan', 'weighted': True, 'name': 'Manhattan + Weighted'}
    ]

    results = []

    print("Comparing KNN Configurations...")
    print("Configuration\t\t\tk\tAccuracy\tPrecision\tRecall\tF1-Score")
    print("-" * 80)

    for config in configurations:
        # k is a predict-time argument, so one fitted classifier per
        # configuration serves every k.
        knn = KNNClassifier(
            distance_metric=config['distance_metric'],
            weighted=config['weighted']
        )
        knn.fit(X_train, y_train)

        for k in k_values:
            y_pred = knn.predict(X_test, k=k)

            # Calculate metrics
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            results.append({
                'config': config['name'],
                'k': k,
                'accuracy': acc,
                'precision': prec,
                'recall': rec,
                'f1_score': f1
            })

            print(f"{config['name']:25} {k}\t{acc:.4f}\t\t{prec:.4f}\t\t{rec:.4f}\t{f1:.4f}")

    return pd.DataFrame(results)

# Evaluate all metric/weighting combinations on the current train/test split.
results_df = compare_knn_configurations(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Row of the comparison table with the highest F1 score (first one on ties).
best_overall = results_df.loc[results_df['f1_score'].idxmax()]

# The heading emoji was stored mis-encoded ("üèÜ"); restored to the trophy.
print("\n🏆 BEST OVERALL CONFIGURATION:")
print(f"Configuration: {best_overall['config']}")
print(f"k value: {best_overall['k']}")
print(f"F1-Score: {best_overall['f1_score']:.4f}")
print(f"Accuracy: {best_overall['accuracy']:.4f}")
print(f"Precision: {best_overall['precision']:.4f}")
print(f"Recall: {best_overall['recall']:.4f}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print("\nFINAL EVALUATION WITH OPTIMAL PARAMETERS:")
# NOTE(review): best_k was selected by find_optimal_k with weighted=False,
# while this final model votes with weighted=True.
final_knn = KNNClassifier(distance_metric='euclidean', weighted=True)
final_knn.fit(X_train, y_train)
y_pred_final = final_knn.predict(X_test, k=best_k)

print(f"Using k={best_k}, Euclidean distance with weighted voting")
print("\n Classification Report:")
print(classification_report(y_test, y_pred_final))

print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred_final)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Final KNN Model')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

final_accuracy = accuracy_score(y_test, y_pred_final)
final_precision = precision_score(y_test, y_pred_final)
final_recall = recall_score(y_test, y_pred_final)
final_f1 = f1_score(y_test, y_pred_final)

# A single backslash gives a real newline; the doubled one printed "\n" literally.
print("\n Final Model Performance:")
print(f"Accuracy:  {final_accuracy:.3f}")
print(f"Precision: {final_precision:.3f}")
print(f"Recall:    {final_recall:.3f}")
print(f"F1-Score:  {final_f1:.3f}")

BAGGING

In [None]:
class KNNBaggingClassifier:
    """Bagging ensemble of ``KNNClassifier`` models trained on bootstrap samples.

    Each estimator is fitted on rows drawn with replacement from the training
    data; predictions are combined by majority vote across estimators.

    Parameters
    ----------
    n_estimators : int
        Number of KNN models to train.
    k : int
        Neighbour count passed to every estimator at predict time.
    distance_metric : str
        Distance metric forwarded to each ``KNNClassifier``.
    weighted : bool
        Vote-weighting flag forwarded to each ``KNNClassifier``.
    max_samples : float
        Bootstrap size as a fraction of the training rows.
    random_state : int or None
        Seed for the bootstrap draws.
    """

    def __init__(self, n_estimators=10, k=5, distance_metric='euclidean', weighted=False,
                 max_samples=1.0, random_state=42):
        self.n_estimators = n_estimators
        self.k = k
        self.distance_metric = distance_metric
        self.weighted = weighted
        self.max_samples = max_samples
        self.random_state = random_state
        self.estimators_ = []       # fitted KNNClassifier instances
        self.sample_indices_ = []   # bootstrap row indices used per estimator

    def fit(self, X, y):
        """Train ``n_estimators`` KNN models, each on its own bootstrap sample.

        Raises
        ------
        ValueError
            If ``X`` has no rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set")

        # A private RandomState draws the same indices as seeding the global
        # generator would, without changing NumPy's global random state.
        rng = np.random.RandomState(self.random_state)
        # Never let a small max_samples round down to an empty bootstrap.
        n_bootstrap = max(1, int(n_samples * self.max_samples))

        self.estimators_ = []
        self.sample_indices_ = []

        print(f"Training {self.n_estimators} KNN estimators with bagging...")

        for i in range(self.n_estimators):
            bootstrap_indices = rng.choice(n_samples, size=n_bootstrap, replace=True)
            knn = KNNClassifier(distance_metric=self.distance_metric, weighted=self.weighted)
            knn.fit(X[bootstrap_indices], y[bootstrap_indices])

            self.estimators_.append(knn)
            self.sample_indices_.append(bootstrap_indices)

            if (i + 1) % 5 == 0 or (i + 1) == self.n_estimators:
                print(f"  Trained estimator {i + 1}/{self.n_estimators}")

    def _collect_votes(self, X):
        """Return an (n_fitted_estimators, n_samples) array of predictions."""
        if not self.estimators_:
            raise ValueError("KNNBaggingClassifier is not fitted; call fit() first")
        return np.array([estimator.predict(X, k=self.k) for estimator in self.estimators_])

    def predict(self, X):
        """Make predictions using majority voting from all estimators.

        Ties go to the smaller class label (``np.bincount(...).argmax()``).

        Raises
        ------
        ValueError
            If the ensemble has not been fitted.
        """
        all_predictions = self._collect_votes(X)
        return np.array([
            np.bincount(sample_votes.astype(int)).argmax()
            for sample_votes in all_predictions.T
        ])

    def predict_proba(self, X):
        """Predict class probabilities as the fraction of estimators voting 0 / 1.

        The fraction is taken over the estimators actually fitted, so each row
        sums to 1 for binary labels even if ``n_estimators`` was changed after
        ``fit``.

        Returns
        -------
        numpy.ndarray
            Shape ``(n_samples, 2)``: column 0 is the share of votes for
            class 0, column 1 the share for class 1.

        Raises
        ------
        ValueError
            If the ensemble has not been fitted.
        """
        all_predictions = self._collect_votes(X)
        return np.column_stack([
            np.mean(all_predictions == 0, axis=0),
            np.mean(all_predictions == 1, axis=0),
        ])

In [None]:
print("Testing Bagging KNN Classifier...")

# Ensemble settings for this run, kept together so they are easy to tweak.
bagging_settings = dict(
    n_estimators=20,
    k=5,
    distance_metric='euclidean',
    weighted=False,
    max_samples=0.8,
    random_state=42,
)
bagging_knn = KNNBaggingClassifier(**bagging_settings)
bagging_knn.fit(X_train, y_train)

# Predict on the held-out split and score the ensemble.
y_pred_bagging = bagging_knn.predict(X_test)
acc_bagging, prec_bagging, rec_bagging, f1_bagging = (
    metric(y_test, y_pred_bagging)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nBagging KNN Results:")
print(f"Accuracy:  {acc_bagging:.3f}")
print(f"Precision: {prec_bagging:.3f}")
print(f"Recall:    {rec_bagging:.3f}")
print(f"F1-Score:  {f1_bagging:.3f}")

In [None]:
# Compare with single KNN
print(" Comparison: Single KNN vs Bagging KNN")

# Single KNN (for comparison): same metric, voting and k as the ensemble members.
single_knn = KNNClassifier(distance_metric='euclidean', weighted=False)
single_knn.fit(X_train, y_train)
y_pred_single = single_knn.predict(X_test, k=5)

acc_single = accuracy_score(y_test, y_pred_single)
prec_single = precision_score(y_test, y_pred_single)
rec_single = recall_score(y_test, y_pred_single)
f1_single = f1_score(y_test, y_pred_single)

print("\nSingle KNN:")
print(f"Accuracy:  {acc_single:.3f}")
print(f"Precision: {prec_single:.3f}")
print(f"Recall:    {rec_single:.3f}")
print(f"F1-Score:  {f1_single:.3f}")

# The bagging metrics come from the previous cell.
print("\nBagging KNN:")
print(f"Accuracy:  {acc_bagging:.3f}")
print(f"Precision: {prec_bagging:.3f}")
print(f"Recall:    {rec_bagging:.3f}")
print(f"F1-Score:  {f1_bagging:.3f}")

# Relative F1 change; undefined when the single-model F1 is 0, so guard the
# division instead of raising or printing inf/nan.
if f1_single > 0:
    improvement = ((f1_bagging - f1_single) / f1_single) * 100
    print(f"\nF1-Score Improvement: {improvement:+.2f}%")
else:
    improvement = float('nan')
    print("\nF1-Score Improvement: n/a (single KNN F1-Score is 0)")

In [None]:
# Hyperparameter tuning for Bagging KNN
def tune_bagging_knn(X_train, y_train, X_test, y_test, param_combinations=None,
                     subsample_threshold=10000, subsample_size=5000, random_state=42):
    """Find the best of several parameter sets for Bagging KNN by F1 score.

    Every parameter set is fitted on the same training data (or the same
    random subsample of it) and scored on ``X_test``; one table row is
    printed per set.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Training features and labels.
    X_test, y_test : numpy.ndarray
        Data the F1 score is computed on.
    param_combinations : list of dict, optional
        Each dict needs ``n_estimators``, ``k`` and ``max_samples`` and may
        set ``weighted``.  Defaults to the built-in five-entry grid.
    subsample_threshold : int
        Training sets with more rows than this are subsampled for speed.
    subsample_size : int
        Number of rows drawn (without replacement) when subsampling.
    random_state : int or None
        Seed for the subsample draw.

    Returns
    -------
    tuple
        ``(best_params, best_score, results)``: the winning parameter dict,
        its F1 score, and a list with one result dict per parameter set.
        ``best_params`` is ``None`` only if ``param_combinations`` is empty.

    Notes
    -----
    NOTE(review): parameters are selected on the data passed as ``X_test``,
    so ``best_score`` is an optimistic estimate of performance on new data.
    """
    if param_combinations is None:
        param_combinations = [
            {'n_estimators': 10, 'k': 3, 'max_samples': 0.7},
            {'n_estimators': 20, 'k': 5, 'max_samples': 0.8},
            {'n_estimators': 30, 'k': 7, 'max_samples': 0.9},
            {'n_estimators': 20, 'k': 5, 'max_samples': 1.0},
            {'n_estimators': 15, 'k': 5, 'max_samples': 0.8, 'weighted': True},
        ]

    # Draw the optional speed-up subsample once, from a private seeded
    # generator, so every parameter set is compared on identical data and the
    # run is reproducible.
    if X_train.shape[0] > subsample_threshold:
        rng = np.random.RandomState(random_state)
        indices = rng.choice(X_train.shape[0], subsample_size, replace=False)
        X_tune = X_train[indices]
        y_tune = y_train[indices]
    else:
        X_tune = X_train
        y_tune = y_train

    best_score = 0
    best_params = None
    results = []

    print("Tuning Bagging KNN Hyperparameters...")
    print("n_estimators\tk\tmax_samples\tweighted\tF1-Score")
    print("-" * 60)

    for params in param_combinations:
        weighted = params.get('weighted', False)
        bagging_knn = KNNBaggingClassifier(
            n_estimators=params['n_estimators'],
            k=params['k'],
            max_samples=params['max_samples'],
            weighted=weighted,
            random_state=42
        )

        bagging_knn.fit(X_tune, y_tune)
        y_pred = bagging_knn.predict(X_test)
        f1 = f1_score(y_test, y_pred)

        results.append({
            'n_estimators': params['n_estimators'],
            'k': params['k'],
            'max_samples': params['max_samples'],
            'weighted': weighted,
            'f1_score': f1
        })

        print(f"{params['n_estimators']}\t\t{params['k']}\t{params['max_samples']}\t\t{weighted}\t\t{f1:.4f}")

        # Always record the first evaluated set so best_params is not None
        # when every F1 score is 0; afterwards only strict improvements win.
        if best_params is None or f1 > best_score:
            best_score = f1
            best_params = params

    return best_params, best_score, results

# Perform hyperparameter tuning
best_params, best_score, tuning_results = tune_bagging_knn(X_train, y_train, X_test, y_test)

# The emoji in these messages were stored mis-encoded ("üèÜ", "üéØ"); restored.
print(f"\n🏆 Best Parameters: {best_params}")
print(f"🎯 Best F1-Score: {best_score:.4f}")