In [None]:
# IPython shell escape: clone the Project_ML repository into the current working directory.
# NOTE(review): the CSV loaded later lives under /content/Project_ML, so this presumably
# runs from /content (Colab default) — confirm. Re-running after the clone exists makes git fail.
!git clone https://github.com/bankira-rahul-is-iitian/Project_ML.git

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Read the cleaned fraud dataset from the cloned repository into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/Project_ML/cleaned_fraud_dataset.csv")

# Plain-text preview of the first five rows.
print(df.head(n=5))

In [None]:
# Remove the "month" and "year" columns from the working frame.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: running it a second time, when the columns are already gone,
# no longer raises KeyError.
df = df.drop(columns=["month", "year"], errors="ignore")

In [None]:
# Print the frame summary (column names, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Show how many rows fall into each IS_FRAUD class before any rebalancing.
print(df["IS_FRAUD"].value_counts())

In [None]:
# Class counts of IS_FRAUD once more.
# NOTE(review): this prints the same thing as the preceding cell — likely a leftover duplicate.
print(df['IS_FRAUD'].value_counts())


In [None]:
# Partition the rows by label: IS_FRAUD == 0 -> `majority`, IS_FRAUD == 1 -> `minority`.
# The names assume class 0 is the larger one; the printed sizes let you confirm that.
majority = df.loc[df['IS_FRAUD'].eq(0)]
minority = df.loc[df['IS_FRAUD'].eq(1)]

print("Majority class:", majority.shape[0])
print("Minority class:", minority.shape[0])


In [None]:
# Randomly keep only as many majority rows as there are minority rows (seeded for repeatability).
majority_downsampled = majority.sample(n=len(minority), random_state=42)

# Stack both classes, then shuffle the rows and renumber the index from zero.
df_downsampled = (
    pd.concat([majority_downsampled, minority])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Sanity check: both classes should now have the same count.
print(df_downsampled['IS_FRAUD'].value_counts())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the per-class row counts in the balanced frame.
plt.figure(figsize=(5, 4))
sns.countplot(x='IS_FRAUD', data=df_downsampled, palette='Set2')
# The frame was balanced by downsampling the majority class (no rows were
# synthesised or duplicated), so the title must not say "oversampling".
plt.title("Class Distribution After Downsampling")
plt.xlabel("IS_FRAUD (0=No, 1=Yes)")
plt.ylabel("Count")
plt.show()


In [None]:
# Bare last expression: the notebook renders the balanced frame as the cell output.
df_downsampled

In [None]:
# `dk` is a second name for the same DataFrame object (no copy is made);
# the modelling cell below reads the data through this alias.
dk=df_downsampled

In [None]:
# Display the frame through its `dk` alias (same object as df_downsampled).
dk

In [None]:
class KNNClassifier:
    """k-nearest-neighbours classifier with selectable distance metric and voting rule.

    Parameters
    ----------
    distance_metric : str, default 'euclidean'
        Either 'euclidean' or 'manhattan'. Any other value raises ValueError
        the first time a distance is computed.
    weighted : bool, default False
        If True, each neighbour votes with weight ``1 / (distance + 1e-5)``;
        otherwise every neighbour has one vote.

    Notes
    -----
    Labels are cast to ``int`` for voting. The unweighted path counts votes
    with ``np.bincount``, so labels must be non-negative integers there, and
    its distance-based tie-break only applies to a tie between classes 0 and 1.
    """

    def __init__(self, distance_metric='euclidean', weighted=False):
        self.X_train = None
        self.y_train = None
        self.distance_metric = distance_metric
        self.weighted = weighted

    def fit(self, X_train, y_train):
        """Store the training data as NumPy arrays.

        Parameters
        ----------
        X_train : array-like, shape (n_samples, n_features) or (n_samples,)
            Training features. A 1-D input is treated as one feature per sample.
        y_train : array-like, shape (n_samples,)
            Training labels (list, ndarray or pandas Series).

        Raises
        ------
        ValueError
            If the training set is empty or X and y have different lengths.
        """
        # Converting here makes positional indexing in predict() reliable even
        # when the caller passes lists or pandas objects with a custom index.
        features = np.asarray(X_train, dtype=float)
        if features.ndim == 1:
            features = features.reshape(-1, 1)
        labels = np.asarray(y_train).ravel()

        if features.shape[0] == 0:
            raise ValueError("Cannot fit on an empty training set")
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                f"X_train has {features.shape[0]} samples but y_train has {labels.shape[0]}"
            )

        self.X_train = features
        self.y_train = labels

    def _euclidean_distance(self, x1, x2):
        """Euclidean distance along the last axis (broadcasts x1 against x2)."""
        diff = np.atleast_1d(np.asarray(x1) - np.asarray(x2))
        return np.sqrt(np.sum(diff ** 2, axis=-1))

    def _manhattan_distance(self, x1, x2):
        """Manhattan (L1) distance along the last axis (broadcasts x1 against x2)."""
        diff = np.atleast_1d(np.asarray(x1) - np.asarray(x2))
        return np.sum(np.abs(diff), axis=-1)

    def _compute_distance(self, x1, x2):
        """Dispatch to the configured metric.

        Two 1-D vectors give a scalar; a vector against a 2-D array gives one
        distance per row. Raises ValueError for an unknown metric name.
        """
        if self.distance_metric == 'euclidean':
            return self._euclidean_distance(x1, x2)
        elif self.distance_metric == 'manhattan':
            return self._manhattan_distance(x1, x2)
        else:
            raise ValueError("Unknown distance metric")

    def _check_ready(self, k):
        """Fail early with a clear message if the model is unfitted or k is invalid."""
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNNClassifier is not fitted yet; call fit() first")
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")

    def _nearest(self, x, k):
        """Return (indices, distances) of the k training rows closest to x."""
        # One vectorised call against the whole training matrix instead of a
        # Python-level loop over training rows.
        distances = self._compute_distance(x, self.X_train)
        order = np.argsort(distances)[:k]
        return order, distances[order]

    def _weighted_vote(self, neighbor_labels, neighbor_distances):
        """Pick the label with the largest sum of inverse-distance weights."""
        epsilon = 1e-5  # avoids division by zero for exact matches
        weights = 1 / (neighbor_distances + epsilon)
        votes = {}
        for w, lbl in zip(weights, neighbor_labels):
            lbl = int(lbl)
            votes[lbl] = votes.get(lbl, 0) + w
        return max(votes, key=votes.get)

    def _majority_vote(self, neighbor_labels, neighbor_distances):
        """Pick the most frequent label; break a 0-vs-1 tie by smaller mean distance."""
        counts = np.bincount(neighbor_labels.astype(int))
        is_binary_tie = (
            len(np.unique(neighbor_labels)) == 2 and counts[0] == counts[1]
        )
        if is_binary_tie:
            mean_dist_0 = np.mean(neighbor_distances[neighbor_labels == 0])
            mean_dist_1 = np.mean(neighbor_distances[neighbor_labels == 1])
            return 0 if mean_dist_0 < mean_dist_1 else 1
        # Otherwise argmax picks the most frequent label (lowest label on other ties).
        return np.argmax(counts)

    def predict(self, X_test, k=3):
        """Predict a class label for every row of X_test.

        Parameters
        ----------
        X_test : array-like, shape (n_queries, n_features)
            Query points.
        k : int, default 3
            Number of neighbours that vote. If k exceeds the number of training
            samples, all training samples vote.

        Returns
        -------
        numpy.ndarray
            One predicted label per query row.

        Raises
        ------
        RuntimeError
            If called before fit().
        ValueError
            If k is not a positive integer or the distance metric is unknown.
        """
        self._check_ready(k)
        queries = np.asarray(X_test, dtype=float)

        predictions = []
        for x in queries:
            neighbors_idx, neighbor_distances = self._nearest(x, k)
            neighbor_labels = self.y_train[neighbors_idx]
            if self.weighted:
                predicted = self._weighted_vote(neighbor_labels, neighbor_distances)
            else:
                predicted = self._majority_vote(neighbor_labels, neighbor_distances)
            predictions.append(predicted)
        return np.array(predictions)

    def get_neighbors(self, x, k=3):
        """Return the indices of, and distances to, the k nearest training rows of x.

        Raises RuntimeError if called before fit() and ValueError if k is not a
        positive integer or the distance metric is unknown.
        """
        self._check_ready(k)
        return self._nearest(np.asarray(x, dtype=float), k)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Separate the label column from the feature columns of the balanced frame.
target_col = 'IS_FRAUD'  # change if the label column is named differently
X = dk.drop(columns=[target_col])
y = dk[target_col].values

# Stratified 70/30 train/test split (seeded) so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize features (important for KNN, which is distance based).
# The scaler is fitted on the training part only and then applied to the test part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# KNNClassifier is the class defined earlier in this notebook; no import is needed.

# Instantiate and train
knn = KNNClassifier(distance_metric='euclidean', weighted=False)
knn.fit(X_train, y_train)

# Predict with the 5 nearest neighbours
y_pred = knn.predict(X_test, k=5)

# Evaluate against the held-out labels
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# The banner previously contained mis-decoded UTF-8 bytes; the intended emoji is restored.
print("📊 KNN Classifier (Custom Implementation) Results:")
print(f"Accuracy:  {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall:    {rec:.3f}")
print(f"F1 Score:  {f1:.3f}")


In [None]:
# Shape of the full label array taken from the balanced frame (before the train/test split).
y.shape