#### Import Libraries and Modules

In [None]:
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

#### Load and Preview Data

In [None]:
# Read the transactions CSV, using its 'id' column as the row index
csv_path = 'creditcard_2023.csv'
df = pd.read_csv(csv_path, index_col='id')

# Report (rows, columns) so the load can be sanity-checked at a glance
print(df.shape)

# Preview the first five rows (rendered as the cell's output)
df.head()

#### Initialize and Apply Standard Scaler to Selected Columns

In [None]:
# Names of the columns to standardize: 'V1' ... 'V28' plus 'Amount'
columns_to_scale = [f'V{n}' for n in range(1, 29)] + ['Amount']

# Fit the scaler on those columns and write the standardized values back
# into the same columns of df.
# NOTE(review): the scaler is fitted on the whole DataFrame, before any
# train/test split performed further down in this notebook.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[columns_to_scale])
df[columns_to_scale] = scaled_values

# Preview the first rows to confirm the columns now hold scaled values
df.head()

#### Display Class Distribution for Original and Subsampled Data

In [None]:
# Class balance of the full DataFrame
full_class_counts = df['Class'].value_counts()
print(full_class_counts)

# Keep the positional row window [69000, 500000) of df, with all columns
df1 = df.iloc[69000:500000]

# Class balance inside that window
window_class_counts = df1['Class'].value_counts()
print(window_class_counts)

#### Create Sub-DataFrame and Reset Index

In [None]:
# Take positional rows [212100, 220000) of 'df1' (all columns) and renumber
# them from 0, discarding the previous index
df2 = df1.iloc[212100:220000].reset_index(drop=True)


#### Shuffle DataFrame and Display Basic Properties

In [None]:
# Shuffle all rows and renumber them from 0. The fixed seed (the same value
# the train/test splits in this notebook use) keeps the row order, and with it
# every downstream split and score, reproducible across runs.
df2 = df2.sample(frac=1, random_state=7).reset_index(drop=True)

# Print the (rows, columns) shape. It is printed explicitly because a notebook
# only renders the value of a cell's last expression.
print(df2.shape)

# Preview up to the first 500 rows of the shuffled DataFrame (cell output)
df2.head(500)


#### K-NN Model Training and Hyperparameter Tuning with Multiple Metrics

In [None]:
# Select features and target from 'df2' by position. Only the first 7000 rows
# are used here; the remaining rows are evaluated separately later on.
X = df2.iloc[:7000, 0:29]  # Features (first 29 columns)
y = df2.iloc[:7000, 29]    # Target variable (30th column)

# Hyperparameter values to try.
# NOTE(review): several of these names (jaccard, dice, kulsinski,
# rogerstanimoto, russellrao, sokalmichener, sokalsneath, yule) are documented
# by SciPy as boolean-vector distances, so they are questionable for
# continuous features; 'minkowski' with the classifier's default p=2 is the
# same distance as 'euclidean'. Some names may also be rejected depending on
# the installed SciPy / scikit-learn versions — confirm against your
# environment.
metrics_list = ['euclidean', 'manhattan', 'minkowski', 'chebyshev', 'hamming', 'canberra', 'braycurtis', 'jaccard', 'dice', 'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', 'yule', 'cosine', 'correlation']
weights_list = ['uniform', 'distance']
ts_range = [.1,.2,.3]

# Parallel result lists: accuracyScoreResults[k] is the test accuracy obtained
# with params[k] == [n_neighbors, metric, weights, test_size]
accuracyScoreResults = []
params = []

for num in range(2,3):  # Number of neighbors (this range yields only 2)
    for ts in ts_range:  # Test size
        # With a fixed seed the split depends only on the test size, so it is
        # computed once per test size rather than once per metric/weight pair
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ts, random_state=7)

        for metric_name in metrics_list:  # Distance metric
            for weight_name in weights_list:  # Weight type
                knn = KNeighborsClassifier(n_neighbors=num, metric=metric_name, weights=weight_name)

                # A metric name the installed libraries do not accept raises
                # ValueError; report it and carry on so one unsupported name
                # does not abort the whole search
                try:
                    knn.fit(X_train, y_train)
                    y_pred = knn.predict(X_test)
                except ValueError as err:
                    print(f"Skipping KNN = {num}, Metric = {metric_name}, Weight = {weight_name}, TS = {ts}: {err}")
                    continue

                # Score once and reuse the value for both storage and reporting
                score = accuracy_score(y_test, y_pred)
                accuracyScoreResults.append(score)
                params.append([num, metric_name, weight_name, ts])

                # Only perfect scores are echoed while the search runs
                if score == 1:
                    print(f"Accuracy of KNN = {num}, Metric = {metric_name}, Weight = {weight_name}, TS = {ts}:", score)


#### Find and Display Best Model Parameters and Maximum Accuracy

In [None]:
# Position of the highest accuracy; ties resolve to the earliest entry
max_index = max(range(len(accuracyScoreResults)), key=accuracyScoreResults.__getitem__)

# The highest accuracy itself
max_value = accuracyScoreResults[max_index]

# Parameter combination recorded at that position; its first three entries are
# the number of neighbors, the distance metric and the weight type
best_combo = params[max_index]
neighbors, metric, weights = best_combo[:3]

# Show the winning combination followed by the accuracy it achieved
print(best_combo)
print(max_value)


#### Train and Evaluate k-NN Model

In [None]:
# Rebuild the split that produced the best score. The test size stored with
# the winning parameter combination (4th entry of params[max_index]) is reused
# instead of a hard-coded 0.1, with the same seed as the search, so the
# accuracy printed below corresponds to the reported maximum.
# X and y are the feature/target selections made before the search.
best_test_size = params[max_index][3]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=best_test_size, random_state=7)

# Initialize k-NN classifier with the best hyperparameters found
knn = KNeighborsClassifier(n_neighbors=neighbors, metric=metric, weights=weights)

# Train the model
knn.fit(X_train, y_train)

# Make predictions on the held-out part of the split
y_pred = knn.predict(X_test)

# Evaluate the model
print(accuracy_score(y_test, y_pred))

#### Evaluate and Display Model Metrics


In [None]:
# Score the predictions (y_pred) against the true labels (y_test)

# Averaging scheme shared by precision, recall and F1
averaging = 'weighted'  # Change 'weighted' as per your needs

# Accuracy: share of predictions that match the true label
accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1, each averaged over the classes with the scheme above
precision = precision_score(y_test, y_pred, average=averaging)
recall = recall_score(y_test, y_pred, average=averaging)
f1 = f1_score(y_test, y_pred, average=averaging)

# Confusion matrix: counts of samples per (true label, predicted label) pair
conf_matrix = confusion_matrix(y_test, y_pred)

# Assemble the report and print it in one go
report_lines = [
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    f"Confusion Matrix: \n{conf_matrix}",
]
print("\n".join(report_lines))


#### Evaluate Model on New Test Set


In [None]:
# Rows of 'df2' from position 7000 onward (the rows not used to build X and y)
holdout = df2.iloc[7000:]

# First 29 columns are the features, the 30th column is the target
new_X_test = holdout.iloc[:, :29]
new_y_test = holdout.iloc[:, 29]

# Predict with the already-trained k-NN model
new_y_pred = knn.predict(new_X_test)

# Accuracy on these rows
print(accuracy_score(new_y_test, new_y_pred))


#### Evaluate and Display Model Metrics on New Test Set


In [None]:
# Score the predictions for the new test set (new_y_pred) against its true
# labels (new_y_test)

# Averaging scheme shared by precision, recall and F1.
# Change 'weighted' to other options like 'micro', 'macro', etc., as per your needs
averaging = 'weighted'

# Accuracy: share of predictions that match the true label
accuracy = accuracy_score(new_y_test, new_y_pred)

# Precision, recall and F1, each averaged over the classes with the scheme above
precision = precision_score(new_y_test, new_y_pred, average=averaging)
recall = recall_score(new_y_test, new_y_pred, average=averaging)
f1 = f1_score(new_y_test, new_y_pred, average=averaging)

# Confusion matrix: counts of samples per (true label, predicted label) pair
conf_matrix = confusion_matrix(new_y_test, new_y_pred)

# Assemble the report and print it in one go
report_lines = [
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    f"Confusion Matrix: \n{conf_matrix}",
]
print("\n".join(report_lines))


#### Random Sampling and Model Evaluation

In [None]:
# Number of random rows to score
rng = 10000

# Positional row ranges (inclusive) to sample from; each draw first picks one
# of the two ranges with equal probability, then a row position inside it.
# NOTE(review): the ranges are assumed to contain class 0 and class 1 rows
# respectively — this depends on the row order of the CSV; confirm.
sampling_ranges = [
    (1, 68000),        # Class 0 range
    (500001, 560000),  # Class 1 range
]

# Draw all row positions up front so the model can score them in one batch
sample_positions = []
for _ in range(rng):
    low, high = random.choice(sampling_ranges)
    sample_positions.append(random.randint(low, high))

# Fetch the sampled rows (first 30 columns) and separate features from labels
sampled_rows = df.iloc[sample_positions, :30]
sample_X = sampled_rows.drop(columns='Class')
sample_y = sampled_rows['Class']

# One batched predict call on a DataFrame: it keeps the column names the model
# was fitted with (bare arrays make scikit-learn warn on every call) and
# replaces thousands of single-row predictions
sample_predictions = knn.predict(sample_X)

# Count the rows whose predicted label equals the true label
countCorrect = int((sample_predictions == sample_y.to_numpy()).sum())

# Calculate and print the percentage of correct predictions
print(f"{countCorrect / rng * 100}%")


#### Perform k-Fold Cross-Validation with k-NN Classifier

In [None]:
# Classifier to cross-validate, with fixed hyperparameters
knn = KNeighborsClassifier(n_neighbors=2, metric='braycurtis', weights='distance')

# Shuffled k-fold splitter with a fixed seed
n_folds = 500
kf = KFold(n_splits=n_folds, shuffle=True, random_state=7)

# One accuracy value is collected per fold
accuracy_list = []

# Each iteration yields the positional indices of that fold's training and
# test rows
for train_index, test_index in kf.split(X):
    # Training part of the fold
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    # Held-out part of the fold
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit on the training part, then predict the held-out part
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Record and report this fold's accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_list.append(accuracy)
    print(f"Fold accuracy: {accuracy}")

# Average the per-fold accuracies and report the result
mean_accuracy = np.mean(accuracy_list)
print(f"Mean accuracy over {n_folds} folds: {mean_accuracy}")


#### Define and Test Edge Cases with Predictions

In [None]:
# Hand-written sample with small-magnitude feature values, intended as a case
# near the decision boundary.
# NOTE(review): values are presumably meant to be on the standardized scale of
# the training features — confirm.
edge_case_1 = pd.Series({
    'V1': 0.1, 'V2': -0.1, 'V3': 0.2, 'V4': -0.2, 'V5': 0.1, 'V6': -0.1, 'V7': 0.2, 'V8': -0.2, 
    'V9': 0.1, 'V10': -0.1, 'V11': 0.2, 'V12': -0.2, 'V13': 0.1, 'V14': -0.1, 'V15': 0.2, 'V16': -0.2, 
    'V17': 0.1, 'V18': -0.1, 'V19': 0.2, 'V20': -0.2, 'V21': 0.1, 'V22': -0.1, 'V23': 0.2, 'V24': -0.2, 
    'V25': 0.1, 'V26': -0.1, 'V27': 0.2, 'V28': -0.2, 'Amount': 0.05, 'Class': 1  # Actual class label
})

# Hand-written sample with large-magnitude feature values, intended as an outlier
edge_case_2 = pd.Series({
    'V1': 3, 'V2': -3, 'V3': 3, 'V4': -3, 'V5': 3, 'V6': -3, 'V7': 3, 'V8': -3, 
    'V9': 3, 'V10': -3, 'V11': 3, 'V12': -3, 'V13': 3, 'V14': -3, 'V15': 3, 'V16': -3, 
    'V17': 3, 'V18': -3, 'V19': 3, 'V20': -3, 'V21': 3, 'V22': -3, 'V23': 3, 'V24': -3, 
    'V25': 3, 'V26': -3, 'V27': 3, 'V28': -3, 'Amount': 3, 'Class': 0  # Actual class label
})

# Turn each sample into a one-row DataFrame of features. Keeping the column
# names (instead of passing a bare array) matches how the model was fitted and
# avoids scikit-learn's feature-name warning at prediction time.
edge_case_reshaped_1 = edge_case_1.drop('Class').to_frame().T
edge_case_reshaped_2 = edge_case_2.drop('Class').to_frame().T

# Use the trained k-NN model to make predictions for the edge cases
edge_case_prediction_1 = knn.predict(edge_case_reshaped_1)
edge_case_prediction_2 = knn.predict(edge_case_reshaped_2)

# Print the predicted class labels for the edge cases
print(f"The predicted class label for edge_case_1 is: {edge_case_prediction_1[0]}")
print(f"The predicted class label for edge_case_2 is: {edge_case_prediction_2[0]}")
