**Load the Dataset and Explore**

In [1]:
import pandas as pd
df=pd.read_csv("/kaggle/input/synthetic-mobile-money-transaction-dataset/MoMTSim_20240722202413_1000_dataset.csv")
df.head()


Unnamed: 0,step,transactionType,amount,initiator,oldBalInitiator,newBalInitiator,recipient,oldBalRecipient,newBalRecipient,isFraud
0,0,PAYMENT,579.16,4018807983755588,29.32,29.32,30-0000345,0.0,0.0,0
1,0,TRANSFER,39.03,4610593854752035,39.03,0.0,4944921135715488,55.94,94.98,1
2,0,TRANSFER,21.78,4840578447365454,21.78,0.0,4305579785620656,58.09,79.88,1
3,0,PAYMENT,570.22,4958066799829630,5.29,5.29,00-0000388,0.0,0.0,0
4,0,PAYMENT,574.8,4149196095918843,82.63,82.63,80-0005320,0.0,0.0,0


In [None]:
df.info() # to explore my data 
df.describe() # what is 5 summary statistics of data 

In [None]:
missing_percentage = (df.isnull().sum() / len(df)) * 100
#missing_percentage = missing_percentage[missing_percentage >0].sort_values(ascending=False)
print(missing_percentage)
# Here I get that there is no missing value in my data so no handling of the data 

**Analyze the Distribution of Features and Target Classes**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
df.hist(figsize=(12,18),bins=30) #only for numerical features 
plt.show()

sns.countplot(x='isFraud',data=df)
plt.show()

In [None]:

# Mean values:

# amount: 53,021.39

# oldBalInitiator: 22,460,554.33

# newBalInitiator: 22,465,267.76

# oldBalRecipient: 1,685,005.12

# newBalRecipient: 1,703,966.82

# Median values:

# amount: 596.69

# oldBalInitiator: 3,655.43

# newBalInitiator: 0.00

# oldBalRecipient: 200,761.40

# newBalRecipient: 204,247.91


<!-- Second Way to analyze -->

**Second Way to anlyze**

In [None]:
columns_to_calculate = ['amount', 'oldBalInitiator', 'newBalInitiator', 'oldBalRecipient', 'newBalRecipient']
class_mean = df.groupby('isFraud')[columns_to_calculate].mean()
print(class_mean)


In [None]:
class_median=df.groupby('isFraud')[columns_to_calculate].median()
print(class_median)

In [None]:
import numpy as np
# Boxplot for each numerical feature
for col in columns_to_calculate:
    plt.figure(figsize=(8, 6))
    log_data = np.log1p(df[col])
    sns.boxplot(log_data)
    plt.title(f"Boxplot for {col}")
    plt.show()


In [None]:
Q1 = df[columns_to_calculate].quantile(0.25)
Q3 = df[columns_to_calculate].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = (df[columns_to_calculate] < lower_bound) | (df[columns_to_calculate] > upper_bound)

# Count outliers per row (IQR)
outlier_counts_iqr = outliers.sum(axis=1)

# Select rows where at least 3 features are outliers (IQR)
outlier_rows_iqr = df[outlier_counts_iqr >= 3]

# Display results
print("Total IQR-based outlier rows (at least 3 features):", outlier_rows_iqr.shape[0])


In [None]:
# Boxplot of numerical features grouped by target class
for col in df[columns_to_calculate]:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x='isFraud', y=col, data=df)
    plt.title(f'Boxplot of {col} by Target Class')
    plt.show()


In [None]:
from scipy.stats import zscore
df_zscore = df[columns_to_calculate].apply(zscore)
outliers_z = (df_zscore > 3) | (df_zscore < -3)
outlier_counts_z = outliers_z.sum(axis=1)
# Select rows where at least 3 features are outliers (Z-score)
outlier_rows_z = df[outlier_counts_z >= 3]
# Display results
print("Total Z-score-based outlier rows (at least 3 features):", outlier_rows_z.shape[0])


In [None]:
common_outliers= outlier_rows_z.index.intersection(outlier_rows_iqr.index)
print(f"Common outliers in both: {len(common_outliers)}")

In [None]:
toatl_outlier_rows=outlier_rows_iqr.shape[0]
total_rows_data=df.shape[0]
outliers_percentage=(toatl_outlier_rows/total_rows_data) *100
print(outliers_percentage)

In [None]:
df_capped = df[columns_to_calculate].copy()
df_capped = df_capped.clip(lower=lower_bound, upper=upper_bound, axis=1)

print("Outliers have been capped at IQR bounds.")

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x=df["isFraud"], y=np.log1p(df["amount"]), data=df)  # Applying log transformation
plt.title("Log-Transformed Transaction Amount Distribution by Fraud Class")
plt.xlabel("Fraud (1) vs. Non-Fraud (0)")
plt.ylabel("Log(Transaction Amount)")
plt.show()


In [None]:
plt.figure(figsize=(9,6))
plt.subplots_adjust(left=0.15, right=0.85, top=0.85, bottom=0.15)
sns.countplot(data=df, x='transactionType', hue='isFraud', palette=['#766CDB', '#DA847C'])
plt.title('Count of Transaction Types by Fraud Status', pad=15, fontsize=20, weight='semibold', color='#222222')
plt.xlabel('Transaction Type', labelpad=10, fontsize=16, weight='medium', color='#333333')
plt.ylabel('Count', labelpad=10, fontsize=16, weight='medium', color='#333333')
plt.xticks(fontsize=14, color='#555555')
plt.yticks(fontsize=14, color='#555555')
plt.legend(title='isFraud', fontsize=12, title_fontsize=12, loc='upper right')
plt.gca().set_axisbelow(True)
plt.show()
print('Displayed Visualization 2: Count plot of transaction types by fraud status.')


In [None]:
# Visualization 3: Scatter plot of oldBalInitiator vs newBalInitiator colored by fraud status
plt.figure(figsize=(9,6))
plt.subplots_adjust(left=0.15, right=0.85, top=0.85, bottom=0.15)
# Adding a small jitter by using alpha and sizes to show density
sns.scatterplot(data=df, x='oldBalInitiator', y='newBalInitiator', hue='isFraud', palette=['#877877', '#52515E'], alpha=0.7)
plt.title('Initiator Balance Transition by Fraud Status', pad=15, fontsize=20, weight='semibold', color='#222222')
plt.xlabel('Old Balance (Initiator)', labelpad=10, fontsize=16, weight='medium', color='#333333')
plt.ylabel('New Balance (Initiator)', labelpad=10, fontsize=16, weight='medium', color='#333333')
plt.xticks(fontsize=14, color='#555555')
plt.yticks(fontsize=14, color='#555555')
plt.legend(title='isFraud', fontsize=12, title_fontsize=12, loc='upper right')
plt.gca().set_axisbelow(True)
plt.show()
print('Displayed Visualization 3: Scatter plot of oldBalInitiator vs newBalInitiator colored by fraud status.')

In [None]:
filtered_df = df.drop(columns=["initiator", "transactionType", "recipient"], errors="ignore")

# Compute correlation matrix
correlation_matrix = filtered_df.corr()

# Plot heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, cbar=True)

# Add title
plt.title("Correlation Matrix of Numerical Features", fontsize=14)
plt.show()

In [None]:
# Summary statistics for numerical features
print(df.describe())

# Summary of categorical features
print(df.describe(include=['object']))

In [None]:

df[columns_to_calculate] = df_capped.clip(lower=lower_bound, upper=upper_bound, axis=1)

print("Outliers have been capped at IQR bounds.")

df.head()

In [None]:
# Check for duplicates
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Remove duplicates
df = df.drop_duplicates()


In [6]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['transactionType'] = encoder.fit_transform(df['transactionType'])
df.head()

Unnamed: 0,step,transactionType,amount,initiator,oldBalInitiator,newBalInitiator,recipient,oldBalRecipient,newBalRecipient,isFraud,initiator_type,recipient_type
0,0,2,579.16,4018807983755588,29.32,29.32,30-0000345,0.0,0.0,0,1,2
1,0,3,39.03,4610593854752035,39.03,0.0,4944921135715488,55.94,94.98,1,1,1
2,0,3,21.78,4840578447365454,21.78,0.0,4305579785620656,58.09,79.88,1,1,1
3,0,2,570.22,4958066799829630,5.29,5.29,00-0000388,0.0,0.0,0,1,2
4,0,2,574.8,4149196095918843,82.63,82.63,80-0005320,0.0,0.0,0,1,2


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Function to categorize initiator and recipient
def categorize_id(value):
    value = str(value)  # Ensure it's treated as a string
    if value.isdigit():  # If it's a pure number (e.g., credit card-like)
        return 1
    else:  # If it contains special characters (e.g., bank ID format)
        return 2

# Apply categorization
df['initiator_type'] = df['initiator'].apply(categorize_id)
df['recipient_type'] = df['recipient'].apply(categorize_id)

# Drop original columns after encoding

# Automatically detect numerical features
numerical_features = df.select_dtypes(include=['number']).columns.tolist()

# Detect encoded categorical features (e.g., one-hot encoded transaction types)
encoded_features = ['transactionType']

# Define final feature set
features = numerical_features + encoded_features + ['initiator_type', 'recipient_type']

# Define target variable
target = 'isFraud'

# Define X (features) and y (target)
X = df[features]
y = df[target]

# Split the data into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Print dataset shape after split
print(f"Training set: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing set: X_test={X_test.shape}, y_test={y_test.shape}")

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Min-Max Scaling
minmax_scaler = MinMaxScaler()
X_train_minmax = minmax_scaler.fit_transform(X_train)
X_test_minmax = minmax_scaler.transform(X_test)

# Z-score Normalization (Standardization)
zscore_scaler = StandardScaler()
X_train_zscore = zscore_scaler.fit_transform(X_train)
X_test_zscore = zscore_scaler.transform(X_test)


In [None]:
!pip install cudf-cu11 cuml-cu11 --extra-index-url=https://pypi.nvidia.com


In [None]:
import cupy as cp
from cuml.neighbors import KNeighborsClassifier as cuKNN
from sklearn.metrics import accuracy_score
# 🔹 Convert data to GPU format (CuPy)
X_train_minmax_gpu = cp.asarray(X_train_minmax)
X_test_minmax_gpu = cp.asarray(X_test_minmax)
X_train_zscore_gpu = cp.asarray(X_train_zscore)
X_test_zscore_gpu = cp.asarray(X_test_zscore)
y_train_gpu = cp.asarray(y_train)
y_test_gpu = cp.asarray(y_test)

# 🔹 Hyperparameters (Single Values)
k = 5  # KNN
svm_kernel = 'rbf'  # SVM Kernel
C = 1  # SVM & Logistic Regression Regularization

# Store results
results = {}

#  **K-Nearest Neighbors (KNN)**
knn_minmax_gpu = KNeighborsClassifier(n_neighbors=k)
knn_minmax_gpu.fit(X_train_minmax_gpu, y_train_gpu)
y_pred_knn_minmax_gpu = cp.asnumpy(knn_minmax_gpu.predict(X_test_minmax_gpu))

knn_zscore_gpu = KNeighborsClassifier(n_neighbors=k)
knn_zscore_gpu.fit(X_train_zscore_gpu, y_train_gpu)
y_pred_knn_zscore_gpu = cp.asnumpy(knn_zscore_gpu.predict(X_test_zscore_gpu))

# Compute accuracy
results[("KNN", "MinMax")] = accuracy_score(y_test, y_pred_knn_minmax_gpu)
results[("KNN", "Zscore")] = accuracy_score(y_test, y_pred_knn_zscore_gpu)



In [None]:
import cuml
print(dir(cuml))


In [None]:
#  **Support Vector Machine (SVM)**
svm_minmax_gpu = SVC(kernel=svm_kernel, C=C)
svm_minmax_gpu.fit(X_train_minmax_gpu, y_train_gpu)
y_pred_svm_minmax_gpu = cp.asnumpy(svm_minmax_gpu.predict(X_test_minmax_gpu))

svm_zscore_gpu = SVC(kernel=svm_kernel, C=C)
svm_zscore_gpu.fit(X_train_zscore_gpu, y_train_gpu)
y_pred_svm_zscore_gpu = cp.asnumpy(svm_zscore_gpu.predict(X_test_zscore_gpu))

# Compute accuracy
results[("SVM", "MinMax")] = accuracy_score(y_test, y_pred_svm_minmax_gpu)
results[("SVM", "Zscore")] = accuracy_score(y_test, y_pred_svm_zscore_gpu)





In [None]:
from cuml.linear_model import LogisticRegression
lr_minmax_gpu = LogisticRegression(C=C)
lr_minmax_gpu.fit(X_train_minmax_gpu, y_train_gpu)
y_pred_lr_minmax_gpu = cp.asnumpy(lr_minmax_gpu.predict(X_test_minmax_gpu))

lr_zscore_gpu = LogisticRegression(C=C)
lr_zscore_gpu.fit(X_train_zscore_gpu, y_train_gpu)
y_pred_lr_zscore_gpu = cp.asnumpy(lr_zscore_gpu.predict(X_test_zscore_gpu))


results[("LogReg", "MinMax")] = accuracy_score(y_test, y_pred_lr_minmax_gpu)
results[("LogReg", "Zscore")] = accuracy_score(y_test, y_pred_lr_zscore_gpu)


In [None]:
print("\n🔹 Accuracy Summary:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

In [None]:
import seaborn as sns
corr_matrix = X.corr()

# Plot heatmap of correlations
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()

**For stratified splitting**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
models = {
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'SVM': SVC(kernel='linear', C=1),
    'LogReg': LogisticRegression(C=1, max_iter=500)
}


accuracy_results = {}
for name, model in models.items():
    # Train & test on MinMax scaled data
    model.fit(X_train_minmax, y_train)
    y_pred_minmax = model.predict(X_test_minmax)
    accuracy_results[(name, 'MinMax')] = accuracy_score(y_test, y_pred_minmax)

    # Train & test on Z-score normalized data
    model.fit(X_train_zscore, y_train)
    y_pred_zscore = model.predict(X_test_zscore)
    accuracy_results[(name, 'Zscore')] = accuracy_score(y_test, y_pred_zscore)


for key, acc in accuracy_results.items():
    print(f"Model: {key[0]} | Scaling: {key[1]} | Accuracy: {acc:.4f}")


**Adding Some Noise**

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Function to categorize IDs
def categorize_id(value):
    return 1 if str(value).isdigit() else 2

# Apply categorization
df['initiator_type'] = df['initiator'].apply(categorize_id)
df['recipient_type'] = df['recipient'].apply(categorize_id)

# Detect features correctly
numerical_features = df.select_dtypes(include=['number']).columns.tolist()
encoded_features = ['transactionType']

# Remove target from numerical features if present
if 'isFraud' in numerical_features:
    numerical_features.remove('isFraud')

# Define final features (no duplicates)
features = numerical_features + encoded_features  # initiator/recipient_type already in numerical_features

# Define target
target = 'isFraud'

# Split data
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

def add_noise_to_data(X_train, X_test, y_train, numerical_features):
    X_train = X_train.copy()
    X_test = X_test.copy()
    y_train = y_train.copy()
    
    np.random.seed(42)
    noise_level = 0.05
    
    for feature in numerical_features:
        # Compute standard deviation safely
        std = X_train[feature].std()
        
        if np.isscalar(std) and std > 0:  # Ensure std is a scalar
            # Add noise to train
            noise = np.random.normal(0, std * noise_level, X_train[feature].shape[0])
            X_train[feature] += noise
            
            # Add noise to test
            noise = np.random.normal(0, std * noise_level, X_test[feature].shape[0])
            X_test[feature] += noise

    # Add label noise (flip labels randomly)
    flip_mask = np.random.rand(len(y_train)) < 0.01
    y_train.iloc[flip_mask] = 1 - y_train.iloc[flip_mask]  # Ensure correct indexing
    
    return X_train, X_test, y_train

# Apply noise
X_train_noisy, X_test_noisy, y_train_noisy = add_noise_to_data(
    X_train, X_test, y_train, numerical_features
)

print(f"Training set: {X_train_noisy.shape}, Test set: {X_test_noisy.shape}")


Training set: (3380766, 11), Test set: (845192, 11)


In [11]:
import time
import cudf  # GPU DataFrame (similar to pandas)
import cupy as cp  # GPU Arrays (similar to NumPy)
from cuml.neighbors import KNeighborsClassifier  # GPU-accelerated KNN
from cuml.metrics import accuracy_score  # GPU accuracy computation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_noisy)
X_test_scaled = scaler.transform(X_test_noisy)

# Convert to cudf DataFrame for GPU processing
X_train_gpu = cudf.DataFrame(X_train_scaled)
X_test_gpu = cudf.DataFrame(X_test_scaled)
y_train_gpu = cudf.Series(y_train_noisy.values)
y_test_gpu = cudf.Series(y_test.values)

# Define different k values
k_values = [1, 3, 5,]

# Distance metrics to test
distance_metrics = ['euclidean', 'manhattan']

# Store results
results = []

print("\nRunning GPU-Accelerated KNN on noisy dataset...\n")

for metric in distance_metrics:
    for k in k_values:
        start_time = time.time()
        
        # Train GPU-accelerated KNN
        knn_gpu = KNeighborsClassifier(n_neighbors=k, metric=metric)
        knn_gpu.fit(X_train_gpu, y_train_gpu)
        
        # Predict using GPU
        y_pred_gpu = knn_gpu.predict(X_test_gpu)
        
        # Compute accuracy on GPU
        accuracy = accuracy_score(y_test_gpu, y_pred_gpu)
        
        # Time taken
        time_taken = time.time() - start_time
        
        # Store results
        results.append((metric, k, float(accuracy), time_taken))
        
        print(f"Metric: {metric}, k={k}, Accuracy: {accuracy:.4f}, Time: {time_taken:.2f}s")

# Convert results to cudf DataFrame
results_df = cudf.DataFrame(results, columns=['Distance Metric', 'k', 'Accuracy', 'Time Taken (s)'])

# Convert to pandas for display
print("\nFinal Results (GPU KNN):\n", results_df.to_pandas())



Running GPU-Accelerated KNN on noisy dataset...

Metric: euclidean, k=1, Accuracy: 0.9423, Time: 83.32s
Metric: euclidean, k=3, Accuracy: 0.9577, Time: 83.91s
Metric: euclidean, k=5, Accuracy: 0.9607, Time: 84.07s
Metric: manhattan, k=1, Accuracy: 0.9423, Time: 198.62s
Metric: manhattan, k=3, Accuracy: 0.9576, Time: 211.72s
Metric: manhattan, k=5, Accuracy: 0.9607, Time: 212.06s

Final Results (GPU KNN):
   Distance Metric  k  Accuracy  Time Taken (s)
0       euclidean  1  0.942263       83.321291
1       euclidean  3  0.957724       83.908886
2       euclidean  5  0.960685       84.070700
3       manhattan  1  0.942290      198.624898
4       manhattan  3  0.957634      211.717520
5       manhattan  5  0.960711      212.063099


In [8]:
df.head()

Unnamed: 0,step,transactionType,amount,initiator,oldBalInitiator,newBalInitiator,recipient,oldBalRecipient,newBalRecipient,isFraud,initiator_type,recipient_type
0,0,2,579.16,4018807983755588,29.32,29.32,30-0000345,0.0,0.0,0,1,2
1,0,3,39.03,4610593854752035,39.03,0.0,4944921135715488,55.94,94.98,1,1,1
2,0,3,21.78,4840578447365454,21.78,0.0,4305579785620656,58.09,79.88,1,1,1
3,0,2,570.22,4958066799829630,5.29,5.29,00-0000388,0.0,0.0,0,1,2
4,0,2,574.8,4149196095918843,82.63,82.63,80-0005320,0.0,0.0,0,1,2


In [None]:
from cuml.svm import SVC
# Define SVM parameters
svm_kernels = ['linear', 'rbf', 'poly']
c_values = [0.1, 1, 10]  # Regularization parameter tuning
svm_results = []

print("\nRunning GPU-Accelerated SVM on dataset...\n")

for kernel in svm_kernels:
    for C in c_values:
        start_time = time.time()

        # Train GPU-accelerated SVM
        svm_gpu = SVC(kernel=kernel, C=C, probability=False)
        svm_gpu.fit(X_train_gpu, y_train_gpu)

        # Predict using GPU
        y_pred_gpu = svm_gpu.predict(X_test_gpu)

        # Compute accuracy on GPU
        accuracy = accuracy_score(y_test_gpu, y_pred_gpu)

        # Time taken
        time_taken = time.time() - start_time

        # Store results
        svm_results.append((kernel, C, float(accuracy), time_taken))

        print(f"SVM - Kernel: {kernel}, C={C}, Accuracy: {accuracy:.4f}, Time: {time_taken:.2f}s")

# Convert SVM results to cudf DataFrame
svm_results_df = cudf.DataFrame(svm_results, columns=['Kernel', 'C', 'Accuracy', 'Time Taken (s)'])

# Convert to pandas for display
print("\nFinal Results (GPU SVM):\n", svm_results_df.to_pandas())



Running GPU-Accelerated SVM on dataset...



In [None]:
from cuml.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Define hyperparameters
max_depth_values = [3, 5, 10]
min_samples_split_values = [2, 5, 10]
dt_results = []

print("\nRunning GPU-Accelerated Decision Tree on dataset...\n")

for max_depth in max_depth_values:
    for min_samples_split in min_samples_split_values:
        start_time = time.time()

        # Train GPU-accelerated Decision Tree
        dt_gpu = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split)
        dt_gpu.fit(X_train_gpu, y_train_gpu)

        # Predict using GPU
        y_pred_gpu = dt_gpu.predict(X_test_gpu)

        # Compute accuracy on GPU
        accuracy = accuracy_score(y_test_gpu, y_pred_gpu)

        # Time taken
        time_taken = time.time() - start_time

        # Store results
        dt_results.append((max_depth, min_samples_split, float(accuracy), time_taken))

        print(f"DT - Max Depth: {max_depth}, Min Samples Split: {min_samples_split}, Accuracy: {accuracy:.4f}, Time: {time_taken:.2f}s")

# Convert Decision Tree results to cudf DataFrame
dt_results_df = cudf.DataFrame(dt_results, columns=['Max Depth', 'Min Samples Split', 'Accuracy', 'Time Taken (s)'])

# Convert to pandas for display
print("\nFinal Results (GPU Decision Tree):\n", dt_results_df.to_pandas())

# Visualizing one decision tree (first configuration)
dt_sklearn = DecisionTreeClassifier(max_depth=5, min_samples_split=2)  # CPU-based for visualization
dt_sklearn.fit(X_train_noisy, y_train_noisy)  # Using original NumPy/Pandas data

# Plot tree
plt.figure(figsize=(12, 6))
plot_tree(dt_sklearn, feature_names=X_train.columns, class_names=["Class 0", "Class 1"], filled=True)
plt.title("Decision Tree Visualization")
plt.show()


In [None]:
from cuml.linear_model import LogisticRegression
import pandas as pd

# Define regularization techniques
regularization_types = ['l1', 'l2']
C_values = [0.01, 0.1, 1, 10]  # Regularization strength

logistic_results = []

print("\nRunning GPU-Accelerated Logistic Regression on dataset...\n")

for reg in regularization_types:
    for C in C_values:
        start_time = time.time()

        # Train GPU-accelerated Logistic Regression
        log_reg_gpu = LogisticRegression(penalty=reg, C=C, solver='qn')
        log_reg_gpu.fit(X_train_gpu, y_train_gpu)

        # Predict using GPU
        y_pred_gpu = log_reg_gpu.predict(X_test_gpu)

        # Compute accuracy on GPU
        accuracy = accuracy_score(y_test_gpu, y_pred_gpu)

        # Time taken
        time_taken = time.time() - start_time

        # Store results
        logistic_results.append((reg, C, float(accuracy), time_taken))

        print(f"Logistic Regression - Penalty: {reg}, C: {C}, Accuracy: {accuracy:.4f}, Time: {time_taken:.2f}s")

# Convert results to cudf DataFrame
logistic_results_df = cudf.DataFrame(logistic_results, columns=['Penalty', 'C', 'Accuracy', 'Time Taken (s)'])

# Convert to pandas for display
print("\nFinal Results (GPU Logistic Regression):\n", logistic_results_df.to_pandas())

# Compare against previous classifiers
comparison_df = pd.DataFrame({
    'Model': ['Logistic Regression (Best)', 'Decision Tree (Best)', 'KNN (Best)'],
    'Accuracy': [
        logistic_results_df['Accuracy'].max(),
        dt_results_df['Accuracy'].max(),
        results_df['Accuracy'].max()
    ]
})

print("\nPerformance Comparison:\n", comparison_df)


In [None]:
from cuml.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Models to evaluate
models = {
    "Logistic Regression": log_reg_gpu,
    "Decision Tree": dt_gpu,
    "KNN": knn_gpu
}

evaluation_results = []

print("\nModel Evaluation and Comparison...\n")

for model_name, model in models.items():
    start_train_time = time.time()
    model.fit(X_train_gpu, y_train_gpu)
    train_time = time.time() - start_train_time

    start_pred_time = time.time()
    y_pred_gpu = model.predict(X_test_gpu)
    pred_time = time.time() - start_pred_time

    # Compute metrics
    accuracy = accuracy_score(y_test_gpu, y_pred_gpu)
    precision = precision_score(y_test_gpu, y_pred_gpu, average='weighted')
    recall = recall_score(y_test_gpu, y_pred_gpu, average='weighted')
    f1 = f1_score(y_test_gpu, y_pred_gpu, average='weighted')

    # Confusion Matrix
    cm = confusion_matrix(y_test_gpu.to_pandas(), y_pred_gpu.to_pandas())

    # Store results
    evaluation_results.append((model_name, accuracy, precision, recall, f1, train_time, pred_time))

    print(f"\nModel: {model_name}")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    print(f"Training Time: {train_time:.2f}s, Prediction Time: {pred_time:.2f}s")

    # Print classification report
    print("\nClassification Report:\n", classification_report(y_test_gpu.to_pandas(), y_pred_gpu.to_pandas()))

    # Plot confusion matrix
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix - {model_name}")
    plt.show()

# Convert results to DataFrame for comparison
eval_results_df = pd.DataFrame(evaluation_results, columns=[
    "Model", "Accuracy", "Precision", "Recall", "F1-score", "Training Time (s)", "Prediction Time (s)"
])

print("\nFinal Model Evaluation Results:\n", eval_results_df)
