<a href="https://colab.research.google.com/github/arham5siddiqui/Mitigating-Linkability-Attacks-through-Differential-Privacy-enabled-Neural-Network-Training/blob/main/Step4_DP_Classification_Evaluation_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import time

Training the classifiers with differential privacy (DP) applied, and building the result tables

In [None]:

# Function to apply differential privacy
def add_noise(data, epsilon=1.0, sensitivity=1.0):
    """Return ``data`` perturbed with element-wise Laplace noise.

    Noise is drawn independently for every element from a zero-centred
    Laplace distribution with scale ``sensitivity / epsilon``, so a smaller
    ``epsilon`` produces noisier output.

    Args:
        data: Object exposing ``.shape`` and supporting ``+`` with a NumPy
            array of that shape (e.g. ``numpy.ndarray``, ``pandas.DataFrame``).
        epsilon: Privacy budget; must be strictly positive.
        sensitivity: Numerator of the noise scale. The default of 1.0
            reproduces the previously hard-coded ``1 / epsilon`` scale.

    Returns:
        ``data + noise``; the input object itself is not modified.

    Raises:
        ValueError: If ``epsilon`` is not greater than zero (NaN included).
    """
    # `not epsilon > 0` (rather than `epsilon <= 0`) also rejects NaN, which
    # would otherwise propagate into the scale and produce NaN noise.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be > 0, got {epsilon!r}")

    scale = sensitivity / epsilon
    noise = np.random.laplace(0, scale, data.shape)
    return data + noise

# One row per model: (display name, estimator instance, epsilon used for its noise).
# The two lookup dicts below are derived from this table so they cannot drift apart.
_model_specs = [
    ('SVC', SVC(), 0.5),
    ('Naive Bayes', GaussianNB(), 0.8),
    ('Decision Tree', DecisionTreeClassifier(), 0.7),
    ('Random Forest', RandomForestClassifier(), 0.6),
    ('KNN', KNeighborsClassifier(), 0.9),
]

# Estimators keyed by display name (insertion order = order in the table above)
classifiers = {model_name: model for model_name, model, _ in _model_specs}

# Epsilon values for differential privacy for each classifier
epsilons = {model_name: model_eps for model_name, _, model_eps in _model_specs}

# Accumulates one dict per measurement; turned into a DataFrame later on
results = []

# Number of CSV rows read per chunk (large dataset, limited RAM)
chunk_size = 10000

# Train / test CSV locations on the mounted Google Drive
train_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Reduced_Combined_Train.csv'
test_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Reduced_Combined_Test.csv'

# Loop to read training data in chunks and train classifiers.
#
# NOTE(review): scikit-learn's fit() retrains the estimator from scratch on
# every call, so after this loop each model reflects only the LAST training
# chunk, not the whole file. Training on all rows would need estimators with
# partial_fit (or fitting once on the concatenated data) -- left unchanged
# here because that alters the experiment's memory footprint and results.
last_train_data = {}  # name -> (noisy features, labels) the model was last fitted on
for chunk in pd.read_csv(train_path, chunksize=chunk_size):
    X_train = chunk.drop(['userID', 'videoID'], axis=1)  # drop labels
    y_train_user = chunk['userID']

    # Train classifiers on user identification
    for name, clf in classifiers.items():
        epsilon = epsilons[name]  # Fetch epsilon value for the classifier
        X_train_noisy = add_noise(X_train, epsilon)  # Apply differential privacy with the fetched epsilon

        start_time = time.time()
        clf.fit(X_train_noisy, y_train_user)
        fit_time = time.time() - start_time  # in seconds
        results.append({'Classifier': name, 'Fit_time': fit_time, 'Epsilon': epsilon})

        # Remember exactly what this classifier was fitted on, so its train
        # accuracy is measured on its own noisy data rather than on whatever
        # the last classifier in the loop happened to receive.
        last_train_data[name] = (X_train_noisy, y_train_user)

# Train accuracy does not depend on the test chunk, so compute it once per
# classifier here instead of once per (test chunk, classifier) pair.
train_accuracies = {
    name: classifiers[name].score(X_fitted, y_fitted)
    for name, (X_fitted, y_fitted) in last_train_data.items()
}

# Loop to read test data in chunks and evaluate classifiers
for chunk in pd.read_csv(test_path, chunksize=chunk_size):
    X_test = chunk.drop(['userID', 'videoID'], axis=1)  # drop labels
    y_test_user = chunk['userID']

    # Evaluate classifiers on user identification
    for name, clf in classifiers.items():
        epsilon = epsilons[name]  # Fetch epsilon value for the classifier
        X_test_noisy = add_noise(X_test, epsilon)  # Apply differential privacy with the fetched epsilon

        y_pred = clf.predict(X_test_noisy)
        test_accuracy = accuracy_score(y_test_user, y_pred)
        train_accuracy = train_accuracies[name]  # accuracy on the last training chunk this model was fitted on
        results.append({'Classifier': name, 'Test Accuracy': test_accuracy, 'Train Accuracy': train_accuracy, 'Epsilon': epsilon})

# Convert results to DataFrame and save as CSV.
# Rows come in two shapes (fit-time rows and accuracy rows), so the columns
# that a row does not carry are written as empty cells.
# NOTE(review): this folder is also where the plotting code reads its tables
# from, and those are expected to have different columns -- confirm these
# writes are not overwriting hand-prepared reference tables.
results_df = pd.DataFrame(results)
results_df.to_csv('/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Results/main_table.csv', index=False)

# Generate separate tables for each classifier and save them
for classifier in classifiers:
    classifier_df = results_df[results_df['Classifier'] == classifier]
    classifier_df.to_csv(f'/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Results/{classifier}_updated_table.csv', index=False)


Plotting graphs from the result tables

In [None]:

# Per-classifier bar chart built from a results table on disk
def plot_classifier_graph_from_table(title, table_path):
    """Draw, save and show a before/after-DP test-accuracy bar chart.

    Reads the CSV at ``table_path`` and uses its ``Dataset``,
    ``Test Accuracy (Before DP)`` and ``Test Accuracy (After DP)`` columns.
    Both series are drawn at the same x positions; the "after" bars are
    semi-transparent so the "before" bars remain visible underneath.

    Args:
        title: Classifier name; used in the plot title and as the PNG file name.
        table_path: Path of the CSV table to plot.
    """
    table = pd.read_csv(table_path)

    # Pull every needed column up front so a missing one fails before drawing
    x_labels = table['Dataset'].tolist()
    before_dp = table['Test Accuracy (Before DP)'].tolist()
    after_dp = table['Test Accuracy (After DP)'].tolist()

    plt.figure(figsize=(12, 8))

    # (bar heights, styling) in drawing order: opaque "before", translucent "after"
    bar_series = (
        (before_dp, {'color': '#1f77b4', 'label': 'Before DP'}),
        (after_dp, {'color': '#ff7f0e', 'label': 'After DP', 'alpha': 0.6}),
    )
    for heights, bar_style in bar_series:
        plt.bar(x_labels, heights, **bar_style)

    # Axis labels, title and legend
    plt.xlabel('Datasets')
    plt.ylabel('Test Accuracy')
    plt.title(f'{title} Classifier: After Differential Privacy')
    plt.legend(loc='upper right')

    # Write the PNG first, then display the figure
    output_png = f'/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Results/{title}.png'
    plt.savefig(output_png)
    plt.show()


# Summary chart across all classifiers, built from the main table
def plot_main_graph_from_table(table_path):
    """Draw, save and show the average before/after-DP accuracy chart.

    Reads the CSV at ``table_path`` and uses its ``Classifier``,
    ``Average Test Accuracy (Before DP)`` and
    ``Average Test Accuracy (After DP)`` columns to draw overlaid horizontal
    bars, one pair per classifier. The figure is saved as ``MainGraph.png``
    in the results folder and then displayed.

    Args:
        table_path: Path of the CSV table to plot.
    """
    table = pd.read_csv(table_path)

    # Pull every needed column up front so a missing one fails before drawing
    model_names = table['Classifier'].tolist()
    mean_before = table['Average Test Accuracy (Before DP)'].tolist()
    mean_after = table['Average Test Accuracy (After DP)'].tolist()

    plt.figure(figsize=(12, 8))

    # (bar widths, styling) in drawing order: opaque "before", translucent "after"
    bar_series = (
        (mean_before, {'color': '#1f77b4', 'label': 'Before DP'}),
        (mean_after, {'color': '#ff7f0e', 'label': 'After DP', 'alpha': 0.6}),
    )
    for widths, bar_style in bar_series:
        plt.barh(model_names, widths, **bar_style)

    # Axis labels, title and legend
    plt.xlabel('Average Test Accuracy')
    plt.ylabel('Classifiers')
    plt.title('Average Test Accuracy Before and After Implementing Differential Privacy')
    plt.legend(loc='upper right')

    # Write the PNG first, then display the figure
    output_png = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Results/MainGraph.png'
    plt.savefig(output_png)
    plt.show()

# Reference tables on Google Drive, keyed by the title used for each plot.
# The 'Main Graph' entry is the cross-classifier summary; every other entry
# is a single classifier's table.
table_paths = {
    'Main Graph': '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Results/main_table.csv',
    'Naive Bayes': '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Results/Naive_Bayes_updated_table.csv',
    'Decision Tree': '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Results/Decision_Tree_updated_table.csv',
    'Random Forest': '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Results/Random_Forest_updated_table.csv',
    'SVM': '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Results/SVC_updated_table.csv',
    'KNN': '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Results/KNN_updated_table.csv'
}

# Summary chart first
plot_main_graph_from_table(table_paths['Main Graph'])

# Then one chart per classifier, skipping the summary entry
for title, table_path in table_paths.items():
    if title == 'Main Graph':
        continue
    plot_classifier_graph_from_table(title, table_path)
