<a href="https://colab.research.google.com/github/arham5siddiqui/Mitigating-Linkability-Attacks-through-Differential-Privacy-enabled-Neural-Network-Training/blob/main/Step3_User_Identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

Shuffling the training file in chunks, in order to include multiple classes (data from multiple users) in every chunk.

In [None]:
# The classifiers are trained one chunk at a time because the training file
# cannot be processed at its complete size, so every chunk has to contain a mix
# of users. Shuffling rows only *inside* each chunk never moves a row to a
# different chunk: if the input is grouped by user, the chunks would stay
# single-user. This cell therefore performs a global, out-of-core shuffle:
#   1. count the rows to decide how many temporary bucket files are needed,
#   2. scatter every row into a randomly chosen bucket file,
#   3. shuffle each bucket in memory and append it to the output file.
import math
import os
import tempfile

input_file_path = "/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Reduced_Combined_Train.csv"
output_file_path = "/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Shuffled_Reduced_Combined_Train.csv"

chunk_size = 10000  # Approximate number of rows held in memory at once

# Pass 1: count the data rows (only the first column is parsed to keep it cheap)
total_rows = sum(
    len(part)
    for part in pd.read_csv(input_file_path, usecols=[0], chunksize=chunk_size)
)

# One bucket per chunk-sized slice keeps each bucket at roughly chunk_size rows
num_buckets = max(1, math.ceil(total_rows / chunk_size))
rng = np.random.default_rng()

# The buckets are written to local temporary storage, which needs room for one
# extra copy of the data; the directory is removed when the block exits.
with tempfile.TemporaryDirectory() as bucket_dir:
    bucket_paths = [
        os.path.join(bucket_dir, f"bucket_{i}.csv") for i in range(num_buckets)
    ]

    # Pass 2: send every row to a uniformly random bucket
    for chunk in pd.read_csv(input_file_path, chunksize=chunk_size):
        bucket_ids = rng.integers(0, num_buckets, size=len(chunk))
        for bucket_id, rows in chunk.groupby(bucket_ids):
            bucket_path = bucket_paths[bucket_id]
            rows.to_csv(
                bucket_path,
                mode='a',
                header=not os.path.exists(bucket_path),
                index=False,
            )

    # Pass 3: shuffle each bucket in memory and append it to the output file
    first_one = True
    for bucket_path in bucket_paths:
        if not os.path.exists(bucket_path):
            continue  # No row happened to land in this bucket
        # round_trip parsing restores exactly the float values written in pass 2
        bucket = pd.read_csv(bucket_path, float_precision='round_trip')
        shuffled_bucket = bucket.sample(frac=1).reset_index(drop=True)
        shuffled_bucket.to_csv(
            output_file_path,
            mode='w' if first_one else 'a',
            header=first_one,
            index=False,
        )
        first_one = False

# An input without data rows still produces an output file holding the header
if first_one:
    pd.read_csv(input_file_path, nrows=0).to_csv(output_file_path, index=False)


Features: all columns except the label columns 'userID' and 'videoID'.
Each classifier is trained first,
and then evaluated on the test set.

In [None]:

# Classifiers under comparison, keyed by the name used in the results table
classifiers = {
    "SGD": SGDClassifier(),
    "GaussianNB": GaussianNB(),
    "RandomForest": RandomForestClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
}

# Flag per classifier: has it been fitted (and can therefore be tested)?
fitted = {name: False for name in classifiers}

# Per-chunk test accuracies: one entry per (classifier, test chunk)
accuracy_dict = {'Classifier': [], 'Dataset': [], 'Accuracy': []}

# (classifier name, train accuracy) pairs for classifiers fitted with fit()
train_accuracy_list = []

# Final table: one row per classifier with its aggregated accuracies
final_dict = {'Classifier': [], 'Dataset': [], 'Accuracy': [], 'Train Accuracy': []}

# Initialize the list of dataset types
dataset_types = ['Quaternion', 'Euler', 'Yaw']  # Adding actual dataset types

# Define chunk size
chunk_size = 10 ** 4  # Adjust based on your available memory
subset_size = 10 ** 4  # Size for training classifiers that don't support partial_fit

# Scan the whole shuffled training file to collect every class label.
# partial_fit must be told the complete label set, because a single chunk is
# not guaranteed to contain every user. Only the 'userID' column is parsed so
# the scan does not pay for loading the feature columns.
unique_classes = set()
for chunk in pd.read_csv(
    "/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Shuffled_Reduced_Combined_Train.csv",
    usecols=['userID'],
    chunksize=chunk_size,
):
    unique_classes.update(chunk['userID'].unique())
# Sorted so the class array is the same on every run (set order is arbitrary)
unique_classes = np.array(sorted(unique_classes))


# Incremental training step for the classifiers that expose partial_fit
def train_partial_fit(chunk):
    """Feed one training chunk to every classifier that has ``partial_fit``.

    The features are all columns of ``chunk`` except 'userID' and 'videoID';
    the target is the 'userID' column. A chunk whose target holds at most one
    distinct value is skipped entirely. Every classifier that is updated is
    marked as fitted in the module-level ``fitted`` dictionary.
    """
    features = chunk.drop(columns=['userID', 'videoID'])
    labels = chunk['userID']

    # Guard: only train on chunks that contain more than one class
    if len(np.unique(labels)) <= 1:
        return

    for name, clf in classifiers.items():
        if not hasattr(clf, 'partial_fit'):
            continue
        clf.partial_fit(features, labels, classes=unique_classes)
        fitted[name] = True

# Evaluation step: score every fitted classifier on one chunk
def test_classifiers(chunk, dataset_type):
    """Record the accuracy of each fitted classifier on ``chunk``.

    The features are all columns of ``chunk`` except 'userID' and 'videoID';
    the ground truth is the 'userID' column. For every classifier flagged in
    ``fitted``, one entry (classifier name, ``dataset_type``, accuracy on this
    chunk) is appended to the module-level ``accuracy_dict``. Classifiers that
    have not been fitted are skipped.
    """
    features = chunk.drop(columns=['userID', 'videoID'])
    truth = chunk['userID']

    for name, clf in classifiers.items():
        if not fitted[name]:
            continue
        score = accuracy_score(truth, clf.predict(features))
        record = (('Classifier', name), ('Dataset', dataset_type), ('Accuracy', score))
        for column, value in record:
            accuracy_dict[column].append(value)


# File locations used by the training / evaluation / reporting steps below
train_csv_path = "/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Shuffled_Reduced_Combined_Train.csv"
test_csv_path = "/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Reduced_Combined_Test.csv"
results_csv_path = "/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/results.csv"

# Training loop for classifiers that support partial_fit
for chunk in pd.read_csv(train_csv_path, chunksize=chunk_size):
    train_partial_fit(chunk)

# Fit the remaining classifiers once on the first subset_size rows of the
# shuffled training file. NOTE: this subset only represents all users if the
# file has been shuffled across its whole length.
subset = pd.read_csv(train_csv_path, nrows=subset_size)
X_subset = subset.drop(['userID', 'videoID'], axis=1)
y_subset = subset['userID']
for name, clf in classifiers.items():
    if not fitted[name]:
        clf.fit(X_subset, y_subset)
        fitted[name] = True
        # Train accuracy is measured on the same subset the classifier was fitted on
        train_accuracy = clf.score(X_subset, y_subset)
        train_accuracy_list.append((name, train_accuracy))

# Test the classifiers chunk by chunk, remembering how many rows stand behind
# each per-chunk accuracy that test_classifiers appends to accuracy_dict
first_new_entry = len(accuracy_dict['Accuracy'])
rows_per_entry = []
for chunk in pd.read_csv(test_csv_path, chunksize=chunk_size):
    entries_before = len(accuracy_dict['Accuracy'])
    test_classifiers(chunk, 'Test')
    entries_added = len(accuracy_dict['Accuracy']) - entries_before
    rows_per_entry.extend([len(chunk)] * entries_added)

chunk_scores = pd.DataFrame({
    'Classifier': accuracy_dict['Classifier'][first_new_entry:],
    'Accuracy': accuracy_dict['Accuracy'][first_new_entry:],
    'Rows': rows_per_entry,
})

# Train accuracy exists only for classifiers fitted with fit(); others get NaN
train_accuracy_by_name = dict(train_accuracy_list)

# Aggregate the per-chunk accuracies into one test accuracy per classifier.
# The mean is weighted by chunk size: a plain mean would give the (usually
# shorter) last chunk the same influence as a full chunk and would not equal
# the accuracy over the whole test file.
for name in classifiers:
    scores = chunk_scores[chunk_scores['Classifier'] == name]
    if scores.empty:
        continue
    avg_accuracy = np.average(scores['Accuracy'], weights=scores['Rows'])
    final_dict['Classifier'].append(name)
    final_dict['Dataset'].append('Test')  # This will be 'Test' for all classifiers in this case
    final_dict['Accuracy'].append(avg_accuracy)
    final_dict['Train Accuracy'].append(train_accuracy_by_name.get(name, np.nan))

# Save results to CSV
results_df = pd.DataFrame(final_dict)
results_df.to_csv(results_csv_path, index=False)
