In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import random

In [None]:
# Load the adult data set from a comma-separated file in the working directory.
adult_data = pd.read_csv(
    filepath_or_buffer='adult_data.csv',
    sep=',',
)

In [None]:
# Remove rows containing NaN values, renumber the index, and drop the two
# features that are not used for learning: fnlwgt and education.
# NOTE(review): dropna() only removes real NaN cells; if the CSV marks missing
# values with a placeholder such as '?', those rows are kept — confirm.
adult_data = (
    adult_data
    .dropna()
    .reset_index(drop=True)
    .drop(columns=['fnlwgt', 'education'])
)

# Data columns and their types, followed by a preview of the first rows.
adult_data.info()
adult_data.head(10)

In [None]:
# Encode marital-status into two groups: Married or Unmarried.
# The result is assigned back to the column instead of calling
# `adult_data[col].replace(..., inplace=True)`: that form mutates an
# intermediate Series (chained assignment), which newer pandas versions warn
# about and which has no effect on the frame under Copy-on-Write.
marital_status_groups = {
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'Married',
    'Married-AF-spouse': 'Married',
    'Divorced': 'Unmarried',
    'Never-married': 'Unmarried',
    'Separated': 'Unmarried',
    'Widowed': 'Unmarried',
}
adult_data['marital-status'] = adult_data['marital-status'].replace(marital_status_groups)

In [None]:
# Re-type every object-dtype column as a pandas categorical so that integer
# category codes are available through the `.cat` accessor.
obj_columns = adult_data.select_dtypes(include=['object']).columns
adult_data[obj_columns] = adult_data[obj_columns].astype(dtype='category')

In [None]:
# Convert integer columns to floats.
num_columns = adult_data.select_dtypes(['int64']).columns
adult_data[num_columns] = adult_data[num_columns].astype('float64')

# Encode the categorical attributes as their integer category codes.
# Before each column is overwritten, a {code: original label} lookup is kept
# under a name matching the column so the codes can be decoded later.
# The income lookup now has its own name; it was previously stored in
# `marital_status` and immediately overwritten by the marital-status lookup.
income = dict(zip(adult_data['income'].cat.codes, adult_data['income']))
adult_data['income'] = adult_data['income'].cat.codes
marital_status = dict(zip(adult_data['marital-status'].cat.codes, adult_data['marital-status']))
adult_data['marital-status'] = adult_data['marital-status'].cat.codes
occupation = dict(zip(adult_data['occupation'].cat.codes, adult_data['occupation']))
adult_data['occupation'] = adult_data['occupation'].cat.codes
relationship = dict(zip(adult_data['relationship'].cat.codes, adult_data['relationship']))
adult_data['relationship'] = adult_data['relationship'].cat.codes
race = dict(zip(adult_data['race'].cat.codes, adult_data['race']))
adult_data['race'] = adult_data['race'].cat.codes
gender = dict(zip(adult_data['gender'].cat.codes, adult_data['gender']))
adult_data['gender'] = adult_data['gender'].cat.codes
native_country = dict(zip(adult_data['native-country'].cat.codes, adult_data['native-country']))
adult_data['native-country'] = adult_data['native-country'].cat.codes
workclass = dict(zip(adult_data['workclass'].cat.codes, adult_data['workclass']))
adult_data['workclass'] = adult_data['workclass'].cat.codes

# Cast the int8 code columns to float64 so the whole frame is float-typed.
num_columns = adult_data.select_dtypes(['int8']).columns
adult_data[num_columns] = adult_data[num_columns].astype('float64')

# info() prints its summary and returns None, so it is called directly
# rather than wrapped in display(), which would also render "None".
adult_data.info()
display(adult_data.head(10))

In [None]:
# Convert the data set from pandas to numpy.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# after `adult_data` has already been converted is a no-op instead of raising
# AttributeError (an ndarray has no .to_numpy()).
adult_data = np.asarray(adult_data)

In [None]:
# Split the data into training (80%) and testing (20%) sets with a fixed seed.
# NOTE(review): assumes the last column is the encoded income label and all
# preceding columns are features — confirm the CSV column order.
X_train, X_test, y_train, y_test = train_test_split(
    adult_data[:, :-1],
    adult_data[:, -1],
    test_size=0.2,
    random_state=92,
)


In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Hyper-parameters of the original (provider) model; the user model's
# training cell reads the same constants.
EPOCHS, BATCH_SIZE = 20, 64
LEARNING_RATE = 1e-3

In [None]:
## train data
class TrainData(Dataset):
    """Dataset that pairs each feature row with the label at the same position."""

    def __init__(self, X_data, y_data):
        # Both containers are stored as given and indexed in parallel.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the (features, label) pair stored at `index`."""
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label


# Wrap the scaled training features and their labels as float tensors.
train_data = TrainData(
    X_data=torch.FloatTensor(X_train),
    y_data=torch.FloatTensor(y_train),
)
## test data
class TestData(Dataset):
    """Dataset that serves feature rows only (no labels)."""

    def __init__(self, X_data):
        # Stored as given; items are looked up by position.
        self.X_data = X_data

    def __len__(self):
        """Number of feature rows."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the feature row stored at `index`."""
        row = self.X_data[index]
        return row
    

# Wrap the scaled test features as a float tensor (labels stay in y_test).
test_data = TestData(X_data=torch.FloatTensor(X_test))

In [None]:
# Data loaders for the provider model.
# Training batches are shuffled; the test loader yields one row at a time in
# the original order, so predictions line up with y_test by position.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1)

In [None]:
# models

class provider_model(nn.Module):
    """Fully connected binary classifier with two hidden layers.

    Each hidden layer is Linear -> ReLU -> BatchNorm1d; dropout is applied
    after the second hidden block. The output is a single raw logit per row
    (no sigmoid), which is the input form expected by BCEWithLogitsLoss.

    Args:
        in_features: number of input features (default 12).
        hidden_size: width of both hidden layers (default 100).
        dropout: dropout probability before the output layer (default 0.1).
    """

    def __init__(self, in_features=12, hidden_size=100, dropout=0.1):
        # Zero-argument super(): the former super(BinaryClassification, self)
        # named a class that does not exist and raised NameError on construction.
        super().__init__()
        self.layer_1 = nn.Linear(in_features, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.layer_out = nn.Linear(hidden_size, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, inputs):
        """Map a (batch, in_features) tensor to (batch, 1) logits."""
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x
class user_model(nn.Module):
    """Fully connected binary classifier with two 64-unit hidden layers.

    Each hidden layer is Linear -> ReLU -> BatchNorm1d; dropout is applied
    after the second hidden block. The output is a single raw logit per row
    (no sigmoid), which is the input form expected by BCEWithLogitsLoss.

    Args:
        in_features: number of input features (default 12).
        hidden_size: width of both hidden layers (default 64).
        dropout: dropout probability before the output layer (default 0.1).
    """

    def __init__(self, in_features=12, hidden_size=64, dropout=0.1):
        # Zero-argument super(): the former super(BinaryClassification, self)
        # named a class that does not exist and raised NameError on construction.
        super().__init__()
        self.layer_1 = nn.Linear(in_features, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.layer_out = nn.Linear(hidden_size, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, inputs):
        """Map a (batch, in_features) tensor to (batch, 1) logits."""
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Initialize the provider model on the selected device, together with the
# loss (binary cross-entropy on raw logits) and the Adam optimizer that
# updates this model's parameters.
model = provider_model().to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# get the accuracy of the model
def binary_acc(y_pred, y_test):
    """Return the accuracy of raw logits against 0/1 labels, as a percentage.

    The logits are passed through a sigmoid and rounded to obtain the predicted
    class. The fraction of matches over `y_test.shape[0]` is scaled to 0-100 and
    rounded to a whole number; the result is a 0-d tensor.
    """
    predicted = torch.sigmoid(y_pred).round()
    n_correct = predicted.eq(y_test).sum().float()
    return torch.round(n_correct / y_test.shape[0] * 100)

In [None]:
# Train the provider model.
# Each epoch makes one pass over train_loader and then prints the mean
# per-batch loss and the mean of the per-batch (already rounded) accuracies.
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss, epoch_acc = 0, 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        # Labels come as (batch,); add a trailing axis to match the (batch, 1) logits.
        targets = y_batch.unsqueeze(1)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, targets)
        acc = binary_acc(y_pred, targets)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [None]:
# Predict a 0/1 label for every test row with the provider model.
model.eval()
y_pred_list = []
with torch.no_grad():
    for X_batch in test_loader:
        # logits -> probability -> hard label
        y_test_pred = torch.sigmoid(model(X_batch.to(device)))
        y_pred_tag = y_test_pred.round()
        y_pred_list.append(y_pred_tag.cpu().numpy())

# test_loader yields one row per batch, so squeezing each array leaves a
# single value and the result is a flat list of floats.
y_pred_list = [batch_pred.squeeze().tolist() for batch_pred in y_pred_list]

In [None]:
# Accuracy (in percent) of the provider model on the test set:
# count the positions where the prediction equals the true label.
counter = sum(1 for i in range(len(y_test)) if y_pred_list[i] == y_test[i])
acc = counter / y_test.shape[0] * 100
print(acc)

In [None]:
# Confusion matrix of the provider model predictions on the test set.
confusion_matrix(y_true=y_test, y_pred=y_pred_list)

In [None]:
# Full classification report of the provider model predictions.
provider_report = classification_report(y_true=y_test, y_pred=y_pred_list)
print(provider_report)

In [None]:
# Size of the user's share of the original training data: 0.5% of the
# training rows. Despite its name this holds a (float) row count.
percentage = len(X_train) * 0.5 / 100

In [None]:
# Amount of data owned by the user before the augmentation.
# The value is a float row count (0.5% of len(X_train)), not a percentage.
percentage

In [None]:
# Randomly select, without replacement, the row indices of the user's data.
# No seed for the `random` module is set in the visible cells, so the
# selection differs between runs.
id_numbers = random.sample(range(len(X_train)), k=int(percentage))

In [None]:
# Create the list of the user's data rows.
# Rows are looked up by the sampled ids; indexing X_train with the loop
# counter would always take the first len(id_numbers) rows and ignore the
# random selection.
shared_data = [X_train[idx] for idx in id_numbers]

In [None]:
# Augment the user's data by mixing every unordered pair of rows (i < j).
# Each pair contributes n synthetic rows:
#   * one plain average of the two rows, and
#   * n-1 per-feature convex combinations whose weights are drawn from
#     Beta(alpha, alpha).
n = 3
alpha = 0.25
new_X_train = []
# Feature count is the same for every row, so read it once (0 when there is no data).
num_ftrs = len(shared_data[0]) if shared_data else 0
for i in range(len(shared_data)):
    for j in range(i + 1, len(shared_data)):
        mixed = (shared_data[i] + shared_data[j]) / 2
        new_X_train.append(mixed)
        # `_` keeps the repetition counter from overwriting the pair index `i`;
        # reusing `i` here made the mixes below (and every later j) use row 0/1.
        for _ in range(n - 1):
            # Random weights of length num_ftrs from the beta distribution.
            lam = np.random.beta(alpha, alpha, size=num_ftrs)
            mixed = lam * shared_data[i] + (1 - lam) * shared_data[j]
            new_X_train.append(mixed)

In [None]:
# Total number of rows owned by the user after the augmentation
# (new_X_train holds only the mixed rows produced by the augmentation loop).
len(new_X_train)

In [None]:
# The user data loader; this data will be labeled by the provider model.
# new_X_train is a Python list of 1-D arrays: stack it into one 2-D ndarray
# first, because building a tensor directly from a list of ndarrays goes
# through a slow element-by-element path (PyTorch warns about it).
new_X_train1 = TestData(torch.FloatTensor(np.asarray(new_X_train)))
# One row per batch, in order, so each label lines up with its row.
local_labeling_loader = DataLoader(dataset=new_X_train1, batch_size=1)

In [None]:
# Query the provider model for a 0/1 label for every augmented row.
model.eval()
Local_labels = []
with torch.no_grad():
    for X_batch in local_labeling_loader:
        # logits -> probability -> hard label
        y_test_pred = torch.sigmoid(model(X_batch.to(device)))
        y_pred_tag = y_test_pred.round()
        Local_labels.append(y_pred_tag.cpu().numpy())

# The loader yields one row per batch, so each squeezed array is a single value.
Local_labels = [batch_label.squeeze().tolist() for batch_label in Local_labels]

In [None]:
# Initialize the user model on the selected device.
# `criterion` and `optimizer` are rebound here: from this cell on, the
# optimizer updates local_model's parameters, not the provider model's.
local_model = user_model().to(device)
print(local_model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=local_model.parameters(), lr=LEARNING_RATE)

In [None]:
# Save the user's augmented data and the labels obtained for it as CSV files.
# NOTE(review): the destination is a hard-coded absolute path under
# /content/gdrive (presumably a mounted Google Drive) — confirm it exists
# before running.
np.savetxt(
    fname="/content/gdrive/MyDrive/work/adult/New_training_data_set.csv",
    X=new_X_train,
    delimiter=",",
)
np.savetxt(
    fname="/content/gdrive/MyDrive/work/adult/Labels_of_the_new_training_data_set.csv",
    X=Local_labels,
    delimiter=",",
)

In [None]:
# The data loader of the user's labeled data set.
# new_X_train is a Python list of 1-D arrays: stack it into one 2-D ndarray
# before building the tensor, because converting a list of ndarrays directly
# goes through a slow element-by-element path (PyTorch warns about it).
local_train_data = TrainData(torch.FloatTensor(np.asarray(new_X_train)),
                             torch.FloatTensor(Local_labels))
local_train_loader = DataLoader(dataset=local_train_data, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Train the user model on the augmented rows labeled by the provider model.
# Uses the `criterion` / `optimizer` currently bound at module level; the
# optimizer must be the one created over local_model.parameters().
# Each epoch prints the mean per-batch loss and the mean of the per-batch
# (already rounded) accuracies.
local_model.train()
for e in range(1, EPOCHS+1):
    epoch_loss, epoch_acc = 0, 0
    for X_batch, y_batch in local_train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        # Labels come as (batch,); add a trailing axis to match the (batch, 1) logits.
        targets = y_batch.unsqueeze(1)

        optimizer.zero_grad()
        y_pred = local_model(X_batch)
        loss = criterion(y_pred, targets)
        acc = binary_acc(y_pred, targets)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(local_train_loader):.5f} | Acc: {epoch_acc/len(local_train_loader):.3f}')

In [None]:
# Predict a 0/1 label for every test row with the user model.
# This rebinds y_pred_list, replacing the provider model's predictions.
local_model.eval()
y_pred_list = []
with torch.no_grad():
    for X_batch in test_loader:
        # logits -> probability -> hard label
        y_test_pred = torch.sigmoid(local_model(X_batch.to(device)))
        y_pred_tag = y_test_pred.round()
        y_pred_list.append(y_pred_tag.cpu().numpy())

# test_loader yields one row per batch, so each squeezed array is a single value.
y_pred_list = [batch_pred.squeeze().tolist() for batch_pred in y_pred_list]

In [None]:
# Confusion matrix of the user model predictions on the test set.
confusion_matrix(y_true=y_test, y_pred=y_pred_list)

In [None]:
# Accuracy (in percent) of the user model on the test set:
# count the positions where the prediction equals the true label.
counter = sum(1 for i in range(len(y_test)) if y_pred_list[i] == y_test[i])
acc = counter / y_test.shape[0] * 100
print(acc)

In [None]:
# Full classification report of the user model predictions.
user_report = classification_report(y_true=y_test, y_pred=y_pred_list)
print(user_report)

In [None]:
# Save the provider model's learned parameters (state dict only, not the
# module object) to a hard-coded path under /content/gdrive.
torch.save(
    obj=model.state_dict(),
    f='/content/gdrive/MyDrive/work/adult/original_model.pth',
)

In [None]:
# Save the user model's learned parameters (state dict only, not the
# module object) to a hard-coded path under /content/gdrive.
torch.save(
    obj=local_model.state_dict(),
    f='/content/gdrive/MyDrive/work/adult/local_model.pth',
)