<a href="https://colab.research.google.com/github/aayushhyadav/MIA/blob/feature%2Fadult/Adult.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install seaborn

In [None]:
!pip install tensorflow-privacy

In [None]:
!pip install -U matplotlib

In [None]:
!pip install -U tensorflow-datasets

In [None]:
!pip install -U pandas

In [None]:
# Colab-only upload step: the next cell runs `!unzip adult.zip`, so the archive
# chosen here is expected to be named adult.zip.
from google.colab import files
files.upload()

In [None]:
!unzip adult.zip

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
import math
import statistics
import random

In [None]:
def create_model(optimizer, loss, input_dim=104, num_classes=2, hidden_units=(128, 128)):
  """Build and compile a fully connected softmax classifier.

  Args:
    optimizer: optimizer forwarded unchanged to `model.compile`.
    loss: loss forwarded unchanged to `model.compile`.
    input_dim: number of input features. Defaults to 104, the width that was
      previously hard-coded, so existing two-argument calls behave as before.
    num_classes: width of the softmax output layer (default 2).
    hidden_units: sizes of the ReLU hidden layers, in order (default two
      layers of 128 units).

  Returns:
    The compiled `tf.keras.Sequential` model, with accuracy as its metric.
  """
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Input(shape=(input_dim,)))

  for units in hidden_units:
    model.add(tf.keras.layers.Dense(units, activation='relu'))

  model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

  model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

  return model

In [None]:
def get_unequal_class_distribution(data_x, data_y, neg_class):
  """Sub-sample one class to produce an imbalanced training set.

  Rows whose first label column equals 1 are kept only until `neg_class` of
  them have been collected; rows whose first label column equals 0 are always
  kept. Rows with any other value in that column are dropped. The original row
  order is preserved.

  Args:
    data_x: 2-D feature array, one row per example.
    data_y: 2-D label array aligned with `data_x`; only column 0 is inspected.
      # NOTE(review): presumably one-hot labels with 'income_<=50K' first - confirm against caller
    neg_class: maximum number of rows with `data_y[i][0] == 1` to keep.

  Returns:
    (train_X, train_y): NumPy arrays holding the selected rows.
  """
  data_x = np.asarray(data_x)
  data_y = np.asarray(data_y)

  kept_rows = []
  capped_seen = 0

  for row, label in enumerate(data_y):
    if label[0] == 1:
      # strict '<' so that exactly neg_class rows are kept
      # (the previous '<=' let neg_class + 1 rows through)
      if capped_seen < neg_class:
        capped_seen += 1
        kept_rows.append(row)
    elif label[0] == 0:
      kept_rows.append(row)

  # explicit integer dtype keeps the fancy index valid when nothing was selected
  kept_rows = np.asarray(kept_rows, dtype=np.intp)
  train_X = data_x[kept_rows]
  train_y = data_y[kept_rows]

  print(f"{data_x.shape, train_X.shape}")
  print(f"{data_y.shape, train_y.shape}")

  return train_X, train_y

In [None]:
def plot_learning_curves(history):
  """Plot training vs. validation loss and accuracy side by side.

  Args:
    history: object whose `.history` mapping provides the per-epoch series
      'loss', 'val_loss', 'accuracy' and 'val_accuracy' (as returned by
      `model.fit` when validation data is supplied).
  """
  fig, (ax1, ax2) = plt.subplots(1, 2)

  ax1.set_title('Cross Entropy Loss')
  ax1.plot(history.history['loss'], color = 'blue', label = 'train')
  ax1.plot(history.history['val_loss'], color = 'red', label = 'validation')
  ax1.set_xlabel('Epoch')
  # the label= arguments above are only rendered once a legend is requested
  ax1.legend()

  ax2.set_title('Accuracy')
  ax2.plot(history.history['accuracy'], color = 'blue', label = 'train')
  ax2.plot(history.history['val_accuracy'], color = 'red', label = 'validation')
  ax2.set_xlabel('Epoch')
  ax2.legend()

  plt.show()

In [None]:
def cal_entropy(prob_vec):
  """Shannon entropy, in bits, of a probability vector.

  Args:
    prob_vec: iterable of probabilities.

  Returns:
    -sum(p * log2(p)) taken over the non-zero entries; zero entries are
    skipped so that log(0) is never evaluated.
  """
  weighted_log_total = 0

  for p in prob_vec:
    if p == 0:
      continue
    weighted_log_total = weighted_log_total + p * math.log(p, 2)

  return -1 * weighted_log_total

In [None]:
def cal_std(prob_vec):
  """Population standard deviation of a probability vector.

  Args:
    prob_vec: non-empty sequence of numbers (`statistics.mean` raises on an
      empty one).

  Returns:
    sqrt(mean((p - mean(prob_vec)) ** 2)), i.e. the variance is divided by
    len(prob_vec), not len(prob_vec) - 1.
  """
  center = statistics.mean(prob_vec)

  squared_dev_total = 0
  for p in prob_vec:
    deviation = p - center
    squared_dev_total = squared_dev_total + deviation ** 2

  return math.sqrt(squared_dev_total / len(prob_vec))

In [None]:
def cal_auc(train_vec, test_vec, pos_count, neg_count, inc_threshold = 0.1):
  """AUC of a threshold attack, computed with the trapezoidal rule.

  An example is flagged as a member when its metric value is <= threshold.
  Thresholds sweep from 0 up to at most 1 in steps of `inc_threshold`; the
  resulting (FPR, TPR) points are scattered on the current pyplot figure and
  integrated.

  Args:
    train_vec: metric values (e.g. entropy) for training-set members.
    test_vec: metric values for non-members.
    pos_count: number of members, used as the TPR denominator.
    neg_count: number of non-members, used as the FPR denominator.
    inc_threshold: spacing between consecutive thresholds; must be positive.

  Returns:
    The area under the sampled ROC points as a float (0.0 when fewer than two
    thresholds fit into [0, 1]).

  Raises:
    ValueError: if `inc_threshold` is not positive.
    ZeroDivisionError: if `pos_count` or `neg_count` is zero.

  NOTE(review): the curve is not anchored at (0, 0); when some values are <= 0
  the area left of the first sampled point is not counted - confirm intended.
  """
  if inc_threshold <= 0:
    raise ValueError("inc_threshold must be positive")
  if pos_count == 0 or neg_count == 0:
    raise ZeroDivisionError("pos_count and neg_count must be non-zero")

  # Thresholds are k * inc_threshold rather than a running sum: repeatedly
  # adding 0.1 yields 0.9999999999999999 instead of 1.0, so the final threshold
  # (and with it the (1, 1) end of the curve) used to be skipped.
  n_steps = int(1.0 / inc_threshold + 1e-9)
  thresholds = np.minimum(np.arange(n_steps + 1) * inc_threshold, 1.0)

  # Sorting once and binary-searching every threshold replaces the former
  # thresholds x examples Python loop; side='right' counts values <= threshold.
  member_scores = np.sort(np.asarray(train_vec, dtype=float))
  non_member_scores = np.sort(np.asarray(test_vec, dtype=float))

  tpr = np.searchsorted(member_scores, thresholds, side='right') / pos_count
  fpr = np.searchsorted(non_member_scores, thresholds, side='right') / neg_count

  # Both rates are non-decreasing in the threshold, so the points are already
  # ordered by FPR. The trapezoid sum is written out explicitly because
  # np.trapz is deprecated in newer NumPy releases.
  auc = float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0))

  plt.scatter(fpr, tpr)
  plt.xlabel('False positive rate')
  plt.ylabel('True positive rate')
  plt.show()

  return auc

In [None]:
def get_threshold(vec, percentile=90):
  """Return the decision threshold as a percentile of `vec`.

  Args:
    vec: metric values to take the percentile of.
    percentile: percentile in [0, 100]; defaults to 90, the value that was
      previously hard-coded.

  Returns:
    `np.percentile(vec, percentile)`.
  """
  return np.percentile(vec, percentile)

In [None]:
def cal_pre_recall(train_vec, test_vec, threshold):
  """Precision and recall of a threshold attack.

  An example is predicted to be a member when its value is >= threshold.

  Args:
    train_vec: metric values of true members.
    test_vec: metric values of true non-members.
    threshold: decision threshold.

  Returns:
    (precision, recall). A ratio whose denominator is zero (nothing predicted
    as a member, or no members supplied) is reported as 0.0 instead of raising
    ZeroDivisionError.

  NOTE(review): cal_auc flags members with `<=` while this function uses `>=`;
  confirm the direction is right for every metric this is called with.
  """
  tp = 0
  fn = 0
  for val in train_vec:
    if val >= threshold:
      tp += 1
    else:
      fn += 1

  fp = 0
  for val in test_vec:
    if val >= threshold:
      fp += 1

  predicted_members = tp + fp
  actual_members = tp + fn

  precision = tp / predicted_members if predicted_members else 0.0
  recall = tp / actual_members if actual_members else 0.0

  return precision, recall

In [None]:
def make_metric_dist_same(num_class, max_prob_train, max_prob_class):
  """Synthesize a probability vector whose maximum mimics the training set.

  The entry at `max_prob_class` is set to a value drawn at random from
  `max_prob_train`; the remaining mass is handed out in random increments to
  the other classes, each of which stays strictly below that maximum.

  Args:
    num_class: number of classes in the classification task.
    max_prob_train: non-empty sequence of maximum probabilities observed on
      training examples.
    max_prob_class: index of the class predicted for the test example.

  Returns:
    A list of `num_class` probabilities that sums to 1 within 1e-12.

  Raises:
    ValueError: if the other classes cannot absorb the remaining mass while
      staying below the sampled maximum (the loop could never finish).
  """
  eps = 1e-12  # leftover mass below this is treated as fully distributed

  p = [0] * num_class
  # float() so NumPy scalars from the training metrics do not leak mixed
  # precision into the arithmetic below
  top = float(random.choice(max_prob_train))
  p[max_prob_class] = top
  rem_p = 1 - top

  if rem_p > eps and (num_class - 1) * top < rem_p:
    raise ValueError(
        "cannot distribute the remaining probability mass below the sampled maximum")

  i = 0

  while(rem_p > eps):
    if(i != max_prob_class):
      update_p_i = random.uniform(0, min(rem_p, top))

      if(p[i] + update_p_i < top):
        p[i] += update_p_i
        # subtract only what was just handed out; subtracting the cumulative
        # p[i] (as before) over-counted on every revisit of a class and left
        # vectors that did not sum to 1
        rem_p -= update_p_i

    i = (i + 1) % num_class

  return p

In [None]:
def compute_train_metrics(prob_vec_train):
  """Per-example and average metrics for training-set prediction vectors.

  For every prediction vector the entropy (`cal_entropy`), standard deviation
  (`cal_std`) and maximum probability are computed; the averages are printed.

  Returns:
    (entropy_train, std_train, max_prob_train,
     avg_train_entropy, avg_train_std, avg_train_max_prob)
    where the first three are per-example lists in input order.
  """
  per_example = [(cal_entropy(vec), cal_std(vec), max(vec)) for vec in prob_vec_train]

  entropy_train = [entropy for entropy, _, _ in per_example]
  std_train = [std for _, std, _ in per_example]
  max_prob_train = [top for _, _, top in per_example]

  total_entropy = 0
  total_std = 0
  total_max_prob = 0
  for entropy, std, top in per_example:
    total_entropy = total_entropy + entropy
    total_std = total_std + std
    total_max_prob = total_max_prob + top

  n_examples = len(prob_vec_train)
  avg_train_entropy = total_entropy / n_examples
  avg_train_std = total_std / n_examples
  avg_train_max_prob = total_max_prob / n_examples

  print(f"Avg train entropy - {avg_train_entropy}, Avg train std - {avg_train_std}, Avg train max probability - {avg_train_max_prob}")

  return entropy_train, std_train, max_prob_train, avg_train_entropy, avg_train_std, avg_train_max_prob

In [None]:
def compute_test_metrics(prob_vec_test):
  """Per-example and average metrics for test-set prediction vectors.

  For every prediction vector the entropy (`cal_entropy`), standard deviation
  (`cal_std`) and maximum probability are computed; the averages are printed.

  Returns:
    (entropy_test, std_test, max_prob_test,
     avg_test_entropy, avg_test_std, avg_test_max_prob)
    where the first three are per-example lists in input order.
  """
  per_example = [(cal_entropy(vec), cal_std(vec), max(vec)) for vec in prob_vec_test]

  entropy_test = [entropy for entropy, _, _ in per_example]
  std_test = [std for _, std, _ in per_example]
  max_prob_test = [top for _, _, top in per_example]

  total_entropy = 0
  total_std = 0
  total_max_prob = 0
  for entropy, std, top in per_example:
    total_entropy = total_entropy + entropy
    total_std = total_std + std
    total_max_prob = total_max_prob + top

  n_examples = len(prob_vec_test)
  avg_test_entropy = total_entropy / n_examples
  avg_test_std = total_std / n_examples
  avg_test_max_prob = total_max_prob / n_examples

  print(f"Avg test entropy - {avg_test_entropy}, Avg test std - {avg_test_std}, Avg test max probability - {avg_test_max_prob}")

  return entropy_test, std_test, max_prob_test, avg_test_entropy, avg_test_std, avg_test_max_prob

In [None]:
def plot_entropy_dist(entropy_train, entropy_test, entropy_shrinked):
  """Plot entropy distributions for the train, test and metric-mapped sets.

  Draws three side-by-side panels sharing the x axis. The third ("Metric
  Mapping") panel was previously titled but never drawn; it now shows
  `entropy_shrinked`. A panel whose data is None or empty is left blank.

  Args:
    entropy_train: entropies of training-set prediction vectors.
    entropy_test: entropies of test-set prediction vectors.
    entropy_shrinked: entropies after metric mapping, or None to skip.
  """
  figure, axis = plt.subplots(1, 3, figsize=(18, 5), sharex=True)

  panels = (
    (entropy_train, "Entropy Distribution for Training Set"),
    (entropy_test, "Entropy Distribution for Test Set"),
    (entropy_shrinked, "Metric Mapping"),
  )

  for ax, (values, title) in zip(axis, panels):
    if values is not None and len(values) > 0:
      # histplot(kde=True, stat="density") is the replacement for the
      # deprecated sns.distplot
      sns.histplot(x=values, bins=10, kde=True, stat="density", ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Entropy")

In [None]:
def plot_std_dist(std_train, std_test, std_shrinked):
  """Plot standard-deviation distributions for the train, test and mapped sets.

  Draws three side-by-side panels sharing the x axis. The third ("Metric
  Mapping") panel was previously titled but never drawn (and its commented-out
  call referenced the entropy data); it now shows `std_shrinked`. A panel
  whose data is None or empty is left blank.

  Args:
    std_train: standard deviations of training-set prediction vectors.
    std_test: standard deviations of test-set prediction vectors.
    std_shrinked: standard deviations after metric mapping, or None to skip.
  """
  figure, axis = plt.subplots(1, 3, figsize=(18, 5), sharex=True)

  panels = (
    (std_train, "Std Distribution for Training Set"),
    (std_test, "Std Distribution for Test Set"),
    (std_shrinked, "Metric Mapping"),
  )

  for ax, (values, title) in zip(axis, panels):
    if values is not None and len(values) > 0:
      # histplot(kde=True, stat="density") is the replacement for the
      # deprecated sns.distplot
      sns.histplot(x=values, bins=10, kde=True, stat="density", ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Std")

In [None]:
def plot_max_prob_dist(max_prob_train, max_prob_test, max_prob_shrinked):
  """Plot maximum-probability distributions for the train, test and mapped sets.

  Draws three side-by-side panels sharing the x axis. The third ("Metric
  Mapping") panel was previously titled but never drawn (and its commented-out
  call referenced the entropy data); it now shows `max_prob_shrinked`. A panel
  whose data is None or empty is left blank.

  Args:
    max_prob_train: maximum probabilities of training-set prediction vectors.
    max_prob_test: maximum probabilities of test-set prediction vectors.
    max_prob_shrinked: maximum probabilities after metric mapping, or None to
      skip.
  """
  figure, axis = plt.subplots(1, 3, figsize=(18, 5), sharex=True)

  panels = (
    (max_prob_train, "Max Prob Distribution for Training Set"),
    (max_prob_test, "Max Prob Distribution for Test Set"),
    (max_prob_shrinked, "Metric Mapping"),
  )

  for ax, (values, title) in zip(axis, panels):
    if values is not None and len(values) > 0:
      # histplot(kde=True, stat="density") is the replacement for the
      # deprecated sns.distplot
      sns.histplot(x=values, bins=10, kde=True, stat="density", ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Max Probability")

In [None]:
# ---- load and pre-process the Adult data ----
# adult.csv is expected in the working directory
data = pd.read_csv("adult.csv")

# deleting records with missing values
# ('?' is treated as the missing-value marker in these three columns)
cols_with_missing = ['workclass', 'occupation', 'native-country']
data = data[~data[cols_with_missing].eq('?').any(axis=1)]

# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only added a stray "None" line
data.info()

# extracting numerical features
num_features = data.select_dtypes(include=['int64', 'float64']).columns
num_cols = data[num_features]
print(num_cols.head())

# applying one-hot encoding on categorical features
# dtype=float keeps the dummy columns numeric: pandas >= 2.0 defaults to bool,
# and bool columns mixed with the scaled float columns turn the feature matrix
# into an object array when it is converted for training
ohe_data = pd.get_dummies(data, dtype=float)

# keep only the one-hot columns; the index is reset to 0..n-1 so the rows line
# up with scaled_data (built from a bare array) in the concat below
ohe_data = ohe_data.drop(columns=num_features).reset_index(drop=True)
ohe_cols = list(ohe_data)

print(ohe_data.head())

# feature scaling for numerical features
sc = StandardScaler()
scaled_num_cols = sc.fit_transform(num_cols)
scaled_data = pd.DataFrame(scaled_num_cols, columns=num_features)
print(scaled_data.head())

# aggregating pre-processed data
pre_pro_data = pd.concat([scaled_data, ohe_data], axis=1)
pre_pro_data = pre_pro_data.dropna()
print(pre_pro_data.head())

# extracting target labels
label_cols = ['income_<=50K', 'income_>50K']
target_labels = pre_pro_data[label_cols]
features = pre_pro_data.drop(columns=label_cols)
print(target_labels.head())

# features
print(features.head())

# ---- train the target model and score its prediction vectors ----
# Seed every RNG used from here on (Python's `random` drives the metric
# mapping, TensorFlow drives weight init and shuffling) so a fresh
# Restart & Run All reproduces the same numbers.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)

loss = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False, reduction=tf.losses.Reduction.NONE)

train_X, test_X, train_y, test_y = train_test_split(features, target_labels, test_size = 0.25, random_state = SEED)

# for class imbalance
# train_X, train_y = get_unequal_class_distribution(np.array(train_X), np.array(train_y), 20000)

# Convert both splits the same way (previously only the test split became an
# array). An explicit float dtype also guards against one-hot columns stored
# as bool, which would otherwise produce an object array.
train_X = np.asarray(train_X, dtype=np.float32)
train_y = np.asarray(train_y, dtype=np.float32)
test_X = np.asarray(test_X, dtype=np.float32)
test_y = np.asarray(test_y, dtype=np.float32)
print(f'training examples - {train_X.shape}')

model = create_model(optimizer, loss)
history = model.fit(train_X, train_y, epochs = 100, batch_size = 64, validation_data = (test_X, test_y))

plot_learning_curves(history)

# prediction (probability) vectors for members and non-members
prob_vec_train = model.predict(train_X)
prob_vec_test = model.predict(test_X)

entropy_train, std_train, max_prob_train, avg_train_entropy, avg_train_std, avg_train_max_prob = compute_train_metrics(prob_vec_train)
entropy_test, std_test, max_prob_test, avg_test_entropy, avg_test_std, avg_test_max_prob = compute_test_metrics(prob_vec_test)

# ---- metric mapping and attack AUC ----
solutions = []

# applying metric mapping: every test prediction is replaced by a synthetic
# vector that keeps the predicted class but borrows its confidence from the
# training set
num_classes = prob_vec_test.shape[1]

for p in prob_vec_test:
  predicted_class = int(np.argmax(p))
  solutions.append(make_metric_dist_same(num_classes, max_prob_train, predicted_class))

# note: compute_test_metrics labels its printed averages "test"; here they
# describe the mapped vectors
entropy_shrinked, std_shrinked, max_prob_shrinked, avg_shrinked_entropy, avg_shrinked_std, avg_shrinked_max_prob = compute_test_metrics(np.array(solutions))

# The AUC values used to be discarded (a bare call in the middle of a cell
# displays nothing), so they are now kept and printed.

# calculate auc score using entropy as the parameter
auc_entropy = cal_auc(entropy_train, entropy_test, len(entropy_train), len(entropy_test), 0.00001)
print(f"AUC (entropy) - {auc_entropy}")

# calculate auc score using standard deviation as the parameter
auc_std = cal_auc(std_train, std_test, len(std_train), len(std_test), 0.00001)
print(f"AUC (standard deviation) - {auc_std}")

# calculate auc score using max probability as the parameter
auc_max_prob = cal_auc(max_prob_train, max_prob_test, len(max_prob_train), len(max_prob_test), 0.0001)
print(f"AUC (max probability) - {auc_max_prob}")

# ---- class distribution of the two splits ----
income_cols = ['income_<=50K', 'income_>50K']

# get class distribution for training set
# index 0 counts rows with 'income_<=50K' == 1, index 1 counts every other row
train_y = pd.DataFrame(train_y, columns=income_cols)
train_low_income = int((train_y['income_<=50K'] == 1).sum())
class_examples_train = [train_low_income, len(train_y) - train_low_income]

print(f'class distribution for training dataset - {class_examples_train}')

# get class distribution for testing set
test_y = pd.DataFrame(test_y, columns=income_cols)
test_low_income = int((test_y['income_<=50K'] == 1).sum())
class_examples_test = [test_low_income, len(test_y) - test_low_income]

# this line previously said "training dataset" although it reports the test split
print(f'class distribution for testing dataset - {class_examples_test}')

# ---- attack precision / recall per metric ----
# For each metric (entropy, standard deviation, max probability, in that
# order) the threshold is taken from the metric-mapped test values, and
# training members are scored against those mapped values.
for member_scores, mapped_scores in (
    (entropy_train, entropy_shrinked),
    (std_train, std_shrinked),
    (max_prob_train, max_prob_shrinked),
):
  threshold = get_threshold(mapped_scores)
  precision, recall = cal_pre_recall(member_scores, mapped_scores, threshold)
  print(f"precision - {precision}, recall - {recall}")