In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve, roc_curve, auc, accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, average_precision_score, silhouette_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
import pickle
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata
from sklearn.cluster import KMeans

In [None]:
# Read the training table from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='train_data_ads.csv')

In [None]:
# Rebuild the train/test partition: keep only float/int columns and use 'gender' as the target
df_numeric = df.select_dtypes(include=[float, int])

y_all = df_numeric['gender']
X_all = df_numeric.drop(columns=['gender'])

# 70/30 split with a fixed seed so the partition is repeatable
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_all,
    y_all,
    test_size=0.3,
    random_state=13,
)

# Restrict both partitions to the five chosen predictor columns
selected_features_x = ['emui_dev', 'series_group', 'residence', 'age', 'city']
X_train, X_test = (part[selected_features_x] for part in (X_train_all, X_test_all))

In [None]:
# Create metadata file for CTGAN
# The real training features and the 'gender' target are joined column-wise so the
# synthesizer models them together as a single table.
train_data = pd.concat([X_train, y_train_all], axis=1)

# Detect the schema from `train_data` itself -- the frame that is passed to `ctgan.fit` --
# so the metadata always describes the table being modelled. (The previous code referenced
# `df_top`, a name this notebook never defines, which fails on a fresh kernel.)
metadata = Metadata.detect_from_dataframe(train_data, table_name="CTR")

# Override the auto-detected sdtype so these columns are treated as numerical
for column in ('city', 'series_group', 'age', 'gender'):
    metadata.update_column(column, sdtype='numerical')

metadata  # last expression of the cell: display the resulting metadata

In [None]:
# Fit a CTGAN synthesizer (100 training epochs) on the real training table
ctgan = CTGANSynthesizer(metadata=metadata, epochs=100)
ctgan.fit(train_data)

In [None]:
# Draw as many synthetic rows as there are real training rows
synthetic_data = ctgan.sample(num_rows=len(train_data))

# Split the synthetic table into the 'gender' target and the remaining predictor columns
synthetic_y = synthetic_data['gender']
synthetic_X = synthetic_data.drop(columns=['gender'])

# Persist the full synthetic table so other models can reuse it
synthetic_data.to_csv('RF_Top5_CTGAN_data.csv', index=False)

In [None]:
# Random forest fitted on the synthetic rows only (fixed seed for repeatability)
rf_ctgan_synth = RandomForestClassifier(random_state=13).fit(synthetic_X, synthetic_y)
rf_ctgan_synth  # show the fitted estimator as the cell output

In [None]:
# Stack the real and synthetic rows, then fit a second forest on the union
y_train_combined = pd.concat([y_train_all, synthetic_y], axis=0)
X_train_combined = pd.concat([X_train, synthetic_X], axis=0)

rf_ctgan_combo = RandomForestClassifier(random_state=13).fit(X_train_combined, y_train_combined)
rf_ctgan_combo  # show the fitted estimator as the cell output

In [None]:
# Score the synthetic-only forest against the held-out real test set
y_pred_synth = rf_ctgan_synth.predict(X_test)

accuracy_ctgan_synth = accuracy_score(y_true=y_test_all, y_pred=y_pred_synth)
class_report_ctgan_synth = classification_report(y_true=y_test_all, y_pred=y_pred_synth)

# Emit the overall accuracy first, then the per-class report
for heading, value in (
    ("Accuracy:", accuracy_ctgan_synth),
    ("Classification Report:\n", class_report_ctgan_synth),
):
    print(heading, value)

In [None]:
# Score the real+synthetic forest against the held-out real test set
y_pred_combo = rf_ctgan_combo.predict(X_test)

accuracy_ctgan_combo = accuracy_score(y_true=y_test_all, y_pred=y_pred_combo)
class_report_ctgan_combo = classification_report(y_true=y_test_all, y_pred=y_pred_combo)

# Emit the overall accuracy first, then the per-class report
for heading, value in (
    ("Accuracy:", accuracy_ctgan_combo),
    ("Classification Report:\n", class_report_ctgan_combo),
):
    print(heading, value)

In [None]:
# Precision-Recall for model trained on combination of real and synthetic data - Utility
# One-vs-rest curves: the test labels are binarized per class and column i of the
# predicted probabilities is scored against column i of the binarized labels.
# NOTE(review): this assumes rf_ctgan_combo.classes_ is exactly [2, 3, 4] so the
# probability columns line up with the binarized columns -- confirm.

y_test_bin = label_binarize(y_test_all, classes=[2, 3, 4])
y_pred_proba = rf_ctgan_combo.predict_proba(X_test)
n_classes = y_test_bin.shape[1]

colors = ['blue', 'gold', 'red']

# Explicit figure/axes so every curve lands on the same, known axes
fig, ax = plt.subplots(figsize=(12, 10))
for i in range(n_classes):
    precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_pred_proba[:, i])
    average_precision = average_precision_score(y_test_bin[:, i], y_pred_proba[:, i])
    # Named `pr_display` (not `display`) to avoid shadowing IPython's display() helper
    pr_display = PrecisionRecallDisplay(precision=precision, recall=recall)
    # Surface the per-class average precision in the legend; it was previously computed
    # and then discarded, unlike the AUC in the ROC plot.
    pr_display.plot(
        ax=ax,
        label=f'Gender {i + 2} (AP={average_precision:.2f})',
        color=colors[i],
    )

# Set after plotting so these override the labels/legend the display object applies
ax.set_title('Precision-Recall Curve for CTGANSynthesizer-Trained Random Forest', fontsize=18)
ax.set_xlabel('Recall', fontsize=18)
ax.set_ylabel('Precision', fontsize=18)
ax.legend(loc='lower left', fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig('ctgan_prec_rec_curve.pdf')
plt.close(fig)

In [None]:
# One-vs-rest ROC curves for the forest trained on real + synthetic data (utility check).
# Reuses n_classes, y_test_bin, y_pred_proba and colors from the precision-recall cell.

roc_fig, roc_ax = plt.subplots(figsize=(12, 10))

for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
    roc_auc = auc(fpr, tpr)
    roc_display = RocCurveDisplay(
        fpr=fpr,
        tpr=tpr,
        roc_auc=roc_auc,
        estimator_name=f'Class {i + 2}',
    )
    # The explicit label below overrides the display's default legend entry
    roc_display.plot(ax=roc_ax, label=f'Gender {i + 2} (AUC={roc_auc:.2f})', color=colors[i])

# Diagonal reference line through the origin with slope 1
roc_ax.axline((0, 0), slope=1, color='black', linestyle='--')
roc_ax.set_title('ROC Curve for CTGANSynthesizer-Trained Random Forest', fontsize=18)
roc_ax.set_xlabel('False Positive Rate', fontsize=18)
roc_ax.set_ylabel('True Positive Rate', fontsize=18)
roc_ax.legend(fontsize=12)
roc_ax.grid(True)
roc_fig.tight_layout()
roc_fig.savefig('ctgan_roc_curve.pdf')
plt.close()

In [None]:
# Confusion matrix to show performance - Utility
# Predictions of the real+synthetic forest versus the true test labels.

# Create the 8x8 figure together with its axes and hand the axes to the display.
# Without `ax=`, ConfusionMatrixDisplay.plot() opens its own default-sized figure,
# which left the 8x8 figure empty, unsaved and never closed.
fig, ax = plt.subplots(figsize=(8, 8))
cm = confusion_matrix(y_test_all, y_pred_combo)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[2, 3, 4])
disp.plot(ax=ax, cmap='Blues', values_format='d')
ax.set_title('Confusion Matrix', fontsize=12)
fig.savefig('ctgan_confmat.pdf')
plt.close(fig)

In [None]:
# Horizontal bar chart of the forest's feature importances, largest first

feature_names = X_train.columns
feature_importances = rf_ctgan_combo.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]  # descending importance
top_indices = sorted_indices[:5]

imp_fig, imp_ax = plt.subplots(figsize=(10, 8))
bars = imp_ax.barh(range(5), feature_importances[top_indices], align='center', color='gold')
# Write each bar's value in the middle of the bar
imp_ax.bar_label(bars, label_type='center')
imp_ax.set_yticks(range(5))
imp_ax.set_yticklabels(feature_names[top_indices])
imp_ax.set_xlabel('Feature Importance', fontsize=18)
imp_ax.set_title('Top 5 Feature Importances', fontsize=20)
imp_fig.tight_layout()
imp_fig.savefig('ctgan_featimp.pdf')
plt.close()

In [None]:
# Compare distributions of original and synthetic data - Fidelity
# Overlays the histogram of one feature from the real training rows and the synthetic rows.

feature = 'emui_dev'  # Change feature to whatever you want to test

original_values = X_train[feature].to_numpy()
synthetic_values = synthetic_X[feature].to_numpy()

# Compute ONE set of 30 bin edges spanning both samples. Passing bins=30 to each
# hist() call separately gives each sample its own edges (based on its own min/max),
# so the overlaid bars would not be comparable bin for bin.
bin_edges = np.histogram_bin_edges(
    np.concatenate([original_values, synthetic_values]),
    bins=30,
)

fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(original_values, bins=bin_edges, alpha=0.5, label='Original', color='blue')
ax.hist(synthetic_values, bins=bin_edges, alpha=0.5, label='Synthetic', color='gold')
ax.legend(fontsize=14)
ax.set_title(f'Distribution of {feature}', fontsize=20)
ax.set_xlabel(feature, fontsize=16)
ax.set_ylabel('Frequency', fontsize=16)
ax.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()
fig.savefig('data_dist1.pdf')
plt.close(fig)

In [None]:
# Side-by-side correlation heatmaps of the real and synthetic predictors (fidelity check)

corr_original = X_train.corr()
corr_synthetic = synthetic_X.corr()

fig, ax = plt.subplots(1, 2, figsize=(16, 8))

# Both panels share the same rendering options; only the matrix and title differ
heatmap_style = dict(
    cmap='Greens',
    cbar=False,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 10},
)
panels = (
    (corr_original, 'Original Data Correlation'),
    (corr_synthetic, 'Synthetic Data Correlation'),
)
for axis, (corr, title) in zip(ax, panels):
    sns.heatmap(corr, ax=axis, **heatmap_style)
    axis.set_title(title, fontsize=20)

fig.tight_layout()
fig.savefig('ctgan_correlations.pdf')
plt.close()

In [None]:
# Privacy check: unsupervised membership-inference attack.
# Real and synthetic rows are pooled, clustered into two groups with K-Means, and each
# cluster is labelled real (1) or synthetic (0) by the majority of its training members.

combined_data = pd.concat([X_train, synthetic_X])
labels = np.concatenate([
    np.ones(len(X_train)),        # 1 = real row
    np.zeros(len(synthetic_X)),   # 0 = synthetic row
])

# Standardize features before clustering
scaler = StandardScaler()
scaled_data = scaler.fit_transform(combined_data)

# Hold out 30% of the pooled rows to evaluate the attack
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    scaled_data,
    labels,
    test_size=0.3,
    random_state=13,
)

# Two-cluster K-Means fitted on the training portion only
kmeans = KMeans(n_clusters=2, random_state=13).fit(X_train_split)

# Cluster assignments for the held-out rows
cluster_labels_test = kmeans.predict(X_test_split)

# Each cluster takes the rounded mean (i.e. majority) real/synthetic label of its training members
mapping = {
    cluster: int(np.round(y_train_split[kmeans.labels_ == cluster].mean()))
    for cluster in range(2)
}

# Translate held-out cluster ids into real/synthetic predictions
predicted_labels_test = np.array([mapping[cluster_id] for cluster_id in cluster_labels_test])

# Attack performance on the held-out rows
precision = precision_score(y_test_split, predicted_labels_test)
recall = recall_score(y_test_split, predicted_labels_test)
accuracy = accuracy_score(y_test_split, predicted_labels_test)
f1 = f1_score(y_test_split, predicted_labels_test)
# ROC is computed from the hard 0/1 predictions, not from continuous scores
fpr, tpr, _ = roc_curve(y_test_split, predicted_labels_test)
roc_auc = auc(fpr, tpr)

# Confusion matrix figure (the display object creates its own figure and axes)
cm = confusion_matrix(y_test_split, predicted_labels_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Synthetic", "Real"])
disp.plot(cmap='Blues')
disp.ax_.set_title("Confusion Matrix for K-Means Clustering")
disp.figure_.savefig('ctgan_unsupervised_privacy_cf.pdf')
plt.close(disp.figure_)

# ROC figure with the chance diagonal for reference
attack_fig, attack_ax = plt.subplots(figsize=(8, 6))
attack_ax.plot(fpr, tpr, label=f'Clustering Attack (AUC = {roc_auc:.2f})')
attack_ax.plot([0, 1], [0, 1], 'k--')
attack_ax.set_xlabel('False Positive Rate')
attack_ax.set_ylabel('True Positive Rate')
attack_ax.set_title('ROC Curve for Unsupervised Membership Inference Attack')
attack_ax.legend()
attack_fig.savefig('ctgan_unsupervised_privacy_roc.pdf')
plt.close(attack_fig)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'Accuracy: {accuracy:.2f}')
print(f'F1-Score: {f1:.2f}')
print(f'AUC for Unsupervised Privacy Attack: {roc_auc:.2f}')