In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve, roc_curve, auc, accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, average_precision_score, silhouette_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
import pickle
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata
from sklearn.cluster import KMeans

In [None]:
# Read the real training table from disk
train_csv_path = 'train_data_ads.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Recreate the train/test partition: keep only float/int columns, split off
# 'gender' as the target, and hold out 30% of the rows with a fixed seed.
df_numeric = df.select_dtypes(include=[float, int])

y_all = df_numeric['gender']
X_all = df_numeric.drop(columns=['gender'])

X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_all,
    y_all,
    test_size=0.3,
    random_state=13,
)

# Feature subset for the X_train / X_test matrices
selected_features_x = ['emui_dev', 'series_group', 'residence', 'age', 'city']

X_train = X_train_all.loc[:, selected_features_x]
X_test = X_test_all.loc[:, selected_features_x]

In [None]:
# Read the PrivBayes synthetic table and show which gender labels it contains

privbayes_csv_path = 'privbayes_genderall_frac1_sgd_imp_feats_20percofreal.csv'
df_privbayes = pd.read_csv(privbayes_csv_path)

gender_labels_pb = df_privbayes['gender'].unique()
print(gender_labels_pb)

In [None]:
# Select Features used for Logistic Regression on PrivBayes
#
# Builds the augmented training set (PrivBayes synthetic rows followed by the
# real training rows) and the matching real test matrix.

selected_features_pb = ['app_score', 'app_second_class', 'series_group', 'emui_dev', 'series_dev', 'gender']
# Single source of truth for the model inputs: everything except the target.
feature_cols_pb = [col for col in selected_features_pb if col != 'gender']

# Attach the label on a *new* frame instead of writing it into X_train_all:
# assigning a column on the train_test_split output mutates the feature frame
# (leaking the target into it) and can raise SettingWithCopyWarning.
df_og_tr = X_train_all.assign(gender=y_train_all)[selected_features_pb]

# Select the synthetic columns by name so the concatenated frame has exactly
# the expected columns in the expected order; any extra or reordered columns
# in the CSV would otherwise produce NaN-padded rows or a fit/predict
# feature mismatch.
df_og_tr_pb = pd.concat([df_privbayes[selected_features_pb], df_og_tr])

X_train_pb = df_og_tr_pb[feature_cols_pb]
y_train_pb = df_og_tr_pb['gender']
X_test_pb = X_test_all[feature_cols_pb]

In [None]:
# Fit a random forest (fixed seed) on the real + synthetic PrivBayes training rows

rf_ctgan_pb = RandomForestClassifier(random_state=13)
rf_ctgan_pb.fit(X=X_train_pb, y=y_train_pb)

In [None]:
# Score the model trained on real + synthetic PrivBayes data against the
# held-out test split: overall accuracy plus the per-class report

y_pred_pb = rf_ctgan_pb.predict(X_test_pb)

class_report_ctgan_pb = classification_report(y_true=y_test_all, y_pred=y_pred_pb)
accuracy_ctgan_pb = accuracy_score(y_true=y_test_all, y_pred=y_pred_pb)

print("Accuracy:", accuracy_ctgan_pb)
print("Classification Report:\n", class_report_ctgan_pb)

In [None]:
# Baseline: refit on the real training rows only, using the PrivBayes feature subset

y_train_pb = df_og_tr['gender']
X_train_pb = df_og_tr.drop(columns='gender')

rf_ctgan_pb = RandomForestClassifier(random_state=13)
rf_ctgan_pb.fit(X=X_train_pb, y=y_train_pb)

In [None]:
# Score the real-data-only model against the held-out test split:
# overall accuracy plus the per-class report

y_pred_pb = rf_ctgan_pb.predict(X_test_pb)

class_report_ctgan_pb = classification_report(y_true=y_test_all, y_pred=y_pred_pb)
accuracy_ctgan_pb = accuracy_score(y_true=y_test_all, y_pred=y_pred_pb)

print("Accuracy:", accuracy_ctgan_pb)
print("Classification Report:\n", class_report_ctgan_pb)

In [None]:
# Train the model on the PrivBayes synthetic data only

# Pick the feature columns by name from the test matrix instead of dropping
# 'gender' from whatever the CSV contains: this pins the training inputs to
# the same columns, in the same order, that predict() receives via X_test_pb,
# and fails early with a KeyError if the synthetic file is missing one.
X_train_pb = df_privbayes[list(X_test_pb.columns)]
y_train_pb = df_privbayes['gender']

rf_ctgan_pb = RandomForestClassifier(random_state=13)
rf_ctgan_pb.fit(X_train_pb, y_train_pb)

In [None]:
# Score the synthetic-only (PrivBayes) model against the held-out test split:
# overall accuracy plus the per-class report

y_pred_pb = rf_ctgan_pb.predict(X_test_pb)

class_report_ctgan_pb = classification_report(y_true=y_test_all, y_pred=y_pred_pb)
accuracy_ctgan_pb = accuracy_score(y_true=y_test_all, y_pred=y_pred_pb)

print("Accuracy:", accuracy_ctgan_pb)
print("Classification Report:\n", class_report_ctgan_pb)