In [None]:
!pip install ctgan

Collecting ctgan
  Downloading ctgan-0.10.1-py3-none-any.whl.metadata (11 kB)
Collecting rdt>=1.11.0 (from ctgan)
  Downloading rdt-1.13.0-py3-none-any.whl.metadata (10 kB)
Collecting Faker>=17 (from rdt>=1.11.0->ctgan)
  Downloading Faker-30.6.0-py3-none-any.whl.metadata (15 kB)
Downloading ctgan-0.10.1-py3-none-any.whl (24 kB)
Downloading rdt-1.13.0-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Faker-30.6.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker, rdt, ctgan
Successfully installed Faker-30.6.0 ctgan-0.10.1 rdt-1.13.0


In [None]:
import pandas as pd
import numpy as np
from ctgan import CTGAN
from sklearn.preprocessing import MinMaxScaler

## Why CTGAN for synthetic data generation?

CTGAN is suitable for our synthetic data generation because it can effectively manage mixed data types (binary, categorical, and continuous) present in our dataset. Its ability to capture complex relationships between variables ensures that the synthetic data maintains the statistical properties of the original records. Additionally, CTGAN allows for conditional generation, which can help create tailored samples based on specific features, enhancing the relevance and realism of the synthetic data we produce.

In [None]:
def create_empty_betrayal_metrics_df():
    """Return an empty DataFrame that carries the dating-profile schema.

    The frame has no rows; it exists so other cells can read the column
    names. Each field appears exactly once: the earlier version listed
    ``education_level`` and ``income_level`` twice, which produced
    duplicate column labels.

    Returns:
        pd.DataFrame: zero rows, one column per profile field.
    """
    columns = [
        # Identity / free-text fields.
        'username', 'name', 'superhero_name',
        # Demographics.
        "preffered_age_range", "sexuality", "gender", "city",
        "education_level", "income_level",
        # Personality scales.
        "curiosity_level", "organized_chaos", "social_butterfly",
        "team_player_vibes", "chill_factor", "adventure_seeker",
        "perfectionist_mode", "party_starter", "harmony_seeker", "mood_meter",
        # Lifestyle.
        "dietary_preferences", "pet_preferences", "fitness_preferences",
        "smoking_habits", "alcohol_consumption",
        # Ranked preferences.
        "preferred_occupation1", "preferred_occupation2", "preferred_occupation3",
        "hobby1", "hobby2", "hobby3",
        "movie_preference1", "movie_preference2", "movie_preference3",
        "song_preference1", "song_preference2", "song_preference3",
        # Dating history.
        "have_dated", "dating_status",
    ]
    return pd.DataFrame(columns=columns)

In [None]:
# Empty, schema-only frame; the visible cells read only its column names.
database_df = create_empty_betrayal_metrics_df()

In [None]:

# Columns that take only the values 0 or 1.
binary_columns = [
    "gender","smoking_habits","dating_status","have_dated"
]

# Every remaining schema column except the free-text identity fields is an
# integer-coded column. The list is built in schema order (first occurrence
# wins) rather than through set arithmetic, because set iteration order for
# strings can change between kernel sessions and would make the column order
# of the generated data non-reproducible.
integer_columns = [
    col
    for col in dict.fromkeys(database_df.columns)
    if col not in set(binary_columns) | {"username", "name", "superhero_name", "city"}
]

In [None]:
# Inclusive (min, max) bounds for each integer-coded column, grouped by the
# bounds they share. Key order matches the schema order used elsewhere.
variable_ranges = {
    # Demographics.
    'preffered_age_range': (0, 6),
    **dict.fromkeys(('sexuality', 'education_level', 'income_level'), (0, 4)),
    # Personality scales, all 0-10.
    **dict.fromkeys(
        (
            'curiosity_level',
            'organized_chaos',
            'social_butterfly',
            'team_player_vibes',
            'chill_factor',
            'adventure_seeker',
            'perfectionist_mode',
            'party_starter',
            'harmony_seeker',
            'mood_meter',
        ),
        (0, 10),
    ),
    # Lifestyle.
    'dietary_preferences': (0, 4),
    'pet_preferences': (0, 3),
    'fitness_preferences': (0, 4),
    'alcohol_consumption': (0, 3),
    # Ranked preferences (three slots each).
    **dict.fromkeys(
        ('preferred_occupation1', 'preferred_occupation2', 'preferred_occupation3'),
        (0, 6),
    ),
    **dict.fromkeys(('hobby1', 'hobby2', 'hobby3'), (0, 7)),
    **dict.fromkeys(
        ('movie_preference1', 'movie_preference2', 'movie_preference3'),
        (0, 8),
    ),
    **dict.fromkeys(
        ('song_preference1', 'song_preference2', 'song_preference3'),
        (0, 8),
    ),
}


In [None]:
def generate_random_data(n_samples=50, seed=None):
    """Build a DataFrame of uniformly random profile rows.

    Reads the module-level ``binary_columns``, ``integer_columns`` and
    ``variable_ranges``. Binary columns are drawn from {0, 1}; each integer
    column is drawn uniformly from its inclusive ``(min, max)`` range. A
    constant ``'name'`` column holding ``'placeholder_name'`` is appended.

    Args:
        n_samples: Number of rows to generate.
        seed: Optional seed for ``np.random.default_rng``. ``None`` (the
            default) draws fresh entropy, so results differ between calls;
            pass an int for reproducible output.

    Returns:
        pd.DataFrame: ``n_samples`` rows with the binary columns, then the
        integer columns, then ``'name'``. The columns are present even when
        ``n_samples`` is 0.
    """
    # A local Generator keeps the draw independent of global np.random state.
    rng = np.random.default_rng(seed)

    columns = {}
    for col in binary_columns:
        # The upper bound of Generator.integers is exclusive: yields 0 or 1.
        columns[col] = rng.integers(0, 2, size=n_samples)

    for col in integer_columns:
        low, high = variable_ranges[col]
        # +1 because variable_ranges stores inclusive upper bounds.
        columns[col] = rng.integers(low, high + 1, size=n_samples)

    # The explicit index keeps the row count right even with no columns.
    frame = pd.DataFrame(columns, index=pd.RangeIndex(n_samples))
    frame['name'] = 'placeholder_name'
    return frame

In [None]:
# Random seed dataset that the generative model is trained on.
initial_data = generate_random_data(n_samples=500)

# 'name' is a constant placeholder string, so it is left out of training.
df_train = initial_data.drop('name', axis=1)

In [None]:
#normalising the data
# scaler = MinMaxScaler()
# df_train_normalized = pd.DataFrame(scaler.fit_transform(df_train), columns=df_train.columns)

In [None]:
# Every training column is either a 0/1 flag or an integer category/scale
# code, so all of them are declared discrete. CTGAN models any column not
# listed in `discrete_columns` as continuous, which is what produces
# out-of-range samples (e.g. -2 or 13 on a 0-10 scale).
ctgan = CTGAN(epochs=1000)
ctgan.fit(df_train, discrete_columns=list(df_train.columns))

In [None]:
# Draw 500 synthetic rows from the fitted model.
# NOTE(review): despite the variable name, no normalisation is applied in the
# visible cells (the MinMaxScaler step is commented out), so these samples are
# on the same scale as the training data.
synthetic_data_normalized = ctgan.sample(500)

In [None]:
# Preview the raw samples before any post-processing is applied.
synthetic_data_normalized

Unnamed: 0,gender,smoking_habits,dating_status,have_dated,hobby2,sexuality,movie_preference2,hobby3,party_starter,preffered_age_range,...,chill_factor,income_level,pet_preferences,mood_meter,fitness_preferences,harmony_seeker,alcohol_consumption,song_preference1,dietary_preferences,song_preference3
0,1,1,1,1,9,1,8,2,4,1,...,7,1,1,3,3,7,1,7,2,8
1,0,0,0,1,1,2,1,0,5,1,...,6,3,0,3,2,-1,3,3,2,5
2,0,0,0,1,5,0,2,2,9,1,...,13,0,3,10,3,4,2,2,3,7
3,1,0,0,1,3,1,6,2,5,4,...,7,0,2,9,0,2,2,6,3,9
4,1,1,0,1,4,0,6,4,4,3,...,9,1,0,4,3,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0,0,1,4,2,8,6,4,1,...,8,0,1,10,2,4,2,5,1,7
496,1,0,0,0,1,4,1,5,8,4,...,5,2,3,8,-1,0,2,6,3,9
497,1,0,1,0,4,4,0,1,0,0,...,4,1,1,2,2,3,1,8,0,7
498,0,1,0,1,3,3,8,6,5,6,...,9,3,2,13,3,4,1,2,2,7


Denormalizing CTGAN output is essential for the following reasons:

- Synthetic data generated by CTGAN is often on a normalized scale, because the training data is normalized to improve training stability. Denormalizing the data restores it to its original scale, making it easier to interpret and use in practical applications.

- It ensures that the generated data accurately reflects the actual value ranges and distributions of the original dataset, enhancing its realism and usability.


A common challenge when using generative models like CTGAN is that they learn the overall distribution of the data but don't inherently understand hard constraints such as minimum and maximum values.

So we apply post-processing to the data: each column is clipped to its valid range and rounded to an integer.

In [None]:
# Summary statistics of the raw samples; compare the min/max rows against
# variable_ranges to spot values that fall outside the allowed bounds.
synthetic_data_normalized.describe()

Unnamed: 0,gender,smoking_habits,dating_status,have_dated,hobby2,sexuality,movie_preference2,hobby3,party_starter,preffered_age_range,...,chill_factor,income_level,pet_preferences,mood_meter,fitness_preferences,harmony_seeker,alcohol_consumption,song_preference1,dietary_preferences,song_preference3
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.43,0.516,0.358,0.426,3.426,1.702,4.82,3.558,4.518,3.064,...,7.42,1.65,1.616,8.108,1.9,4.7,1.474,4.698,1.666,5.132
std,0.495572,0.500244,0.479892,0.494989,2.588034,1.347898,3.051922,2.248722,3.136455,2.169174,...,3.410405,1.544108,1.134512,3.433353,1.383412,3.413694,1.040909,2.838117,1.419321,2.681222
min,0.0,0.0,0.0,0.0,-2.0,-1.0,-2.0,-2.0,-2.0,-1.0,...,-1.0,-1.0,0.0,0.0,-1.0,-2.0,0.0,-2.0,-1.0,-1.0
25%,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,2.0,1.0,...,4.0,0.0,1.0,5.0,1.0,2.0,1.0,2.0,1.0,3.0
50%,0.0,1.0,0.0,0.0,3.0,1.0,5.5,4.0,4.0,3.0,...,8.0,1.0,1.0,9.0,2.0,5.0,1.0,5.0,2.0,5.0
75%,1.0,1.0,1.0,1.0,6.0,3.0,7.0,5.0,6.0,5.0,...,10.0,3.0,3.0,11.0,3.0,7.25,2.0,7.0,3.0,7.0
max,1.0,1.0,1.0,1.0,9.0,5.0,10.0,9.0,13.0,7.0,...,13.0,5.0,4.0,13.0,4.0,12.0,4.0,10.0,5.0,10.0


In [None]:
def clip_to_range(data, ranges):
    """Clamp the columns of ``data`` to the bounds given in ``ranges``.

    Args:
        data: DataFrame to clamp. It is modified in place.
        ranges: Mapping of column name to ``(min, max)``. Names that are not
            columns of ``data`` are ignored.

    Returns:
        The same ``data`` object, after clamping.
    """
    for name, (lower, upper) in ranges.items():
        if name not in data.columns:
            continue
        clamped = data[name].clip(lower=lower, upper=upper)
        data[name] = clamped
    return data

# Binary flags: round to the nearest integer, then clamp to {0, 1}. The clamp
# matters because these columns are not in variable_ranges, so clip_to_range
# never touches them and a rounded sample such as -1 or 2 would otherwise
# pass through.
for col in binary_columns:
    synthetic_data_normalized[col] = (
        synthetic_data_normalized[col].round().clip(0, 1).astype(int)
    )

# Integer-coded columns: clamp to their inclusive bounds first...
synthetic_data_normalized = clip_to_range(synthetic_data_normalized, variable_ranges)

# ...then round and cast. The bounds are integers, so rounding cannot push a
# clamped value back outside its range.
for col in integer_columns:
    if col in synthetic_data_normalized.columns:
        synthetic_data_normalized[col] = synthetic_data_normalized[col].round().astype(int)


In [None]:
# Give each synthetic row a sequential label: user_0, user_1, ...
synthetic_data_normalized['name'] = [
    f'user_{row_number}' for row_number in range(len(synthetic_data_normalized))
]

In [None]:
# Verification of adherence to ranges. Binary flags are checked against (0, 1)
# as well, since they are not part of variable_ranges and would otherwise go
# unverified.
ranges_to_check = {**dict.fromkeys(binary_columns, (0, 1)), **variable_ranges}
for col, (min_val, max_val) in ranges_to_check.items():
    if col in synthetic_data_normalized.columns:
        actual_min = synthetic_data_normalized[col].min()
        actual_max = synthetic_data_normalized[col].max()
        if actual_min < min_val or actual_max > max_val:
            print(f"Warning: {col} has values outside the specified range. Min: {actual_min}, Max: {actual_max}")

In [None]:
!pip install skimpy

Collecting skimpy
  Downloading skimpy-0.0.15-py3-none-any.whl.metadata (28 kB)
Collecting ipykernel<7.0.0,>=6.7.0 (from skimpy)
  Downloading ipykernel-6.29.5-py3-none-any.whl.metadata (6.3 kB)
Collecting polars<0.21,>=0.19 (from skimpy)
  Downloading polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting typeguard==4.2.1 (from skimpy)
  Downloading typeguard-4.2.1-py3-none-any.whl.metadata (3.7 kB)
Collecting comm>=0.1.1 (from ipykernel<7.0.0,>=6.7.0->skimpy)
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel<7.0.0,>=6.7.0->skimpy)
  Downloading jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading skimpy-0.0.15-py3-none-any.whl (16 kB)
Downloading typeguard-4.2.1-py3-none-any.whl (34 kB)
Downloading ipykernel-6.29.5-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDo

In [None]:
# skimpy is installed by the cell just above this one.
import skimpy as skim

# Show skimpy's summary of the post-processed synthetic frame.
skim.skim(synthetic_data_normalized)

In [None]:
# Save the post-processed synthetic data to a CSV file in the current working
# directory; index=False leaves the DataFrame's row index out of the file.
synthetic_data_normalized.to_csv('dating_dataset.csv', index=False)
print("Synthetic data has been saved to 'dating_dataset.csv'")

Synthetic data has been saved to 'dating_dataset.csv'
