In [8]:
!pip install ctgan



In [9]:
import pandas as pd
import numpy as np
from ctgan import CTGAN
from sklearn.preprocessing import MinMaxScaler

## Why CTGAN for synthetic data generation?

CTGAN is suitable for our synthetic data generation because it can effectively manage mixed data types (binary, categorical, and continuous) present in our dataset. Its ability to capture complex relationships between variables ensures that the synthetic data maintains the statistical properties of the original records. Additionally, CTGAN allows for conditional generation, which can help create tailored samples based on specific features, enhancing the relevance and realism of the synthetic data we produce.

In [10]:
# Schema of the betrayal-metrics table: the soldier name plus all 36 metrics.
# Names are kept generic (soldier_0, soldier_1, ...) since we do not know the
# language of these people or how they would name each other.
def create_empty_betrayal_metrics_df():
    """Return an empty DataFrame that carries only the metric column names.

    The first column is the identifier ('soldier_name'); the remaining 36
    columns are the metrics, in the order listed below.
    """
    identifier = ['soldier_name']
    metrics = [
        'familial_history', 'dependents', 'kin', 'pay_gap', 'risk_last_op',
        'active_duty_record', 'served_time', 'off_time', 'current_posting',
        'posting_period', 'failure', 'corruption', 'physical_health',
        'injury_type', 'mental_health', 'campaign_count', 'campaign_cause',
        'peer_based', 'grievance', 'n_reports', 'complaint_c', 'absents',
        'task_sf', 'genb_location', 'genc_location', 'age', 'edu_level',
        'previously_c', 'attack', 'riots', 'emergency', 'x_incentives',
        'p_incentives', 'risk', 'cultural_change', 'discipline',
    ]
    return pd.DataFrame(columns=identifier + metrics)

In [11]:
# Empty frame holding the full column schema; later cells read
# betrayal_df.columns to work out which columns are integer-valued.
betrayal_df = create_empty_betrayal_metrics_df()

In [12]:
# Division of the columns by the kind of values they can contain.
binary_columns = [
    'campaign_cause','complaint_c','task_sf','attack','riots','emergency','cultural_change'
]

continuous_columns = [
    'risk', 'discipline','edu_level','mental_health','risk_last_op','active_duty_record','failure'
]

# Every remaining column (other than the identifier) is integer-valued.
# The list is built by filtering betrayal_df.columns in order rather than by
# set difference: set iteration order for strings changes between interpreter
# runs (hash randomisation), which would make the column order of the
# generated training data differ from one kernel restart to the next.
integer_columns = [
    col for col in betrayal_df.columns
    if col not in binary_columns
    and col not in continuous_columns
    and col != 'soldier_name'
]

In [13]:
# Inclusive (min, max) bounds per column, used to generate and to clip values.
variable_ranges = {
    # Continuous variables, all sharing the 0-1 range
    **dict.fromkeys(
        (
            'risk',
            'risk_last_op',
            'mental_health',
            'edu_level',
            'discipline',
            'active_duty_record',
            'failure',
        ),
        (0, 1),
    ),

    # Integer variables with specific ranges
    'served_time': (0, 50),
    'familial_history': (0, 30),
    'off_time': (0, 1000),
    'current_posting': (0, 11),
    'posting_period': (0, 15),
    'corruption': (0, 10000),
    'physical_health': (0, 1000),
    'injury_type': (0, 3),
    'campaign_count': (0, 50),
    'peer_based': (0, 100),
    'grievance': (0, 1000),
    'n_reports': (0, 100),
    'absents': (0, 100),
    'kin': (0, 20),
    'dependents': (0, 20),
    'pay_gap': (0, 10000),
    'genb_location': (0, 500),
    'genc_location': (0, 500),
    'age': (18, 60),
    'previously_c': (0, 100),
    'x_incentives': (0, 10000),
    'p_incentives': (0, 10000),
}


In [14]:
def generate_random_data(n_samples=1000, seed=None):
    """Generate a DataFrame of uniformly random seed records.

    Columns are produced in this order: the binary columns (0 or 1), the
    continuous columns (float drawn uniformly from 0 to 1), the integer
    columns (integer within the inclusive bounds given by variable_ranges),
    and finally 'soldier_name', filled with a placeholder string.

    Parameters
    ----------
    n_samples : int, default 1000
        Number of rows to generate. Zero or a negative value yields an empty
        frame that still carries every column.
    seed : int or None, default None
        When given, values are drawn from a private RandomState seeded with
        it, so the same seed always yields the same frame. When None, values
        come from NumPy's global random state, so np.random.seed() still
        governs the result.

    Returns
    -------
    pandas.DataFrame
        One row per sample.
    """
    # The np.random module exposes the same choice/uniform/randint functions
    # as a RandomState instance, so either can serve as the generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_rows = max(n_samples, 0)

    # Each column is drawn in one vectorised call instead of value by value.
    columns = {}

    # binary columns: generate 0 or 1
    for col in binary_columns:
        columns[col] = rng.choice([0, 1], size=n_rows)

    # continuous columns: generate a float between 0 and 1
    for col in continuous_columns:
        columns[col] = rng.uniform(0, 1, size=n_rows)

    # integer columns: randint excludes its upper bound, hence the +1
    for col in integer_columns:
        low, high = variable_ranges[col]
        columns[col] = rng.randint(low, high + 1, size=n_rows)

    columns['soldier_name'] = ['placeholder_name'] * n_rows

    return pd.DataFrame(columns)

In [15]:
# Random seed records; the identifier column is removed because it is not a
# feature to learn from.
initial_data = generate_random_data(1000)
df_train = initial_data.drop('soldier_name', axis='columns')

In [16]:
# Normalising the data: fit the scaler on the training frame, then rescale
# that same frame, keeping the original column labels.
scaler = MinMaxScaler().fit(df_train)
df_train_normalized = pd.DataFrame(
    data=scaler.transform(df_train),
    columns=df_train.columns,
)

In [17]:
# CTGAN only treats a column as categorical when it is named in
# discrete_columns; every other column is modelled as continuous. The binary
# flags are declared here so the model learns them as two-valued categories
# instead of as continuous values that merely happen to be 0 or 1.
ctgan = CTGAN(epochs=500)
ctgan.fit(df_train_normalized, discrete_columns=binary_columns)

In [18]:
# Draw 500 synthetic rows; they are on the same normalised scale as the
# frame the model was fitted on.
synthetic_data_normalized = ctgan.sample(n=500)

Denormalizing the CTGAN output is essential for the following reasons:

- The training data was normalized to improve training stability, so the synthetic data generated by CTGAN is on that same normalized scale. Denormalizing the data restores it to its original scale, making it easier to interpret and use in practical applications.

- It ensures that the generated data accurately reflects the actual value ranges and distributions of the original dataset, enhancing its realism and usability.


In [19]:
# Denormalising the data: map the samples back onto the original scale with
# the scaler that was fitted on the training frame.
synthetic_data = pd.DataFrame(
    data=scaler.inverse_transform(synthetic_data_normalized),
    columns=df_train.columns,
)

A common challenge when using generative models like CTGAN is that they learn the overall distribution of the data but don't inherently understand hard constraints such as minimum and maximum values.

We therefore apply post-processing to the generated data.

In [20]:
def clip_to_range(data, ranges):
    """Clamp columns of ``data`` to their (min, max) bounds, in place.

    ``ranges`` maps a column name to a ``(min, max)`` pair. Entries whose
    column is not present in ``data`` are ignored. The same ``data`` object
    is modified and returned.
    """
    for name, bounds in ranges.items():
        lower, upper = bounds
        if name not in data.columns:
            continue
        data[name] = data[name].clip(lower=lower, upper=upper)
    return data

# Binary columns have no entry in variable_ranges, so clip_to_range never
# touches them. Clamp them to [0, 1] before rounding; otherwise a generated
# value such as 1.6 or -0.7 would round to 2 or -1.
for col in binary_columns:
    synthetic_data[col] = synthetic_data[col].clip(0, 1).round().astype(int)

# Enforce the declared bounds on every column that has one.
synthetic_data = clip_to_range(synthetic_data, variable_ranges)

# Integer columns are rounded after clipping; the bounds are whole numbers,
# so rounding cannot push a value back outside its range.
for col in integer_columns:
    if col in synthetic_data.columns:
        synthetic_data[col] = synthetic_data[col].round().astype(int)


In [21]:
# Label every generated row with a sequential identifier: soldier_0, soldier_1, ...
synthetic_data['soldier_name'] = [
    f'soldier_{row_number}' for row_number in range(len(synthetic_data))
]

In [22]:
# Verification of adherence to ranges. Binary columns have no entry in
# variable_ranges, so they are added here with the bounds (0, 1); without
# this they would never be checked at all.
ranges_to_check = dict(variable_ranges)
for col in binary_columns:
    ranges_to_check.setdefault(col, (0, 1))

for col, (min_val, max_val) in ranges_to_check.items():
    if col in synthetic_data.columns:
        actual_min = synthetic_data[col].min()
        actual_max = synthetic_data[col].max()
        if actual_min < min_val or actual_max > max_val:
            print(f"Warning: {col} has values outside the specified range. Min: {actual_min}, Max: {actual_max}")

In [23]:
!pip install skimpy

Collecting skimpy
  Downloading skimpy-0.0.15-py3-none-any.whl.metadata (28 kB)
Collecting ipykernel<7.0.0,>=6.7.0 (from skimpy)
  Downloading ipykernel-6.29.5-py3-none-any.whl.metadata (6.3 kB)
Collecting polars<0.21,>=0.19 (from skimpy)
  Downloading polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting typeguard==4.2.1 (from skimpy)
  Downloading typeguard-4.2.1-py3-none-any.whl.metadata (3.7 kB)
Collecting comm>=0.1.1 (from ipykernel<7.0.0,>=6.7.0->skimpy)
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel<7.0.0,>=6.7.0->skimpy)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading skimpy-0.0.15-py3-none-any.whl (16 kB)
Downloading typeguard-4.2.1-py3-none-any.whl (34 kB)
Downloading ipykernel-6.29.5-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hD

In [24]:
# skimpy is imported here, not in the top import cell, because the package
# is only installed by the pip cell immediately before this one.
import skimpy as skim

# Display skimpy's summary of the final synthetic frame for a visual sanity check.
skim.skim(synthetic_data)

In [25]:
# Persist the synthetic records as CSV (no index column), then confirm.
synthetic_data.to_csv(
    path_or_buf='soldiers_Xernia.csv',
    index=False,
)
print("Synthetic data has been saved to 'soldiers_Xernia.csv'")

Synthetic data has been saved to 'soldiers_Xernia.csv'
