In [None]:
# Install SDV into the kernel's environment; the CTGANSynthesizer and Metadata
# imports further down in this notebook come from this package.
%pip install sdv

In [None]:
import pandas as pd

In [None]:
# Location of the raw Titanic CSV that the following cells load.
url = (
    "https://raw.githubusercontent.com/"
    "datasciencedojo/datasets/master/titanic.csv"
)

In [None]:
# Fetch the CSV at `url` and parse it into the working DataFrame `df`.
df = pd.read_csv(url)

In [None]:
# Preview the first rows of the loaded table (rendered as the cell output).
df.head()

In [None]:
# Print the frame summary: columns, dtypes and non-null counts.
df.info()

In [None]:
# Count passengers whose Age was never recorded.
print(f"Missing Ages: {df['Age'].isnull().sum()}")

# Find ages whose decimal part is not zero.
# NaN % 1 is NaN and NaN != 0 evaluates to True, so rows with a missing Age
# must be masked out explicitly; otherwise every missing value would be
# counted (and printed) as a "fractional" age.
has_fraction = df['Age'].notna() & (df['Age'] % 1 != 0)
fractional_ages = df[has_fraction]
# NOTE(review): the label says "Babies", but a fractional age is not
# necessarily an infant — confirm against the dataset's data dictionary.
print(f"Fractional Ages (Babies): {len(fractional_ages)}")
print(fractional_ages['Age'].head())

In [None]:
# Report how many distinct values the Pclass and Fare columns hold.
pclass_cardinality = df['Pclass'].nunique()
print(f"Unique Pclass values: {pclass_cardinality}")
fare_cardinality = df['Fare'].nunique()
print(f"Unique Fare values:   {fare_cardinality}")

In [None]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata

# Confirmation that the SDV imports above succeeded. The check mark (U+2705)
# is written as an escape because the original literal had been corrupted
# into mojibake by an encoding round-trip.
print("\u2705 GAN engine Loaded Successfully")

In [None]:
# Have SDV detect the metadata for the table straight from the DataFrame,
# registering it under the table name "titanic".
metadata = Metadata.detect_from_dataframe(
    data=df,
    table_name="titanic",
)

In [None]:
# Build the CTGAN synthesizer from the detected metadata: 100 epochs,
# with verbose=True.
synthesizer = CTGANSynthesizer(metadata=metadata, epochs=100, verbose=True)

# Fit the model on the real passenger table; the banners bracket the
# training step in the cell output.
print("---TRAINING STARTED---")
synthesizer.fit(df)
print("---TRAINING COMPLETE")

In [None]:
# Draw 100 synthetic rows from the fitted synthesizer, then preview the
# first ten of them.
synthetic_data = synthesizer.sample(num_rows=100)
synthetic_data.head(n=10)

In [None]:
# Impute missing synthetic ages with the mean of the synthetic Age column.
synthetic_ages = synthetic_data['Age']
mean_age = synthetic_ages.mean()
synthetic_data['Age'] = synthetic_ages.fillna(value=mean_age)

In [None]:
# Age: round to the nearest whole number, store as int, and raise anything
# below 1 up to 1.
rounded_ages = synthetic_data['Age'].round()
synthetic_data['Age'] = rounded_ages.astype(int).clip(lower=1)

# Fare: keep two decimal places.
synthetic_data['Fare'] = synthetic_data['Fare'].round(decimals=2)

# Preview the cleaned-up synthetic table.
synthetic_data.head(n=10)