In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read the raw dataset from disk and report its size
df = pd.read_csv('df.csv')
print(f"Original dataset shape: {df.shape}")

# Keep only rows in which every column is populated, then report what is left
df_clean = df.dropna(axis=0, how='any')
print(f"Dataset shape after dropping null values: {df_clean.shape}")

# Helpers and entry point for generating synthetic rows
def _infer_decimal_places(values, max_decimals=10):
    """Return the fewest decimal places that reproduce every finite value.

    Returns ``None`` when there are no finite values or when no precision up to
    ``max_decimals`` reproduces them, which signals "do not round".
    """
    values = np.asarray(values, dtype=float)
    finite = values[np.isfinite(values)]
    if finite.size == 0:
        return None
    for places in range(max_decimals + 1):
        # A purely relative tolerance absorbs last-bit binary float error
        # without letting a genuinely coarser precision pass.
        if np.allclose(np.round(finite, places), finite, rtol=1e-12, atol=0.0):
            return places
    return None


def generate_synthetic_data(df, n_synthetic, noise_scale=0.1, random_state=None):
    """Create synthetic rows by jittering randomly chosen rows of ``df``.

    Each synthetic row starts from one row of ``df`` drawn with replacement.
    Numeric columns are standardized with ``StandardScaler``, perturbed with
    Gaussian noise, mapped back to the original scale, and rounded to the
    number of decimals actually present in the source column. Non-numeric
    columns are copied unchanged from the sampled row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data to sample from.
    n_synthetic : int
        Number of synthetic rows to produce. Values <= 0 yield an empty frame.
    noise_scale : float, default 0.1
        Standard deviation of the noise, in units of each numeric column's
        standardized scale.
    random_state : int or None, default None
        Seed for reproducible output. ``None`` draws from NumPy's global
        random state, so an earlier ``np.random.seed`` call still applies.

    Returns
    -------
    pandas.DataFrame
        ``n_synthetic`` rows with the same columns, in the same order, as
        ``df``. Numeric columns are returned as floats.

    Raises
    ------
    ValueError
        If ``n_synthetic`` is positive but ``df`` has no rows.
    """
    n_synthetic = int(n_synthetic)
    numeric_cols = list(df.select_dtypes(include="number").columns)

    if n_synthetic <= 0:
        # Nothing to generate: hand back an empty frame whose dtypes match
        # what a non-empty result would have.
        return (
            df.iloc[:0]
            .astype({col: float for col in numeric_cols})
            .reset_index(drop=True)
        )
    if len(df) == 0:
        raise ValueError("Cannot generate synthetic rows from an empty DataFrame.")

    # Both the numpy.random module and RandomState expose randint/normal, so
    # one code path serves the seeded and the global-state case.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Draw all source rows at once (with replacement) instead of calling
    # DataFrame.sample once per synthetic row.
    row_positions = rng.randint(0, len(df), size=n_synthetic)
    synthetic_df = df.iloc[row_positions].reset_index(drop=True)

    if numeric_cols:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(df[numeric_cols].to_numpy(dtype=float))

        # Add random noise in standardized space, then convert back.
        noise = rng.normal(0, noise_scale, size=(n_synthetic, len(numeric_cols)))
        restored = scaler.inverse_transform(scaled[row_positions] + noise)

        for position, col in enumerate(numeric_cols):
            column_values = restored[:, position]
            # Precision is inferred from the decimals present in the source
            # column; the magnitude of its smallest value says nothing about
            # precision (e.g. 1.005 needs three decimals, not zero).
            places = _infer_decimal_places(df[col].to_numpy(dtype=float))
            if places is not None:
                column_values = np.round(column_values, places)
            synthetic_df[col] = column_values

    return synthetic_df

# Generate synthetic data
# Total number of rows the expanded dataset should contain.
TARGET_ROWS = 5000

# The cleaned data may already hold TARGET_ROWS rows or more; clamp at zero so
# a negative count is never requested.
n_synthetic = max(0, TARGET_ROWS - len(df_clean))
if n_synthetic > 0:
    synthetic_df = generate_synthetic_data(df_clean, n_synthetic)
else:
    # Nothing to add: an empty slice keeps the concat below uniform.
    synthetic_df = df_clean.iloc[:0]

# NOTE(review): the preview cell shows an `id` column in df. Synthetic rows
# receive noisy copies of existing ids, so ids in the expanded file are not
# unique identifiers - confirm whether `id` should be dropped or regenerated.

# Combine original (clean) and synthetic data
expanded_df = pd.concat([df_clean, synthetic_df], ignore_index=True)

# Save the expanded dataset
expanded_df.to_csv('expanded_kidney_disease_dataset.csv', index=False)

print(f"Expanded dataset created with {len(expanded_df)} rows.")

Original dataset shape: (400, 24)
Dataset shape after dropping null values: (159, 24)
Expanded dataset created with 5000 rows.


In [7]:
# Preview the first five rows, transposed so each column is shown as a row
df.head(5).transpose()


Unnamed: 0,0,1,2,3,4
id,0.0,1.0,2.0,3.0,4.0
age,48.0,7.0,62.0,48.0,51.0
bp,80.0,50.0,80.0,70.0,80.0
sg,1.02,1.02,1.01,1.005,1.01
al,1.0,4.0,2.0,4.0,2.0
su,0.0,0.0,3.0,0.0,0.0
rbc,,,1.0,1.0,1.0
pc,1.0,1.0,1.0,0.0,1.0
bgr,121.0,,423.0,117.0,106.0
bu,36.0,18.0,53.0,56.0,26.0
