In [10]:
import pandas as pd
import numpy as np

# Original data: a 20-row seed sample of bikes
data = {
    'gear_count': [7, 10, 12, 21, 18, 24, 30, 8, 9, 15, 11, 20, 22, 16, 14, 25, 28, 19, 13, 17],
    'weight_kg': [10.5, 9.8, 8.9, 7.5, 8.0, 7.2, 6.8, 9.9, 10.1, 8.3, 9.0, 7.7, 7.3, 8.5, 9.2, 7.1, 6.9, 7.8, 8.7, 8.2],
    'frame_material': [0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1],  # 0 = aluminum, 1 = carbon
    'price_usd': [300, 450, 600, 700, 650, 800, 900, 320, 310, 560, 470, 680, 720, 590, 430, 810, 880, 670, 520, 600]
}

original_df = pd.DataFrame(data)

# Number of synthetic rows to generate
num_synthetic_rows = 7502

# Summary statistics of the numerical seed columns.
# Only mean_gear / std_gear are consumed by the generation code in this cell.
mean_gear = original_df['gear_count'].mean()
std_gear = original_df['gear_count'].std()
mean_weight = original_df['weight_kg'].mean()
std_weight = original_df['weight_kg'].std()
mean_price = original_df['price_usd'].mean()
std_price = original_df['price_usd'].std()

# Candidate values for the columns that do not exist in the seed sample
bike_brands = ['Giant', 'Specialized', 'Trek', 'Scott', 'Cannondale', 'Canyon', 'Cube', 'Merida', 'Orbea', 'BH']
years = list(range(2018, 2026))  # 2018 through 2025 inclusive
gear_brands = ['Shimano', 'SRAM', 'Campagnolo']
wheel_diameters_mm = [680, 690, 700, 710, 720, 730]
shifting_types = [0, 1]  # 0 = Mechanical, 1 = Electronic
brake_types = [0, 1]  # 0 = Rim/Caliper, 1 = Disc

# Seed NumPy's global RNG so that re-running the cell reproduces the same data
np.random.seed(42)

# Frame material is drawn with the proportions observed in the seed sample.
# reindex() pins the probability order to the codes handed to np.random.choice and
# yields probability 0 (instead of a length mismatch) if a code were missing.
frame_material_codes = [0, 1]
frame_material_probs = (
    original_df['frame_material']
    .value_counts(normalize=True)
    .reindex(frame_material_codes, fill_value=0.0)
    .values
)
synthetic_frame_material = np.random.choice(frame_material_codes, size=num_synthetic_rows, p=frame_material_probs)
is_aluminum = synthetic_frame_material == 0  # every other row is carbon

# Gear count: normal around the seed mean/std, rounded and clipped to the seed range
synthetic_gear_count = np.random.normal(loc=mean_gear, scale=std_gear, size=num_synthetic_rows).round().astype(int)
synthetic_gear_count = np.clip(synthetic_gear_count, original_df['gear_count'].min(), original_df['gear_count'].max())

# The three material-dependent columns below are generated with one vectorized draw
# each instead of a Python loop over rows. Every draw still happens once per row, in
# row order, which is intended to leave the seeded results unchanged.

# Weight (kg): aluminum ~ N(9.5, 0.5), carbon ~ N(7.5, 0.7), clipped to the seed range
weight_loc = np.where(is_aluminum, 9.5, 7.5)
weight_scale = np.where(is_aluminum, 0.5, 0.7)
synthetic_weight_kg = np.random.normal(loc=weight_loc, scale=weight_scale)
synthetic_weight_kg = np.clip(synthetic_weight_kg, original_df['weight_kg'].min(), original_df['weight_kg'].max())

# Electronic shifting: rare on aluminum (5%), more common on carbon (30%).
# A uniform draw at or above P(mechanical) selects the second entry of shifting_types.
p_mechanical = np.where(is_aluminum, 0.95, 0.7)
shifting_draw = np.random.random_sample(num_synthetic_rows)
synthetic_electronic_shifting = np.asarray(shifting_types)[(shifting_draw >= p_mechanical).astype(int)]

# Brake type depends on model year and frame material
synthetic_year = np.random.choice(years, size=num_synthetic_rows)
# NOTE(review): `years` starts at 2018, so with the current configuration every row
# is in the disc era and the pre-disc-era probabilities (0.6 / 0.8) are never used.
# Widen `years` or move the cutoff if rim-brake-era bikes are wanted.
DISC_BRAKE_ERA_START = 2018
in_disc_era = synthetic_year >= DISC_BRAKE_ERA_START
p_rim = np.where(
    in_disc_era,
    np.where(is_aluminum, 0.3, 0.1),  # disc era: mostly disc brakes, especially carbon
    np.where(is_aluminum, 0.8, 0.6),  # earlier: rim brakes were more common
)
brake_draw = np.random.random_sample(num_synthetic_rows)
synthetic_brake_type = np.asarray(brake_types)[(brake_draw >= p_rim).astype(int)]

# Remaining categorical features are sampled uniformly from their candidate lists
synthetic_bike_brand = np.random.choice(bike_brands, size=num_synthetic_rows)
synthetic_gear_brand = np.random.choice(gear_brands, size=num_synthetic_rows)
synthetic_wheel_diameter_mm = np.random.choice(wheel_diameters_mm, size=num_synthetic_rows)

# Function to calculate price based on features with more realistic adjustments
def calculate_price(row):
    """Return the unrounded synthetic price in USD for a single bike.

    The price is a material-dependent base price scaled by a chain of
    multiplicative adjustments.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or a dict)
        Must provide 'frame_material', 'year', 'gear_count', 'weight_kg',
        'electronic_shifting', 'brake_type', 'wheel_diameter_mm' and
        'gear_brand'.

    Returns
    -------
    float
        Base price multiplied by every adjustment factor.
    """
    # Frame material code 0 is aluminum; any other code is priced as carbon.
    # The material fixes the base price and how strongly low weight is rewarded.
    if row['frame_material'] == 0:
        base_price, reference_weight, weight_rate = 800, 10, 0.02
    else:
        base_price, reference_weight, weight_rate = 1500, 8, 0.05

    # Groupset brand premium; anything other than Shimano/SRAM is priced as Campagnolo
    groupset = row['gear_brand']
    if groupset == 'Shimano':
        brand_multiplier = 1.0
    elif groupset == 'SRAM':
        brand_multiplier = 1.1
    else:
        brand_multiplier = 1.3

    # Factors are listed in the order in which they are multiplied in
    multipliers = (
        1 + 0.05 * (row['year'] - 2018),                         # newer bikes cost more
        1 + 0.02 * (row['gear_count'] - 12),                     # more gears cost more
        1 + (reference_weight - row['weight_kg']) * weight_rate,  # lighter costs more
        1.4 if row['electronic_shifting'] == 1 else 1.0,         # electronic shifting premium
        1.2 if row['brake_type'] == 1 else 1.0,                  # disc brake premium
        1 + (row['wheel_diameter_mm'] - 700) * 0.002,            # wheel size relative to 700 mm
        brand_multiplier,
    )

    final_price = base_price
    for factor in multipliers:
        final_price = final_price * factor

    return final_price

# Assemble the generated feature arrays into one DataFrame (column order is the CSV order)
synthetic_columns = {
    'gear_count': synthetic_gear_count,
    'weight_kg': synthetic_weight_kg,
    'frame_material': synthetic_frame_material,
    'bike_brand': synthetic_bike_brand,
    'year': synthetic_year,
    'gear_brand': synthetic_gear_brand,
    'wheel_diameter_mm': synthetic_wheel_diameter_mm,
    'electronic_shifting': synthetic_electronic_shifting,
    'brake_type': synthetic_brake_type,
}
synthetic_df_final = pd.DataFrame(synthetic_columns)

# Price each row from its features, then round to whole dollars
row_prices = synthetic_df_final.apply(calculate_price, axis=1)
synthetic_df_final['price_usd'] = row_prices.round().astype(int)

# Clipping prices to a range derived from the seed sample is currently disabled;
# the disabled code is kept here for reference:
# min_price = original_df['price_usd'].min()
# max_price = original_df['price_usd'].max() * 1.5  # Allow some higher prices for newer models
# synthetic_df_final['price_usd'] = np.clip(synthetic_df_final['price_usd'], min_price, max_price)

# Report the sizes of both frames, then preview the head, tail and dtypes of the result
print(f"Original DataFrame size: {len(original_df)} rows")
print(f"Synthetic DataFrame (final) size: {len(synthetic_df_final)} rows")
for caption, preview in (
    ("\nPrimeras 5 filas de datos sintéticos extendidos:\n", synthetic_df_final.head()),
    ("\nÚltimas 5 filas de datos sintéticos extendidos:\n", synthetic_df_final.tail()),
    ("\nTipos de datos de las columnas del DataFrame extendido:\n", synthetic_df_final.dtypes),
):
    print(caption, preview)

# Write the synthetic dataset to CSV without the index column
synthetic_df_final.to_csv('precios_bicis.csv', index=False)

Original DataFrame size: 20 rows
Synthetic DataFrame (final) size: 7502 rows

Primeras 5 filas de datos sintéticos extendidos:
    gear_count  weight_kg  frame_material bike_brand  year  gear_brand  \
0          21  10.012387               0      Giant  2023  Campagnolo   
1          17   8.136256               1         BH  2018        SRAM   
2           7   6.802058               1       Cube  2025        SRAM   
3          11   6.800000               1      Orbea  2018     Shimano   
4          21   9.294413               0       Trek  2025     Shimano   

   wheel_diameter_mm  electronic_shifting  brake_type  price_usd  
0                710                    0           1       1877  
1                680                    0           1       2077  
2                690                    1           1       3498  
3                720                    1           1       2722  
4                700                    0           1       1551  

Últimas 5 filas de datos sinté

In [11]:
synthetic_df_final

Unnamed: 0,gear_count,weight_kg,frame_material,bike_brand,year,gear_brand,wheel_diameter_mm,electronic_shifting,brake_type,price_usd
0,21,10.012387,0,Giant,2023,Campagnolo,710,0,1,1877
1,17,8.136256,1,BH,2018,SRAM,680,0,1,2077
2,7,6.802058,1,Cube,2025,SRAM,690,1,1,3498
3,11,6.800000,1,Orbea,2018,Shimano,720,1,1,2722
4,21,9.294413,0,Trek,2025,Shimano,700,0,1,1551
...,...,...,...,...,...,...,...,...,...,...
7497,19,7.970909,1,Merida,2025,SRAM,710,0,1,3113
7498,11,9.038626,0,Cube,2023,Campagnolo,720,0,1,1621
7499,19,7.498385,1,Scott,2021,SRAM,710,0,1,2714
7500,13,7.946962,1,BH,2025,Campagnolo,710,0,1,3295
