In [3]:
import pandas as pd
import numpy as np
from faker import Faker

In [4]:
# One seed shared by Faker and NumPy keeps the synthetic data reproducible;
# naming it once prevents the two generators from drifting apart.
SEED = 42

fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)

# Number of synthetic regions to generate
num_points = 1000

In [13]:
import pandas as pd
import numpy as np
from faker import Faker

# Single seed used for both Faker and NumPy so every run of this cell
# reproduces the same synthetic data.
SEED = 42

# Initialize Faker for realistic synthetic data (optional)
fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)

# Number of data points
num_points = 1000

# Generate region_id: consecutive integer ids 1..num_points
region_ids = np.arange(1, num_points + 1)

# ---------------------------
# 1. demographics.csv
# ---------------------------
def generate_demographics(region_ids, faker_instance=None):
    """Create the synthetic demographics table, one row per region.

    Parameters
    ----------
    region_ids : sequence of int
        Identifiers used for the ``region_id`` column. Its length decides
        how many rows are generated.
    faker_instance : object, optional
        Any object exposing a ``city()`` method, used to produce region
        names. Defaults to the module-level ``fake`` instance.

    Returns
    -------
    pandas.DataFrame
        Columns: region_id, region_name, population, population_density,
        age_median, households, vehicles_per_household.

    Notes
    -----
    Numeric columns are drawn from NumPy's global random state in the order
    they appear below; call ``np.random.seed`` beforehand for reproducible
    output. Every column is an independent draw, so e.g. ``households`` is
    not constrained to be smaller than ``population``.
    """
    # Size every column from the ids actually passed in rather than from the
    # global num_points; otherwise a region_ids of any other length makes the
    # DataFrame constructor fail on mismatched column lengths.
    n_regions = len(region_ids)
    name_source = fake if faker_instance is None else faker_instance

    demographics = pd.DataFrame({
        'region_id': region_ids,
        'region_name': [name_source.city() for _ in range(n_regions)],
        'population': np.random.randint(1000, 100000, size=n_regions),
        'population_density': np.round(np.random.uniform(100, 10000, size=n_regions), 2),  # people per sq km
        'age_median': np.random.randint(20, 80, size=n_regions),  # median age
        'households': np.random.randint(300, 50000, size=n_regions),
        'vehicles_per_household': np.round(np.random.uniform(0, 5, size=n_regions), 2)
    })
    return demographics

# ---------------------------
# 2. economic_indicators.csv
# ---------------------------
def generate_economic_indicators(region_ids):
    """Create the synthetic economic-indicators table, one row per region.

    Parameters
    ----------
    region_ids : sequence of int
        Identifiers used for the ``region_id`` column. Its length decides
        how many rows are generated.

    Returns
    -------
    pandas.DataFrame
        Columns: region_id, average_income, unemployment_rate,
        GDP_per_capita, economic_growth_rate. Every indicator is an
        independent uniform draw rounded to two decimals.

    Notes
    -----
    Values come from NumPy's global random state; seed it with
    ``np.random.seed`` beforehand for reproducible output.
    """
    # Row count follows the ids passed in, not the global num_points, so the
    # function works for any number of regions.
    n_regions = len(region_ids)

    economic = pd.DataFrame({
        'region_id': region_ids,
        'average_income': np.round(np.random.uniform(20000, 100000, size=n_regions), 2),  # in USD
        'unemployment_rate': np.round(np.random.uniform(0, 20, size=n_regions), 2),  # in %
        'GDP_per_capita': np.round(np.random.uniform(10000, 50000, size=n_regions), 2),  # in USD
        'economic_growth_rate': np.round(np.random.uniform(-5, 10, size=n_regions), 2)  # in %
    })
    return economic

# ---------------------------
# 3. transport_infrastructure.csv
# ---------------------------
def generate_transport_infrastructure(region_ids):
    """Create the synthetic transport-infrastructure table, one row per region.

    Parameters
    ----------
    region_ids : sequence of int
        Identifiers used for the ``region_id`` column. Its length decides
        how many rows are generated.

    Returns
    -------
    pandas.DataFrame
        Columns: region_id, public_transit_routes, public_transit_frequency,
        road_density, congestion_level, cycling_paths_length,
        pedestrian_paths_length, parking_spaces. Every column is an
        independent random draw.

    Notes
    -----
    Values come from NumPy's global random state; seed it with
    ``np.random.seed`` beforehand for reproducible output.
    """
    # Row count follows the ids passed in, not the global num_points, so the
    # function works for any number of regions.
    n_regions = len(region_ids)

    transport = pd.DataFrame({
        'region_id': region_ids,
        'public_transit_routes': np.random.randint(1, 100, size=n_regions),
        'public_transit_frequency': np.round(np.random.uniform(1, 30, size=n_regions), 2),  # trips per hour
        'road_density': np.round(np.random.uniform(1, 500, size=n_regions), 2),  # km of road per sq km
        'congestion_level': np.round(np.random.uniform(0, 100, size=n_regions), 2),  # index
        'cycling_paths_length': np.round(np.random.uniform(0, 100, size=n_regions), 2),  # in km
        'pedestrian_paths_length': np.round(np.random.uniform(0, 100, size=n_regions), 2),  # in km
        'parking_spaces': np.random.randint(100, 10000, size=n_regions)
    })
    return transport

# ---------------------------
# 4. public_transit_demand.csv
# ---------------------------
def _min_max_normalize(series):
    """Rescale a numeric Series to the [0, 1] range; a constant Series maps to all zeros."""
    low = series.min()
    span = series.max() - low
    centered = series - low
    if span == 0:
        # Every value equals the minimum, so `centered` is already all zeros;
        # dividing by the zero span would turn it into NaN instead.
        return centered.astype(float)
    return centered / span


def generate_public_transit_demand(region_ids, demographics, economic, transport):
    """Derive a synthetic transit-demand figure for each region.

    Demand is a weighted sum of min-max normalised features: population
    (0.5), average income (0.2), number of transit routes (0.15), transit
    frequency (0.1) and inverted congestion (0.05). The sum is scaled by
    100000, rounded to an integer and floored at 100.

    Parameters
    ----------
    region_ids : sequence of int
        Identifiers written to the ``region_id`` column of the result.
    demographics : pandas.DataFrame
        Must contain a ``population`` column.
    economic : pandas.DataFrame
        Must contain an ``average_income`` column.
    transport : pandas.DataFrame
        Must contain ``public_transit_routes``, ``public_transit_frequency``
        and ``congestion_level`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns: region_id, transit_demand (integer).

    Notes
    -----
    The feature Series are combined with ordinary pandas arithmetic, so the
    three input frames are expected to share the same index. A feature that
    is constant across all regions (including the single-region case)
    normalises to 0 rather than producing NaN; for congestion, which is
    inverted, that yields the full congestion term.
    """
    # Normalize relevant features
    population_norm = _min_max_normalize(demographics['population'])
    income_norm = _min_max_normalize(economic['average_income'])
    transit_routes_norm = _min_max_normalize(transport['public_transit_routes'])
    transit_freq_norm = _min_max_normalize(transport['public_transit_frequency'])
    # Inverted: the least congested region scores 1, the most congested 0
    congestion_norm = 1 - _min_max_normalize(transport['congestion_level'])

    # Weighted sum to simulate demand
    demand = (population_norm * 0.5) + (income_norm * 0.2) + (transit_routes_norm * 0.15) + (transit_freq_norm * 0.1) + (congestion_norm * 0.05)

    # Scale to realistic ridership numbers
    transit_demand = np.round(demand * 100000).astype(int)

    # Ensure demand is at least some minimum value
    transit_demand = np.where(transit_demand < 100, 100, transit_demand)

    public_transit_demand = pd.DataFrame({
        'region_id': region_ids,
        'transit_demand': transit_demand
    })
    return public_transit_demand

# ---------------------------
# Generate All Datasets
# ---------------------------
# The three independent tables are built first, in this fixed order, and the
# demand table is derived from them afterwards.
demographics = generate_demographics(region_ids)
economic_indicators = generate_economic_indicators(region_ids)
transport_infrastructure = generate_transport_infrastructure(region_ids)
public_transit_demand = generate_public_transit_demand(
    region_ids,
    demographics,
    economic_indicators,
    transport_infrastructure,
)

# ---------------------------
# Save to CSV Files
# ---------------------------
# Each table goes to its own file in the working directory, without the index.
csv_outputs = {
    'demographics.csv': demographics,
    'economic_indicators.csv': economic_indicators,
    'transport_infrastructure.csv': transport_infrastructure,
    'public_transit_demand.csv': public_transit_demand,
}
for csv_name, table in csv_outputs.items():
    table.to_csv(csv_name, index=False)

print("Synthetic datasets generated and saved as CSV files successfully!")


Synthetic datasets generated and saved as CSV files successfully!


In [12]:
# Inspect the demographics table built by the generation cell.
# generate_demographics() is deliberately NOT called again here: a second call
# would draw fresh values from the already-advanced random state and rebind
# `demographics` to a table that no longer matches the saved demographics.csv.
demographics

Unnamed: 0,region_id,region_name,population,population_density,age_median,households,vehicles_per_household
0,1,North Judithbury,16795,2448.55,62,47038,4.87
1,2,East Jill,1860,1107.65,34,38729,0.14
2,3,New Roberttown,77820,1613.31,41,6442,4.20
3,4,East Jessetown,55886,2534.98,79,12409,3.76
4,5,Lake Debra,7265,1690.75,56,22310,1.14
...,...,...,...,...,...,...,...
995,996,Port Wendybury,76703,4030.12,33,17376,4.22
996,997,Kellyburgh,77090,4527.31,55,1405,1.12
997,998,East Brittany,18260,6045.88,72,15717,1.20
998,999,Robintown,4812,5205.23,27,25029,2.56


In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [15]:
# Read the four generated CSV files back in, one DataFrame per file
demographics, economic, infrastructure, transit_demand = (
    pd.read_csv(csv_name)
    for csv_name in (
        'demographics.csv',
        'economic_indicators.csv',
        'transport_infrastructure.csv',
        'public_transit_demand.csv',
    )
)


In [16]:
# Print a heading followed by the first five rows of every loaded table
table_previews = [
    ("Demographics Data:", demographics),
    ("\nEconomic Indicators Data:", economic),
    ("\nTransport Infrastructure Data:", infrastructure),
    ("\nPublic Transit Demand Data:", transit_demand),
]
for heading, table in table_previews:
    print(heading)
    print(table.head())


Demographics Data:
   region_id       region_name  population  population_density  age_median  \
0          1  North Judithbury       16795             2448.55          62   
1          2         East Jill        1860             1107.65          34   
2          3    New Roberttown       77820             1613.31          41   
3          4    East Jessetown       55886             2534.98          79   
4          5        Lake Debra        7265             1690.75          56   

   households  vehicles_per_household  
0       47038                    4.87  
1       38729                    0.14  
2        6442                    4.20  
3       12409                    3.76  
4       22310                    1.14  

Economic Indicators Data:
   region_id  average_income  unemployment_rate  GDP_per_capita  \
0          1        23523.23              15.36        36573.57   
1          2        75466.81              12.57        23735.11   
2          3        71858.87              12

In [18]:
# Start from the demographics table and join each remaining table onto it
# in turn, matching rows on 'region_id'
data = demographics
for other_table in (economic, infrastructure, transit_demand):
    data = data.merge(other_table, on='region_id')

# Show the first rows of the combined table
print("Merged Data:")
data.head()


Merged Data:


Unnamed: 0,region_id,region_name,population,population_density,age_median,households,vehicles_per_household,average_income,unemployment_rate,GDP_per_capita,economic_growth_rate,public_transit_routes,public_transit_frequency,road_density,congestion_level,cycling_paths_length,pedestrian_paths_length,parking_spaces,transit_demand
0,1,North Judithbury,16795,2448.55,62,47038,4.87,23523.23,15.36,36573.57,4.04,83,14.11,71.82,16.71,93.11,27.04,190,30083
1,2,East Jill,1860,1107.65,34,38729,0.14,75466.81,12.57,23735.11,-3.39,37,25.86,153.06,69.85,36.15,29.7,3472,29898
2,3,New Roberttown,77820,1613.31,41,6442,4.2,71858.87,12.67,10763.97,2.62,66,2.4,411.44,78.03,39.69,36.04,6136,63379
3,4,East Jessetown,55886,2534.98,79,12409,3.76,38518.41,9.01,37591.65,6.03,55,5.31,110.15,33.01,17.41,98.88,2042,45493
4,5,Lake Debra,7265,1690.75,56,22310,1.14,74508.24,10.62,47819.79,9.88,77,13.01,122.24,32.23,37.82,78.3,3905,35962


In [19]:
# Count nulls per column before any cleaning
missing_before = data.isnull().sum()
print("Missing Values in Each Column:")
print(missing_before)

# Handling strategy: keep it simple and discard any row containing a null
data = data.dropna()

# Re-count to confirm nothing is missing any more
missing_after = data.isnull().sum()
print("\nAfter Dropping Missing Values:")
print(missing_after)


Missing Values in Each Column:
region_id                   0
region_name                 0
population                  0
population_density          0
age_median                  0
households                  0
vehicles_per_household      0
average_income              0
unemployment_rate           0
GDP_per_capita              0
economic_growth_rate        0
public_transit_routes       0
public_transit_frequency    0
road_density                0
congestion_level            0
cycling_paths_length        0
pedestrian_paths_length     0
parking_spaces              0
transit_demand              0
dtype: int64

After Dropping Missing Values:
region_id                   0
region_name                 0
population                  0
population_density          0
age_median                  0
households                  0
vehicles_per_household      0
average_income              0
unemployment_rate           0
GDP_per_capita              0
economic_growth_rate        0
public_transit_routes   

In [20]:
# 'region_id' and 'region_name' only identify a row, so remove them
identifier_columns = ['region_id', 'region_name']
data = data.drop(columns=identifier_columns)
