<a href="https://colab.research.google.com/github/brendanpshea/data-science/blob/main/data/MakeZombies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import sqlite3

# Seed NumPy's global generator so every run produces the same dataset
np.random.seed(52179)

# Pool of candidate attack dates: one entry per day in [start_date, end_date)
start_date = datetime(2030, 1, 1)
end_date = datetime(2040, 1, 1)
n_days = (end_date - start_date).days
dates = [start_date + timedelta(days=offset) for offset in range(n_days)]

# Number of synthetic attack records to draw
n_samples = 1000

# Value pools for the categorical columns
locations = ['Minneapolis', 'Chicago', 'Winnipeg', 'Milwaukee', 'Madison',
             'Des Moines', 'Fargo', 'Rochester', 'St. Louis', 'Kansas City']
zombie_types = ['Walker', 'Runner', 'Crawler', 'Jumper']
zombie_type_weights = [0.5, 0.3, 0.15, 0.05]  # aligned with zombie_types; sums to 1
weather_conditions = ['Sunny', 'Rainy', 'Cloudy', 'Foggy', 'Stormy']
moon_phases = ['New Moon', 'Waxing Crescent', 'First Quarter', 'Waxing Gibbous',
               'Full Moon', 'Waning Gibbous', 'Last Quarter', 'Waning Crescent']

# One random draw per column. The entries are evaluated top to bottom and all
# share the seeded global generator, so reordering them changes the output.
data = {
    'Date': np.random.choice(dates, size=n_samples),
    'Location': np.random.choice(locations, size=n_samples),
    'ZombieType': np.random.choice(zombie_types, size=n_samples, p=zombie_type_weights),
    # Negative binomial: over-dispersed, non-negative integer counts
    'VictimCount': np.random.negative_binomial(10, 0.5, size=n_samples),
    # Beta(2, 5): values in (0, 1), skewed toward low survival
    'SurvivalRate': np.random.beta(2, 5, size=n_samples),
    'WeatherCondition': np.random.choice(weather_conditions, size=n_samples),
    'MoonPhase': np.random.choice(moon_phases, size=n_samples),
    # Normal around 20 with a standard deviation of 7
    'TemperatureCelsius': np.random.normal(20, 7, size=n_samples),
    'HumidityPercent': np.random.uniform(30, 90, size=n_samples),
    # Exponential with mean 10: mostly light wind, occasional strong gusts
    'WindSpeedKmh': np.random.exponential(10, size=n_samples),
    # Log-normal: strictly positive with a long right tail
    'PopulationDensity': np.random.lognormal(5, 1, size=n_samples),
    # Gamma(shape=2, scale=5): positive, right-skewed response times
    'EmergencyResponseTime': np.random.gamma(2, 5, size=n_samples),
}

# Assemble the columns into a single table
df = pd.DataFrame(data)

# Add some correlations and adjustments for more realism

# Runners cause about 50% more victims. (3 * n + 1) // 2 is 1.5 * n rounded
# half up, done in integer arithmetic so VictimCount stays a whole-number
# count instead of picking up fractional victims such as 7.5.
is_runner = df['ZombieType'] == 'Runner'
df['VictimCount'] = np.where(is_runner,
                             (3 * df['VictimCount'] + 1) // 2,
                             df['VictimCount'])

# Better survival with quicker response: attacks answered faster than the
# median response time get a 20% boost (values above 1 are clipped later).
quick_response = df['EmergencyResponseTime'] < df['EmergencyResponseTime'].median()
df['SurvivalRate'] = np.where(quick_response,
                              df['SurvivalRate'] * 1.2,
                              df['SurvivalRate'])

# Sunnier days are warmer
df['TemperatureCelsius'] = np.where(df['WeatherCondition'] == 'Sunny',
                                    df['TemperatureCelsius'] + 5,
                                    df['TemperatureCelsius'])

# Add correlation between temperature and month.
# The previous linear term (Month - 6) * 2 rose steadily from January (-10)
# to December (+12), which made December the warmest month. An annual cosine
# gives the intended cycle: coldest in January, warmest in July. The amplitude
# of 11 keeps the same overall spread (22 degrees) as the old linear term.
df['Month'] = df['Date'].dt.month
seasonal_amplitude = 11
seasonal_offset = -seasonal_amplitude * np.cos(2 * np.pi * (df['Month'] - 1) / 12)
df['TemperatureCelsius'] = df['TemperatureCelsius'] + seasonal_offset

# Add city-specific weather patterns (offset in degrees Celsius per city)
city_temp_adjustment = {
    'Minneapolis': -5,
    'Chicago': -3,
    'Winnipeg': -10,
    'Milwaukee': -4,
    'Madison': -4,
    'Des Moines': 0,
    'Fargo': -8,
    'Rochester': -6,
    'St. Louis': 2,
    'Kansas City': 3
}

# Vectorised lookup instead of a row-by-row apply. A Location missing from
# the dict would map to NaN here rather than raising KeyError.
df['TemperatureCelsius'] = df['TemperatureCelsius'] + df['Location'].map(city_temp_adjustment)




# Keep every numeric column inside a realistic range.
# Each entry maps column -> (lower, upper); None leaves that side unbounded.
value_bounds = {
    'VictimCount': (0, None),
    'SurvivalRate': (0, 1),
    'TemperatureCelsius': (-30, 45),  # wide enough for the seasonal and city shifts
    'HumidityPercent': (0, 100),
    'WindSpeedKmh': (0, 200),
    'PopulationDensity': (1, None),
    'EmergencyResponseTime': (1, None),
}
for column, (lowest, highest) in value_bounds.items():
    df[column] = df[column].clip(lower=lowest, upper=highest)

# Trim the continuous columns to a reasonable number of decimal places
decimal_places = {
    'SurvivalRate': 4,
    'TemperatureCelsius': 1,
    'HumidityPercent': 1,
    'WindSpeedKmh': 1,
    'PopulationDensity': 1,
    'EmergencyResponseTime': 1,
}
for column, places in decimal_places.items():
    df[column] = df[column].round(places)

# Function to introduce random nulls
def insert_random_nulls(df, probability=0.01):
    """Return a copy of ``df`` with cells independently replaced by nulls.

    Every cell, in every column, is blanked with the given probability,
    independently of all other cells. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to copy and blank cells in.
    probability : float, default 0.01
        Chance in [0, 1] that any individual cell becomes null. ``0`` leaves
        the data unchanged; ``1`` blanks every cell.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape, index and columns as ``df``. Columns
        that receive a null may change dtype to one that can hold missing
        values (for example integer columns become float).
    """
    # One uniform draw per cell from NumPy's global generator, so a prior
    # np.random.seed(...) call still makes the result reproducible. This
    # replaces the per-cell DataFrame.applymap call, which is deprecated
    # since pandas 2.1 and much slower than a vectorised mask.
    null_mask = np.random.rand(*df.shape) < probability
    return df.mask(null_mask)

# Apply random nulls
df = insert_random_nulls(df)

# Save to CSV
df.to_csv('zombie_attacks.csv', index=False)

# Save to SQLite. The connection is closed in a finally block so the database
# file is released even if the write fails; using the connection as a context
# manager would only manage the transaction and would not close it.
conn = sqlite3.connect('zombie_attacks.db')
try:
    # Replace any existing ZombieAttacks table so re-running the cell is safe
    df.to_sql('ZombieAttacks', conn, if_exists='replace', index=False)
finally:
    conn.close()

print("Data generated and saved to 'zombie_attacks.csv' and 'zombie_attacks.db'")


Data generated and saved to 'zombie_attacks.csv' and 'zombie_attacks.db'


In [None]:
# Summary statistics for the generated dataset. Per-column counts can fall
# below n_samples because null values were randomly injected before saving.
df.describe()

Unnamed: 0,Date,VictimCount,SurvivalRate,TemperatureCelsius,HumidityPercent,WindSpeedKmh,PopulationDensity,EmergencyResponseTime,Month
count,991,994.0,992.0,993.0,994.0,987.0,992.0,992.0,990.0
mean,2023-06-26 06:48:18.890010112,11.496479,0.319554,18.228399,59.333199,10.492503,253.028024,9.8625,6.367677
min,2023-01-01 00:00:00,0.0,0.0099,-12.6,30.0,0.0,4.8,1.0,1.0
25%,2023-03-29 00:00:00,7.5,0.183525,10.2,44.525,2.85,77.15,4.8,3.0
50%,2023-06-24 00:00:00,10.5,0.29455,18.1,58.85,7.1,151.15,8.2,6.0
75%,2023-09-21 12:00:00,15.0,0.439925,25.5,74.475,14.7,317.875,13.3,9.0
max,2023-12-31 00:00:00,39.0,0.883,45.0,90.0,81.3,4691.1,50.7,12.0
std,,5.767634,0.174349,10.872145,17.462086,10.294276,321.245288,6.797707,3.419767
