In [72]:
import pandas as pd
import gzip
import gdown
import os

original_file_id = "1Jt5cnymhL_OLdFyF9YZNZGH6EWUVGe1I"
texas_file_id = "1gyBuhTcea5WgUOa5idJlvj5WiM8xejCG"
virginia_file_id = "1bMKS1mMrdRT8rEvLyg_WqeCRpdg9RP9I"
california_file_id = "1X942vMl8jntbRs7heqYa22HFb8-q6M3T"
five_hunderdk_file_id = "1sG69mVymv4WC9ddreESc87Pl9P6gU8Ye"

output_path = f"../data/accident_data.csv"

def load_data(file_id):
    if not os.path.exists(output_path):
        gdown.download(f"https://drive.google.com/uc?id={file_id}", output_path, quiet=False)

    return pd.read_csv(output_path, sep=",", low_memory=False)  # Prevent dtype warnings

def load_local_data(path):
    return pd.read_csv(path, sep=",", low_memory=False)  # Prevent dtype warnings

def cut_data():
    input_csv = "/Users/santoshkumar/MSAAI/statistics/project/US_Accidents_March23.csv"
    output_csv = "/Users/santoshkumar/MSAAI/statistics/project/latest_2mn_records.csv"

    # Define parameters
    num_records = 2_000_000
    chunk_size = 100_000  # Adjust based on memory

    # List to store chunks
    chunks_list = []

    # Read file in chunks
    for chunk in pd.read_csv(input_csv, chunksize=chunk_size):
        chunks_list.append(chunk)  # Append to the list

        # Keep only the last 1 million records to avoid memory bloat
        if len(chunks_list) > (num_records // chunk_size) + 1:
            chunks_list.pop(0)  # Remove the oldest chunk

    # Combine only necessary data
    latest_records = pd.concat(chunks_list).tail(num_records)

    # Save to CSV
    latest_records.to_csv(output_csv, index=False)

    print(f"Stored latest {num_records} records in {output_csv}")

# # cut_data()
# data = load_local_data("/Users/santoshkumar/MSAAI/statistics/project/US_Accidents_March23.csv")
# print(data.columns)
# print(data['State'].value_counts().head(10))
# data.head(100)

california_data = load_data(california_file_id)



In [68]:
# florida_data = data[data['State'] == 'VA']
# florida_data.to_csv('/Users/santoshkumar/MSAAI/statistics/project/virginia.csv', index=False)

In [74]:
print(california_data.columns)

# Group by city and count the number of accidents
accidents_by_city = california_data['Start_Time'].value_counts().head(10)

# Display the top 10 cities with the most accidents
print(accidents_by_city)

Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')
Start_Time
2022-04-26 16:14:30    54
2020-11-10 13:09:30    46
2022-08-09 06:42:51    45
2018-11-12 00:37:27    40
2021-04-14 10:40:43    38
2021-02-10 23:01:30    38
2022-07-29 07:36:45    35
2020-11-17 13:25:30    35
2016-04-10 08:59:26    35
2022-08-08 14

In [76]:
# Group by county and count the number of accidents
accidents_by_county = california_data['County'].value_counts().head(10)

# Display the top 10 counties with the most accidents
print(accidents_by_county)

County
Los Angeles       526851
San Bernardino    109631
Orange            107580
San Diego         104165
Alameda            98553
Sacramento         95377
Riverside          88248
Santa Clara        70482
Contra Costa       51338
Kern               37614
Name: count, dtype: int64


In [77]:
# Filter the california_data dataframe for Alameda county
alameda_data = california_data[california_data['County'] == 'Alameda']

# Display the new dataframe
print(alameda_data[alameda_data['Zipcode'] == '94588'].value_counts().head(10))

ID         Source   Severity  Start_Time           End_Time             Start_Lat  Start_Lng    End_Lat    End_Lng      Distance(mi)  Description                                                                Street   City        County   State  Zipcode  Country  Timezone    Airport_Code  Weather_Timestamp    Temperature(F)  Wind_Chill(F)  Humidity(%)  Pressure(in)  Visibility(mi)  Wind_Direction  Wind_Speed(mph)  Precipitation(in)  Weather_Condition  Amenity  Bump   Crossing  Give_Way  Junction  No_Exit  Railway  Roundabout  Station  Stop   Traffic_Calming  Traffic_Signal  Turning_Loop  Sunrise_Sunset  Civil_Twilight  Nautical_Twilight  Astronomical_Twilight
A-3530658  Source1  2         2017-01-12 05:41:53  2017-01-12 11:41:53  37.701296  -121.866939  37.701170  -121.854690  0.670         Between Santa Rita Rd/Tassajara Rd and El Charro Rd/Fallon Rd - Accident.  I-580 E  Pleasanton  Alameda  CA     94588    US       US/Pacific  KLVK          2017-01-12 05:53:00  45.0            41.8 

In [78]:
# Group by severity and weather condition, and count the number of accidents
severity_weather_classification = alameda_data.groupby(['Severity', 'Weather_Condition', "Visibility(mi)"]).size().reset_index(name='Count')

# Display the classification
print(severity_weather_classification)

# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 6))
# plt.bar(severity_weather_classification['Weather_Condition'], severity_weather_classification['Count'])
# plt.xlabel('Weather Condition')
# plt.ylabel('Count')
# plt.title('Severity vs. Weather Condition in Alameda County')
# plt.show()

     Severity Weather_Condition  Visibility(mi)  Count
0           1             Clear            10.0      7
1           1            Cloudy             5.0      1
2           1            Cloudy             7.0      3
3           1            Cloudy             8.0      8
4           1            Cloudy             9.0      4
..        ...               ...             ...    ...
577         4       Shallow Fog            10.0      1
578         4             Smoke             5.0      1
579         4             Smoke             6.0      1
580         4             Smoke             9.0      1
581         4             Smoke            10.0      4

[582 rows x 4 columns]


In [79]:
import folium

severity_weather_classification_sorted = severity_weather_classification.sort_values(by='Severity')
print(severity_weather_classification_sorted)
# Create a map centered around Alameda County
# map_alameda = folium.Map(location=[37.77, -122.27], zoom_start=10)

# Add markers for each accident location
# for index, row in alameda_data.iterrows():
#     folium.Marker([row['Start_Lat'], row['Start_Lng']], popup=row['Description']).add_to(map_alameda)

# Display the map
# map_alameda

     Severity   Weather_Condition  Visibility(mi)  Count
0           1               Clear            10.0      7
21          1          Light Rain            10.0     24
22          1  Light Rain / Windy             5.0      1
23          1  Light Rain / Windy             9.0      1
24          1       Mostly Cloudy             9.0      3
..        ...                 ...             ...    ...
546         4       Light Drizzle             8.0      1
547         4          Light Rain             2.0      1
548         4          Light Rain             3.0      1
550         4          Light Rain             5.0      3
581         4               Smoke            10.0      4

[582 rows x 4 columns]
