In [2]:
# PURPOSE OF THIS CELL: clean raw data and set the dataframe to the csv of cleaned data
import pandas as pd

# TIP: make sure the Car Accidents data file is called "data.csv" locally
RAW_CSV = 'data.csv'
CLEAN_CSV = 'clean-data.csv'
TARGET_STATE = 'FL'  # value matched against the "State" column
MIN_YEAR = 2023      # keep accidents that both start and end in this year or later

df = pd.read_csv(RAW_CSV)

# Reformatting the start and end times to allow for filtering:
# keep only the text before the first '.', then parse it as a datetime so the
# year can be compared below.
# NOTE(review): the '.' split presumably strips fractional seconds -- confirm against the raw file.
for time_col in ('Start_Time', 'End_Time'):
    df[time_col] = pd.to_datetime(df[time_col].str.split('.').str[0])

# One combined mask instead of three successive re-assignments of df:
# accidents from MIN_YEAR onwards that happened in TARGET_STATE.
in_scope = (
    (df['Start_Time'].dt.year >= MIN_YEAR)
    & (df['End_Time'].dt.year >= MIN_YEAR)
    & (df['State'] == TARGET_STATE)
)
df = df[in_scope]

# Removes all records with an empty cell (a missing value in ANY column drops the row)
new_df = df.dropna()

# Saves the cleaned data to its own csv file
new_df.to_csv(CLEAN_CSV, index=False)

print('num rows after cleaning: ', len(new_df))

# Setting the data frame to the cleaned data.
# The file is re-read without date parsing, so from here on Start_Time/End_Time
# hold plain text again rather than datetimes.
df = pd.read_csv(CLEAN_CSV)

num rows after cleaning:  21209


In [3]:
# Run this cell to get a better idea of the content of each column
# The output is best viewed if you click 'view as a scrollable element' after running it
from pandas.api.types import is_numeric_dtype, is_bool_dtype

def summarize_column(col, series, cat_threshold=50):
    """Print a short summary of one column, chosen by its dtype.

    Args:
        col: Label printed in the ``--- col ---`` header line.
        series: The column's values (a pandas Series).
        cat_threshold: For columns that are neither boolean nor numeric, the
            largest number of distinct values that is printed in full; above
            it only the first ``cat_threshold`` distinct values are shown.

    Returns:
        None. Everything is written to stdout, followed by one blank line.
    """
    print(f'--- {col} ---')

    # The boolean test has to come first: a bool column also passes the
    # numeric check and would otherwise be reported as min/max.
    if is_bool_dtype(series):
        print('Type: boolean')
        print('Counts:', series.value_counts(dropna=False).to_dict())
    elif is_numeric_dtype(series):
        print('Type: numeric')
        print('Min:', series.min())
        print('Max:', series.max())
    else:
        # Missing values count as their own distinct value here.
        distinct_count = series.nunique(dropna=False)
        if not distinct_count:
            print('No values')
        elif distinct_count > cat_threshold:
            print(f'Unique values count: {distinct_count} (showing first {cat_threshold}):')
            print(series.unique()[:cat_threshold])
        else:
            print(f'Unique values ({distinct_count}):', series.unique())

    print()

# Print one summary per column of df, in column order.
for col, column_values in df.items():
    summarize_column(col, column_values)

--- ID ---
Unique values count: 21209 (showing first 50):
['A-3650842' 'A-3650866' 'A-3651206' 'A-3651472' 'A-3651551' 'A-3652317'
 'A-3652630' 'A-3652642' 'A-3652661' 'A-3653754' 'A-3653820' 'A-3654670'
 'A-3654822' 'A-3654889' 'A-3655101' 'A-3655733' 'A-3655772' 'A-3655793'
 'A-3656047' 'A-3656095' 'A-3656448' 'A-3656459' 'A-3656572' 'A-3657336'
 'A-3657447' 'A-3658002' 'A-3658615' 'A-3658630' 'A-3658728' 'A-3658812'
 'A-3658835' 'A-3659153' 'A-3659243' 'A-3659771' 'A-3659999' 'A-3660357'
 'A-3660457' 'A-3661274' 'A-3661320' 'A-3661741' 'A-3661958' 'A-3662044'
 'A-3662117' 'A-3662143' 'A-3662163' 'A-3663060' 'A-3663552' 'A-3664098'
 'A-3664182' 'A-3664377']

--- Source ---
Unique values (1): ['Source1']

--- Severity ---
Type: numeric
Min: 2
Max: 4

--- Start_Time ---
Unique values count: 13121 (showing first 50):
['2023-03-24 16:50:00' '2023-03-24 19:36:30' '2023-02-27 09:05:24'
 '2023-02-27 07:31:04' '2023-02-27 00:32:52' '2023-03-31 16:08:14'
 '2023-02-26 19:26:17' '2023-02-26 21:

In [4]:
# Converting the selected columns (location, weather, time-of-day, severity and
# road-feature flags) to an array with one row per accident.
# Keep this order stable: the GeoJSON cell unpacks every row by position.
# Source: https://stackoverflow.com/questions/31789160/convert-select-columns-in-pandas-dataframe-to-numpy-array
property_columns = [
    # location
    'Start_Lat', 'Start_Lng',
    # weather
    'Temperature(F)', 'Humidity(%)', 'Precipitation(in)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Chill(F)', 'Wind_Speed(mph)', 'Wind_Direction',
    # time and daylight
    'Start_Time', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
    'Astronomical_Twilight',
    # outcome and nearest airport
    'Severity', 'Airport_Code',
    # road features
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
    'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
    'Traffic_Signal', 'Turning_Loop',
]
properties = df[property_columns]
array = properties.to_numpy()

# Sanity check: the row count should equal the number of cleaned records.
print('have all values?', len(array))

have all values? 21209


In [5]:
import json
# Build a GeoJSON FeatureCollection with one Point feature per row of `array`
# and write it to FLdata.json.
features = []

# Rows are unpacked by position, so these names must stay in the same order as
# the columns that were selected when `array` was created.
for (
    latitude, longitude,
    temperature, humidity, precipitation, pressure,
    visibility, wind_chill, wind_speed, wind_direction,
    start_time, sunrise_sunset, civil_twilight, nautical_twilight,
    astronomical_twilight,
    severity, airport_code,
    amenity, bump, crossing, give_way, junction, no_exit,
    railway, roundabout, station, stop, traffic_calming,
    traffic_signal, turning_loop,
) in array:
    feature = dict(
        type="Feature",
        # GeoJSON positions are written longitude first, then latitude.
        geometry=dict(type="Point", coordinates=[longitude, latitude]),
        properties=dict(
            temperature=temperature,
            humidity=humidity,
            precipitation=precipitation,
            pressure=pressure,
            visibility=visibility,
            windchill=wind_chill,
            windspeed=wind_speed,
            winddirection=wind_direction,
            starttime=start_time,
            sunrise_sunset=sunrise_sunset,
            civil_twilight=civil_twilight,
            nautical_twilight=nautical_twilight,
            astronomical_twilight=astronomical_twilight,
            severity=severity,
            airportcode=airport_code,
            amenity=amenity,
            bump=bump,
            crossing=crossing,
            giveway=give_way,
            junction=junction,
            no_exit=no_exit,
            railway=railway,
            roundabout=roundabout,
            station=station,
            stop=stop,
            traffic_calming=traffic_calming,
            traffic_signal=traffic_signal,
            turning_loop=turning_loop,
        ),
    )
    features.append(feature)

geojson = {
    "type": "FeatureCollection",
    "features": features,
}

with open("FLdata.json", "w") as f:
    json.dump(geojson, f, indent=2)

# Every input row should have produced exactly one feature.
print('the length of array', len(array), 'should match length of features', len(geojson["features"]))

#https://stackoverflow.com/questions/12309269/how-do-i-write-json-data-to-a-file

the length of array 21209 should match length of features 21209
