In [2]:
# PURPOSE OF THIS CELL: clean raw data and set the dataframe to the csv of cleaned data
#
# Steps: load data.csv -> normalise the two timestamp columns -> keep only
# Florida accidents that start and end in 2023 or later -> drop incomplete
# rows -> write clean-data.csv -> reload that file into `df`.
import pandas as pd

# TIP: make sure the Car Accidents data file is called "data.csv" locally
df = pd.read_csv('data.csv')

# Reformatting the start and end times to allow for filtering:
# keep only the text before the first '.' (drops any fractional-second
# suffix), then parse the strings into datetimes so `.dt` can be used below.
for time_col in ('Start_Time', 'End_Time'):
    df[time_col] = pd.to_datetime(df[time_col].str.split('.').str[0])

# Filtering the data to focus on accidents that both start and end in 2023 or later
in_2023_or_later = (df['Start_Time'].dt.year >= 2023) & (df['End_Time'].dt.year >= 2023)

# Filter to focus on Florida (state code "FL")
in_florida = df["State"] == "FL"

# Apply both conditions in a single pass; this selects the same rows, in the
# same order, as applying the filters one after another.
df = df[in_2023_or_later & in_florida]

# Removes all records with an empty cell
# NOTE(review): dropna() with default arguments drops a row when ANY column is
# missing, including columns the later cells never read -- confirm that this
# much data loss is intended.
new_df = df.dropna()

# Saves the cleaned data to its own csv file
new_df.to_csv('clean-data.csv', index=False)

print('num rows after cleaning: ', len(new_df))

# Setting the data frame to the cleaned data
# NOTE(review): read_csv is called without parse_dates, so Start_Time and
# End_Time come back as plain strings here, not datetimes. Pass
# parse_dates=['Start_Time', 'End_Time'] if later cells need datetime values.
df = pd.read_csv('clean-data.csv')

num rows after cleaning:  21209


In [16]:
# Sanity check: list the distinct severity levels present in the cleaned data.
severity_levels = df["Severity"].unique()
print(severity_levels)

# Preview the first rows of the cleaned frame (rendered as the cell's output).
df.head()

[2 4]


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-3650842,Source1,2,2023-03-24 16:50:00,2023-03-24 19:45:00,29.019024,-81.219254,29.018383,-81.224112,0.297,...,False,False,False,False,True,False,Day,Day,Day,Day
1,A-3650866,Source1,2,2023-03-24 19:36:30,2023-03-24 23:59:00,28.742182,-81.298842,28.740334,-81.300644,0.168,...,False,False,False,False,True,False,Day,Day,Day,Day
2,A-3651206,Source1,2,2023-02-27 09:05:24,2023-02-27 15:05:23,30.2538,-81.512367,30.25308,-81.516114,0.229,...,False,False,False,False,False,False,Day,Day,Day,Day
3,A-3651472,Source1,2,2023-02-27 07:31:04,2023-02-27 13:31:03,28.568379,-81.286192,28.568183,-81.286191,0.014,...,False,False,False,False,True,False,Day,Day,Day,Day
4,A-3651551,Source1,2,2023-02-27 00:32:52,2023-02-27 06:32:51,27.429327,-82.575542,27.428724,-82.575547,0.042,...,False,False,False,False,False,False,Night,Night,Night,Night


In [17]:
# Pull the start coordinates and the severity of every accident out of the
# dataframe and turn them into a 2-D numpy array (one row per accident).
# Source: https://stackoverflow.com/questions/31789160/convert-select-columns-in-pandas-dataframe-to-numpy-array
point_columns = ['Start_Lat', 'Start_Lng', 'Severity']
latlng = df.loc[:, point_columns]

# NOTE(review): to_numpy() uses one common dtype for all selected columns, so
# the integer Severity values presumably come out as floats next to the
# float coordinates -- confirm if an integer severity is needed downstream.
array = latlng.to_numpy()

# The row count should equal the number of rows in the cleaned dataframe.
print('have all values?', array.shape[0])

have all values? 21209


In [18]:
import json


def _point_feature(lat, lng, severity):
    """Build one GeoJSON Point feature for a single (lat, lng, severity) row."""
    return {
        "type": "Feature",
        # Coordinates are written longitude first, then latitude.
        "geometry": {
            "type": "Point",
            "coordinates": [lng, lat]
        },
        # Severity is stored as a one-element list, exactly as before.
        "properties": {
            "severity": [severity]
        }
    }


# Wrap every row of `array` in a single GeoJSON FeatureCollection.
geojson = {
    "type": "FeatureCollection",
    "features": [_point_feature(*row) for row in array]
}

# Write the collection to disk as indented JSON.
# Source: https://stackoverflow.com/questions/12309269/how-do-i-write-json-data-to-a-file
with open("data.json", "w") as out_file:
    json.dump(geojson, out_file, indent=2)

# Both counts should be equal: one feature per input row.
print('the length of array', len(array), 'should match length of features', len(geojson["features"]))

the length of array 21209 should match length of features 21209
