In [1]:
import pandas as pd
import altair as alt

In [2]:
collisions = pd.read_csv("../data/preprocessed-colisions.csv")
collisions.head(1)
# len(collisions)

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,TOTAL_INJURED,TOTAL_KILLED,PEDESTRIANS_INJURED,PEDESTRIANS_KILLED,...,CYCLIST_KILLED,MOTORIST_INJURED,MOTORIST_KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2
0,09/06/2020,18:05,,,40.771038,-73.83413,0.0,1.0,0,0,...,0,0,1,Unsafe Lane Changing,Following Too Closely,,,,Station Wagon/Sport Utility Vehicle,Motorcycle


In [3]:
print(f'The dataset has {len(collisions)} rows and {len(collisions.columns)} columns')

The dataset has 115740 rows and 21 columns


## Visualize the data

### Visualization 1

In [4]:
collisions1 = pd.DataFrame({'CRASH DATE': pd.to_datetime(collisions["CRASH DATE"])})
collisions1['day'] = collisions1['CRASH DATE'].dt.day_name()
collisions1['year'] = collisions1['CRASH DATE'].dt.year
collisions1 = collisions1.groupby(['day', 'year']).count().reset_index(names=['day', 'year'])

collisions1 = collisions1.rename(columns={'CRASH DATE': 'count'})
collisions1.head()

Unnamed: 0,day,year,count
0,Friday,2018,12887
1,Friday,2020,5487
2,Monday,2018,11141
3,Monday,2020,5167
4,Saturday,2018,10694


In [5]:
chart1 = alt.Chart(collisions1).mark_bar().encode(
    x='year:O',
    y=alt.Y('count:Q', title='Number of Collisions'),
    color='year:N',
    column=alt.Column('day:N', title='Day of the Week', 
    sort=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
)

chart1

### Visualization 2

In [15]:
collisions['VEHICLE TYPE CODE 1'].value_counts().head(20)

VEHICLE TYPE CODE 1
Sedan                                  54025
Station Wagon/Sport Utility Vehicle    40658
Taxi                                    4806
Pickup                                  3479
Box truck                               2325
Bike                                    1938
Bus                                     1420
Truck                                   1087
Motorcycle                              1051
Van                                      836
Ambulance                                467
Convertible                              411
Dump                                     325
E-Scooter                                240
Flat bed                                 232
Garbage                                  193
Others                                   156
Carry All                                146
E-Bike                                   146
Moped                                    135
Name: count, dtype: int64

In [24]:
collisions2 = pd.DataFrame({'vehicle_type': collisions['VEHICLE TYPE CODE 1']})
collisions2 = collisions2.groupby(['vehicle_type']).size().reset_index(name='count')
collisions2 = collisions2.sort_values(by=['count'], ascending=False)
collisions2.head(10)

Unnamed: 0,vehicle_type,count
31,Sedan,54025
33,Station Wagon/Sport Utility Vehicle,40658
35,Taxi,4806
28,Pickup,3479
5,Box truck,2325
4,Bike,1938
6,Bus,1420
39,Truck,1087
25,Motorcycle,1051
41,Van,836


In [31]:
alt.Chart(collisions2.iloc[:20]).mark_bar().encode(
  x = 'count:Q',
  y = alt.Y('vehicle_type:O', sort='-x', title = 'Vehicle Type'), 
) + alt.Chart(collisions2.iloc[:20]).mark_rule(color='red').encode(
  x = 'mean(count):Q'
)

### Visualization 3

In [21]:
collisions3 = pd.DataFrame({'CRASH TIME': pd.to_datetime(collisions["CRASH TIME"]), 
                            'CRASH DATE': pd.to_datetime(collisions["CRASH DATE"])})

collisions3 = pd.DataFrame({'hour': (collisions3['CRASH TIME'].apply(lambda x: str(x.hour))),
                            'year': collisions3['CRASH DATE'].dt.year})

collisions3['hour'] = collisions3['hour'].apply(lambda x: int(x))

collisions3 = collisions3.groupby(['hour', 'year']).size().reset_index(name='counts')
collisions3.head()

  collisions3 = pd.DataFrame({'CRASH TIME': pd.to_datetime(collisions["CRASH TIME"]),


Unnamed: 0,hour,year,counts
0,0,2018,2937
1,0,2020,1689
2,1,2018,1245
3,1,2020,923
4,2,2018,934


In [33]:
alt.Chart(collisions3).mark_line().encode(
    x='hour:O',
    y='counts:Q',
    color='year:N'
)

### Visualization 4

In [60]:
collisions4 = collisions[['LATITUDE', 'LONGITUDE', 'BOROUGH']]
collisions4 = collisions4.dropna()
collisions4.head()

Unnamed: 0,LATITUDE,LONGITUDE,BOROUGH
2,40.824757,-73.94052,MANHATTAN
4,40.840508,-73.85515,BRONX
5,40.744232,-73.861275,QUEENS
7,40.67743,-73.87591,BROOKLYN
8,40.824898,-73.927635,BRONX


In [63]:
alt.data_transformers.disable_max_rows()

map = alt.topo_feature('../data/NY_map.geojson', 'NY_map')

NYmap = alt.Chart(map).mark_geoshape().properties(
    width=500,
    height=500
)

accidents = alt.Chart(collisions4).transform_aggregate(
    latitude='mean(LATITUDE)',
    longitude='mean(LONGITUDE)',
    count='count()',
    groupby=['BOROUGH'] 
).mark_circle(size=50).encode(
    latitude='latitude:Q',
    longitude='longitude:Q',
    size=alt.Size('count:Q', title='Number of Accidents'),
    color=alt.value('red')
).properties(
    width=500,
    height=500
)

NYmap + accidents

### Visualization 5

In [8]:
collisions5 = pd.DataFrame({'datetime': pd.to_datetime(collisions["CRASH DATE"])})
collisions5 = collisions5.groupby(['datetime']).size().reset_index(name='collisions') 
collisions5.head()

Unnamed: 0,datetime,collisions
0,2018-06-01,751
1,2018-06-02,622
2,2018-06-03,525
3,2018-06-04,698
4,2018-06-05,688


In [9]:
weather_original = pd.read_csv("../data/weather.csv")
weather = weather_original[['datetime', 'temp', 'precip']]
weather['datetime'] = pd.to_datetime(weather['datetime'])
weather.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather['datetime'] = pd.to_datetime(weather['datetime'])


Unnamed: 0,datetime,temp,precip
0,2018-06-01,21.6,0.282
1,2018-06-02,25.1,0.346
2,2018-06-03,17.0,2.929
3,2018-06-04,16.8,223.796
4,2018-06-05,19.8,0.0


In [10]:
collisions5 = pd.merge(collisions5, weather, on='datetime')
collisions5['year'] = collisions5['datetime'].dt.year
collisions5.head()

Unnamed: 0,datetime,collisions,temp,precip,year
0,2018-06-01,751,21.6,0.282,2018
1,2018-06-02,622,25.1,0.346,2018
2,2018-06-03,525,17.0,2.929,2018
3,2018-06-04,698,16.8,223.796,2018
4,2018-06-05,688,19.8,0.0,2018


In [11]:
alt.Chart(collisions5).mark_point(color = 'red').encode(
    x='temp:Q',
    y='collisions:Q',
    color='year:N'
)



# alt.Chart(collisions5).mark_point().encode(
#     alt.X(alt.repeat("column"), type='quantitative'),
#     alt.Y(alt.repeat("row"), type='quantitative'),
#     color='year:Q'
# ).repeat(
#     row = ['collisions'],
#     column = ['temperature', 'humidity', 'wind_speed']
# ).properties(
#     width=150,
#     height=150
# )

In [12]:
alt.Chart(collisions5).mark_point(color = 'red').encode(
    x='precip:Q',
    y='collisions:Q',
    color='year:N'
)