In [135]:
import pandas as pd
import altair as alt
from altair import datum

alt.data_transformers.disable_max_rows()
collisions = pd.read_csv("../data/preprocessed-colisions.csv")
collisions['datetime'] = pd.to_datetime(collisions['CRASH_DATE'] + ' ' + collisions['CRASH_TIME'], format='%m/%d/%Y %H:%M')
collisions['day_week'] = collisions['datetime'].dt.day_name()
collisions['type_day'] = collisions['day_week'].apply(lambda day: 'Weekend' if day in ['Saturday', 'Sunday'] else 'Weekday')
collisions.head()

Unnamed: 0,CRASH_DATE,CRASH_TIME,BOROUGH,ZIP_CODE,LATITUDE,LONGITUDE,TOTAL_INJURED,TOTAL_KILLED,PEDESTRIANS_INJURED,PEDESTRIANS_KILLED,...,CONTRIBUTING_FACTOR_VEHICLE1,CONTRIBUTING_FACTOR_VEHICLE2,CONTRIBUTING_FACTOR_VEHICLE3,CONTRIBUTING_FACTOR_VEHICLE4,CONTRIBUTING_FACTOR_VEHICLE5,VEHICLE_TYPE_CODE1,VEHICLE_TYPE_CODE2,datetime,day_week,type_day
0,09/06/2020,18:05,,,40.771038,-73.83413,0.0,1.0,0,0,...,Unsafe Lane Changing,Following Too Closely,,,,Station Wagon/Sport Utility Vehicle,Motorcycle,2020-09-06 18:05:00,Sunday,Weekend
1,09/20/2020,9:14,,,40.722095,-73.77772,0.0,0.0,0,0,...,Driver Inattention/Distraction,Unspecified,,,,Sedan,Pickup,2020-09-20 09:14:00,Sunday,Weekend
2,09/24/2020,22:00,MANHATTAN,10039.0,40.824757,-73.94052,0.0,0.0,0,0,...,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,Driver Inattention/Distraction,,,,Sedan,Motorscooter,2020-09-24 22:00:00,Thursday,Weekday
3,08/06/2020,5:30,,,,,1.0,0.0,0,0,...,Unspecified,Unspecified,,,,Sedan,Station Wagon/Sport Utility Vehicle,2020-08-06 05:30:00,Thursday,Weekday
4,08/01/2020,17:16,BRONX,10462.0,40.840508,-73.85515,0.0,1.0,0,0,...,Failure to Yield Right-of-Way,Unspecified,,,,Sedan,E-Bike,2020-08-01 17:16:00,Saturday,Weekend


### At what time of the day are accidents more common?

To examine the temporal patterns of accidents throughout the day, we will employ a line chart. The x-axis will represent hours, with the y-axis indicating the corresponding number of accidents. Opting for a line chart enables a clear depiction of how accident frequencies evolve over time. We will differentiate the data by year, using distinct colors for 2018 and 2020, providing a comparative analysis.

In [136]:
c31 = alt.Chart(collisions).mark_line(strokeWidth=2, point=True).encode(
    alt.X('hours(datetime):O').title('Time of Day'),
    alt.Y('count():Q').title('Number of Collisions'),
    color= alt.Color('year(datetime):O', scale = alt.Scale(domain=[2018, 2020], range=['steelblue', '#ff7f0e']))
)

# c31

To enhance the visualization, we will see that the total number of collisions by hour. These encoding make it challenging to intuitively grasp the frequency of accidents for each hour each day. To address this, we will refine the visualization by encoding the average number of accidents of each day, accompanied by an error bar indicating the standard deviation so that we can assess the variance of the data.

In [137]:
c32 = alt.Chart(collisions).mark_line(strokeWidth=2, point=True).encode(
    x = alt.X('hours:Q').title('Time of day'),
    y = alt.Y('avg:Q').title('Average number of collisions'),
    color = alt.Color('year:O', scale = alt.Scale(domain=[2018, 2020], range=['steelblue', '#ff7f0e']))
).transform_calculate(
  year = 'year(datum.datetime)',
  hours = 'hours(datum.datetime)'
).transform_aggregate(
   count='count()',
   groupby=['year', 'hours', 'CRASH_DATE']
).transform_aggregate(
    avg = 'mean(count)',
    groupby=['year', 'hours']
)

c33 = alt.Chart(collisions).mark_errorbar(ticks=True).encode(
    x=alt.X('hours:Q'),
    y=alt.Y('count:Q',axis=alt.Axis(title=None)).scale(zero=False),
    color = alt.Color('year:O', scale = alt.Scale(domain=[2018, 2020], range=['steelblue', '#ff7f0e']))
).transform_calculate(
  year = 'year(datum.datetime)',
  hours = 'hours(datum.datetime)'
).transform_aggregate(
   count='count()',
   groupby=['year', 'hours', 'CRASH_DATE']
)

# (c32 + c33).properties(width=600, height=400)

Upon analyzing the hourly collisions, a clear trend emerges: higher collision rates during the day and lower rates during the night. This pattern aligns with the increased presence of cars on the road during daylight hours and decreased activity during nighttime. Further we can distinguish different patterns between morning, afternoon, and evening periods. Mornings exhibit fewer collisions, likely attributed to work-related activities, whereas afternoons register higher incidents, potentially linked to leisure activities and transporting children to extracurricular activities. Evenings witness a decline in collisions as people conclude their activities and return home.

We can further enhance our chart by introducing an additional variable to glean more insights. One pivotal factor of high importance is the total number of kills. It's crucial not only to identify peak collision times throughout the day but also to comprehend the magnitude of the human cost associated with these incidents. These variable will be encoded through the line thickness, with thicker lines indicating a higher number of deaths.

In [138]:
c34 = alt.Chart(collisions).mark_trail().encode(
    x = alt.X('hours:Q').title('Time of day'),
    y = alt.Y('avg_collisions:Q').title('Average number of collisions'),
    color = alt.Color('year:O', scale = alt.Scale(domain=[2018, 2020], range=['steelblue', '#ff7f0e'])).title('Year'),
    size = alt.Size('avg_killed:Q').title('Average killed')
).transform_calculate(
  year = 'year(datum.datetime)',
  hours = 'hours(datum.datetime)'
).transform_aggregate(
   count_collisions='count()',
   count_killed='sum(TOTAL_KILLED)',
   groupby=['year', 'hours', 'CRASH_DATE']
).transform_aggregate(
    avg_collisions='mean(count_collisions)',
    avg_killed='mean(count_killed)',
    groupby=['year', 'hours']
)

# (c34 + c33).properties(width=600, height=400).properties(title='Average collisions and killings over time')

We finally achieved the final version of the graph. This visualization facilitates the identification of peak accident times. While the period with the highest collision frequency occurs around 16:00, instances of more severe outcomes, particularly deaths, are notable at 20:00 and 04:00 in 2018, and between 19:00 and 00:00, as well as at 04:00 in 2020. The deaths in the late night coincide with the times when people are returning home after socializing, often under the influence of alcohol, which make the accidents more dangerous.

*At what time of the day are accidents more common?*

In chart C3 you can see a line chart with the average accidents per hour along the different years. We use a different color for each year and line thickness to encode the killed people. You can see that the accidents are more common during the afternoon, having the peak at 16:00, and the killed people are more common during the evening and late night.

### Is there a correlation between weather conditions and accidents?

Before starting to create visualizations it is necessary to choose the attributes of the 'weather.csv' dataset. Furthermore, since we have one row per day in the weather dataset, we need to group the number of collisions per day in order to merge the two datasets appropriately. 

In [139]:
weather_original = pd.read_csv("../data/weather.csv")
weather = weather_original[['datetime', 'temp', 'precip', 'windspeed', 'humidity', 'cloudcover', 'conditions', 'visibility']]
weather['datetime'] = pd.to_datetime(weather['datetime'])
weather.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather['datetime'] = pd.to_datetime(weather['datetime'])


Unnamed: 0,datetime,temp,precip,windspeed,humidity,cloudcover,conditions,visibility
0,2018-06-01,21.6,0.282,12.6,86.8,65.9,"Rain, Partially cloudy",11.3
1,2018-06-02,25.1,0.346,22.3,74.0,35.4,"Rain, Partially cloudy",15.8
2,2018-06-03,17.0,2.929,24.1,75.0,92.7,"Rain, Overcast",15.6
3,2018-06-04,16.8,3.91978,16.7,76.6,71.6,"Rain, Partially cloudy",15.4
4,2018-06-05,19.8,0.0,25.9,60.7,35.7,Partially cloudy,16.0


In [140]:
coll_weather = pd.DataFrame({'datetime': collisions["CRASH_DATE"]})
coll_weather['datetime'] = pd.to_datetime(coll_weather['datetime'])
coll_weather = coll_weather.groupby(['datetime']).size().reset_index(name='collisions')
coll_weather = pd.merge(coll_weather, weather, on='datetime')
coll_weather['year'] = coll_weather['datetime'].dt.year
coll_weather.head()

Unnamed: 0,datetime,collisions,temp,precip,windspeed,humidity,cloudcover,conditions,visibility,year
0,2018-06-01,751,21.6,0.282,12.6,86.8,65.9,"Rain, Partially cloudy",11.3,2018
1,2018-06-02,622,25.1,0.346,22.3,74.0,35.4,"Rain, Partially cloudy",15.8,2018
2,2018-06-03,525,17.0,2.929,24.1,75.0,92.7,"Rain, Overcast",15.6,2018
3,2018-06-04,698,16.8,3.91978,16.7,76.6,71.6,"Rain, Partially cloudy",15.4,2018
4,2018-06-05,688,19.8,0.0,25.9,60.7,35.7,Partially cloudy,16.0,2018


In [141]:
# change the overcast conditions to rain
coll_weather['conditions'] = coll_weather['conditions'].apply(lambda x: 'Rain, Overcast' if x=='Overcast' else x)

# devide the coll_weather into two parts: 2018 and 2020
coll_weather_2018 = coll_weather[coll_weather['year']==2018]
coll_weather_2020 = coll_weather[coll_weather['year']==2020]

Now it is time to create graphs to see if there is any correlation between weather condiditon and accidents.

In [142]:
alt.Chart(coll_weather, width=500).transform_window(
    index='count()'
).transform_fold(
    ['temp', 'precip', 'windspeed', 'humidity', 'cloudcover', 'visibility']
).mark_line().encode(
    x='key:N',
    y='value:Q',
    color='year:N',
    detail='index:N',
    opacity=alt.value(0.5)
)

In [143]:
custom_sort_order = ['collisions', 'visibility', 'windspeed', 'temp', 'humidity', 'cloudcover']


alt.Chart(coll_weather).transform_window(
    index='count()'
).transform_fold(
    ['temp', 'windspeed', 'collisions', 'humidity', 'cloudcover', 'visibility']
).transform_joinaggregate(
     min='min(value)',
     max='max(value)',
     groupby=['key']
).transform_calculate(
    minmax_value=(datum.value-datum.min)/(datum.max-datum.min),
    mid=(datum.min+datum.max)/2
).mark_line().encode(
    x=alt.X('key:N', sort=custom_sort_order),  # Use the custom sort order
    y='minmax_value:Q',
    color='year:N',
    detail='index:N',
    opacity=alt.value(0.5)
).properties(width=500)

In [144]:
import altair as alt

# Assuming coll_weather_2018 and coll_weather_2020 are your data for the respective years

custom_sort_order = ['windspeed', 'collisions', 'visibility', 'temp', 'humidity', 'cloudcover']

# Chart for coll_weather_2018
chart_2018 = alt.Chart(coll_weather_2018).transform_window(
    index='count()'
).transform_fold(
    ['temp', 'windspeed', 'collisions', 'humidity', 'cloudcover', 'visibility']
).transform_joinaggregate(
     min='min(value)',
     max='max(value)',
     groupby=['key']
).transform_calculate(
    minmax_value=(datum.value-datum.min)/(datum.max-datum.min),
    mid=(datum.min+datum.max)/2
).mark_line().encode(
    x=alt.X('key:N', sort=custom_sort_order),  # Use the custom sort order
    y='minmax_value:Q',
    color=alt.value('steelblue'),  # Set the color to blue
    detail='index:N',
    opacity=alt.value(0.5)
).properties(width=500, title='2018')

# Chart for coll_weather_2020
chart_2020 = alt.Chart(coll_weather_2020).transform_window(
    index='count()'
).transform_fold(
    ['temp', 'windspeed', 'collisions', 'humidity', 'cloudcover', 'visibility']
).transform_joinaggregate(
     min='min(value)',
     max='max(value)',
     groupby=['key']
).transform_calculate(
    minmax_value=(datum.value-datum.min)/(datum.max-datum.min),
    mid=(datum.min+datum.max)/2
).mark_line().encode(
    x=alt.X('key:N', sort=custom_sort_order),  # Use the custom sort order
    y='minmax_value:Q',
    color=alt.value('#ff7f0e'),  # Set the color to orange
    detail='index:N',
    opacity=alt.value(0.5)
).properties(width=500, title='2020')

# Combine the two charts side by side
combined_chart = alt.hconcat(chart_2018, chart_2020)
combined_chart

In [145]:
import altair as alt
from vega_datasets import data

base = alt.Chart(
    data.iris.url
).transform_window(
    index="count()"
).transform_fold(
    ["petalLength", "petalWidth", "sepalLength", "sepalWidth"]
).transform_joinaggregate(
    min="min(value)",
    max="max(value)",
    groupby=["key"]
).transform_calculate(
    norm_val="(datum.value - datum.min) / (datum.max - datum.min)",
    mid="(datum.min + datum.max) / 2"
).properties(width=600, height=300)

lines = base.mark_line(opacity=0.3).encode(
    x='key:N',
    y=alt.Y('norm_val:Q', axis=None),
    color="species:N",
    detail="index:N",
    tooltip=["petalLength:N", "petalWidth:N", "sepalLength:N", "sepalWidth:N"]
)

rules = base.mark_rule(
    color="#ccc", tooltip=None
).encode(
    x="key:N",
    detail="count():Q",
)

def ytick(yvalue, field):
    scale = base.encode(x='key:N', y=alt.value(yvalue), text=f"min({field}):Q")
    return alt.layer(
        scale.mark_text(baseline="middle", align="right", dx=-5, tooltip=None),
        scale.mark_tick(size=8, color="#ccc", orient="horizontal", tooltip=None)
    )

alt.layer(
    lines, rules, ytick(0, "max"), ytick(150, "mid"), ytick(300, "min")
).configure_axisX(
    domain=False, labelAngle=0, tickColor="#ccc", title=None
).configure_view(
    stroke=None
)


Heatmap

In [146]:
alt.Chart(coll_weather).mark_rect().encode(
    x=alt.X('temp:O', bin=True),  # Binarize temperature
    y=alt.Y('visibility:O', bin=True),  # Binarize windspeed
    color='average(collisions):Q',
)

In [147]:
alt.Chart(coll_weather).mark_point(opacity = 0.5, filled = True).encode(
    alt.X('temp:Q').title('Average Daily Temperature (C)').scale(domain=[15, 31]),
    #alt.X('windspeed:Q').title('Avearge Daily Windspeed (km/h)').scale(domain=[8, 45]),
    #alt.X('humidity:Q').title('Average Daily Humidity (%)').scale(domain=[40, 95]),
    alt.Size('visibility:Q').title('Average Daily Visibility (km)').scale(domain=[11, 16]),
    #alt.Size('precip:Q').title('Average Daily Precipitation (mm)').scale(domain=[0, 50]),
    alt.Color('conditions').title('Weather Conditions'),
    alt.Y('collisions').title('Number of Collisions').scale(domain=[150, 900]),
    alt.Shape('year:N').title('Year')
).properties(
    width=600,
    height=400
)

In [148]:
# Chart for 2018
chart_2018 = alt.Chart(coll_weather_2018).mark_point(opacity=0.5, filled=True).encode(
    alt.X('temp:Q').title('Average Daily Temperature (C)').scale(domain=[15, 31]),
    alt.Size('visibility:Q').title('Average Daily Visibility (km)').scale(domain=[11, 16]),
    alt.Color('conditions').title('Weather Conditions'),
    alt.Y('collisions').title('Number of Collisions').scale(domain=[350, 900]),
).properties(
    title='Collisions and Weather Conditions in 2018',
    width=600,
    height=400
)

# Chart for 2020
chart_2020 = alt.Chart(coll_weather_2020).mark_point(opacity=0.5, filled=True).encode(
    alt.X('temp:Q').title('Average Daily Temperature (C)').scale(domain=[15, 31]),
    alt.Size('visibility:Q').title('Average Daily Visibility (km)').scale(domain=[11, 16]),
    alt.Color('conditions').title('Weather Conditions'),
    alt.Y('collisions').title('Number of Collisions').scale(domain=[150, 500]),
).properties(
    title='Collisions and Weather Conditions in 2020',
    width=600,
    height=400
)

# Display the charts side by side
chart_2018 | chart_2020

In [149]:
# describe of the number or accidents in 2020 and 2018
coll_weather_2018.describe()
coll_weather_2020.describe()

Unnamed: 0,datetime,collisions,temp,precip,windspeed,humidity,cloudcover,visibility,year
count,122,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0
mean,2020-07-31 12:00:00,298.008197,24.351639,1.53109,19.418033,67.492623,28.47377,15.772131,2020.0
min,2020-06-01 00:00:00,196.0,15.9,0.0,11.0,40.8,0.4,11.9,2020.0
25%,2020-07-01 06:00:00,275.5,22.7,0.0,15.9,61.05,9.7,15.8,2020.0
50%,2020-07-31 12:00:00,300.0,24.7,0.0,18.4,68.5,23.9,16.0,2020.0
75%,2020-08-30 18:00:00,320.0,26.4,0.33025,22.25,75.55,47.1,16.0,2020.0
max,2020-09-30 00:00:00,451.0,30.4,31.034,43.9,90.2,93.6,16.0,2020.0
std,,40.817776,3.09503,4.436601,5.397888,11.312636,23.241544,0.511239,0.0


In [161]:
alt.Chart(coll_weather).mark_bar().encode(
    y=alt.Y('conditions:N', sort='-x', title='Weather Conditions'),
    x=alt.X('collisions:Q', axis=alt.Axis(title='Number of Collisions'))
)

In [158]:
alt.Chart(coll_weather).mark_bar().encode(
    y=alt.Y('conditions:N', sort='-x', title='Weather Conditions'),
    x=alt.X('average_collisions_condition:Q', axis=alt.Axis(title='Average Number of Collisions per Day')),
).transform_aggregate(
    total_days_condition='count()',
    total_collisions_condiditon='sum(collisions)',
    groupby=['conditions']
).transform_calculate(
    average_collisions_condition='datum.total_collisions_condiditon / datum.total_days_condition'
)


In [152]:
alt.Chart(coll_weather, width=100).transform_density(
    'collisions',
    as_=['collisions', 'density'],
    extent=[0, 1200],
    groupby=['conditions']
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('collisions:Q'),
    alt.Color('conditions:N'),
    alt.Column('conditions:N')
        .spacing(0)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0)
).configure_view(
    stroke=None
)

In [153]:
violin_right = (
    alt.Chart(coll_weather, width=100)
    .transform_density(
        "collisions",
        as_=["collisions", "density"],
        extent=[0, 1200],
        groupby=["conditions"]
    )
    .mark_area(orient="horizontal")
    .encode(
        alt.X("density:Q")
            .impute(None)
            .title(None)
            .axis(labels=False, grid=False, ticks=True),
        alt.Y("collisions:Q"),
        alt.Color("conditions:N")
    )
)

violin_left = (
    violin_right
    .copy()
    .transform_calculate(density="-datum.density")
)

boxplot = (
    alt.Chart(coll_weather, width=100)
    .mark_boxplot(outliers=False, size=10, extent=20)
    .encode(y="collisions:Q", color=alt.value("black"))
)

chart = (
    alt.layer(violin_left, violin_right,boxplot)
    .facet(alt.Column("conditions:N"))
    .configure_view(stroke=None)
)
chart

In [154]:
boxplot = alt.Chart().mark_boxplot(color='black').encode(
    alt.Y(f'collisions:Q')
).properties(width=100)

violin = alt.Chart().transform_density(
    'collisions',
    as_=['collisions', 'density'],
    extent=[0, 1000],
    groupby=['conditions']
).mark_area(orient='horizontal').encode(
    y='collisions:Q',
    color=alt.Color('conditions:N', legend=None, scale=alt.Scale(scheme='set2')),
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        scale=alt.Scale(nice=False, zero=False),
        axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
    ),
).properties(
    width=100,
    height=400
)

facet = lambda coll_weather, title: alt.layer(violin, boxplot, data=coll_weather).facet(column='conditions:N').\
    resolve_scale(x=alt.ResolveMode("independent")).properties(title=alt.TitleParams(text=title, anchor="middle", align="center"))

alt.hconcat(facet(coll_weather_2018, "Summer 2018"),facet(coll_weather_2020, "Sumer 2020")).configure_facet(
    spacing=0,
).configure_header(
    titleOrient='bottom',
    labelOrient='bottom'
).configure_view(
    stroke=None
).properties(
    title='Collisions and Weather Conditions in 2018 and 2020',
)

In [155]:
boxplot = alt.Chart().mark_boxplot(color='black').encode(
    alt.Y(f'collisions:Q')
).properties(width=100)

violin = alt.Chart().transform_density(
    'collisions',
    as_=['collisions', 'density'],
    extent=[0, 1000],
    groupby=['conditions']
).mark_area(orient='horizontal').encode(
    y='collisions:Q',
    color=alt.Color('conditions:N', legend=None, scale=alt.Scale(scheme='set2')),
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        scale=alt.Scale(nice=False, zero=False),
        axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
    ),
).properties(
    width=100,
    height=400
).transform_filter(
    # conditions = "Rain, Overcast" and year = 2020 do not work because it only has one data point
    (alt.datum.conditions != "Rain, Overcast") & (alt.datum.year == 2020)
)



facet = lambda coll_weather, title: alt.layer(violin, boxplot, data=coll_weather).facet(column='conditions:N').\
    resolve_scale(x=alt.ResolveMode("independent")).properties(title=alt.TitleParams(text=title, anchor="middle", align="center"))

alt.hconcat(facet(coll_weather_2018, "Summer 2018"),facet(coll_weather_2020, "Sumer 2020")).configure_facet(
    spacing=0,
).configure_header(
    titleOrient='bottom',
    labelOrient='bottom'
).configure_view(
    stroke=None
).properties(
    title='Collisions and Weather Conditions in 2018 and 2020',
)

In [156]:
gaussian_jitter = alt.Chart(coll_weather, title='Normally distributed jitter').mark_circle(size=20).encode(
    y="conditions:N",
    x="collisions:Q",
    yOffset="jitter:Q",
    color=alt.Color('conditions:N').legend(None),
    shape=alt.Shape('year:N')
).transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
).properties(
    width=300, height=200
)

uniform_jitter = gaussian_jitter.transform_calculate(
    # Generate uniform jitter
    jitter='random()'
).encode(
    alt.Y('conditions:N').axis(None)
).properties(
    title='Uniformly distributed jitter',
    width=300, height=200,
)

(gaussian_jitter | uniform_jitter).resolve_scale(yOffset='independent')

In [157]:
# create a df from source = data.movies.url
df = pd.read_json(source)
df.head()

ValueError: Expected object or value