# Importing

In [4]:
import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np

In [5]:
%store -r df

In [6]:
df_cleaning = df

# Making Colum Names Lowercase

Just to make the columns easier to work with, we will make them all lowercase

In [13]:
df_cleaning.columns = df_cleaning.columns.str.lower()

# Dropping Useless Columns

Right off the bat, there are a few columns that we do not need for this analysis. Without any further investigation, we know that we do not need the information contained in the following columns: source, tmc, description, state, and country. 

In [15]:
df_cleaning = df_cleaning.drop(columns=['source',
                                        'tmc',
                                        'description',
                                        'state',
                                        'country'])

# Null Values

In [16]:
df_cleaning.isnull().sum()

id                            0
severity                      0
start_time                    0
end_time                      0
start_lat                     0
start_lng                     0
end_lat                  119090
end_lng                  119090
distance(mi)                  0
number                   110467
street                        0
side                          0
city                         27
county                        0
zipcode                       2
timezone                      2
airport_code                 30
weather_timestamp           681
temperature(f)             1084
wind_chill(f)             68542
humidity(%)                1121
pressure(in)               2070
visibility(mi)             1530
wind_direction             1687
wind_speed(mph)           15056
precipitation(in)         85178
weather_condition          1541
amenity                       0
bump                          0
crossing                      0
give_way                      0
junction

We will drop values that have more than 100k missing observations without any further investigation. There is limited time for this project, and those particular columns do not possess the most critical information

## Dropping Columns with Over 100k Null

The columns being dropped, which have null values totalling over 100k are the following: end_lat, end_lng and number

In [17]:
df_cleaning = df_cleaning.drop(columns=['end_lat',
                                        'end_lng',
                                        'number'])

In [18]:
df_cleaning.isnull().sum()

id                           0
severity                     0
start_time                   0
end_time                     0
start_lat                    0
start_lng                    0
distance(mi)                 0
street                       0
side                         0
city                        27
county                       0
zipcode                      2
timezone                     2
airport_code                30
weather_timestamp          681
temperature(f)            1084
wind_chill(f)            68542
humidity(%)               1121
pressure(in)              2070
visibility(mi)            1530
wind_direction            1687
wind_speed(mph)          15056
precipitation(in)        85178
weather_condition         1541
amenity                      0
bump                         0
crossing                     0
give_way                     0
junction                     0
no_exit                      0
railway                      0
roundabout                   0
station 

Let's tacke the rest one at a time

## City

In [20]:
df_cleaning[df_cleaning.city.isnull() == True]

Unnamed: 0,id,severity,start_time,end_time,start_lat,start_lng,distance(mi),street,side,city,county,zipcode,timezone,airport_code,weather_timestamp,temperature(f),wind_chill(f),humidity(%),pressure(in),visibility(mi),wind_direction,wind_speed(mph),precipitation(in),weather_condition,amenity,bump,crossing,give_way,junction,no_exit,railway,roundabout,station,stop,traffic_calming,traffic_signal,turning_loop,sunrise_sunset,civil_twilight,nautical_twilight,astronomical_twilight
2514991,A-2515032,4,2016-12-11 16:33:08,2016-12-11 22:33:08,41.03674,-73.67549,3.271,King St,R,,Westchester,6831,US/Eastern,KHPN,2016-12-11 16:25:00,30.9,24.3,57.0,30.41,3.0,SSW,6.9,,Haze,False,False,False,False,True,False,False,False,False,False,False,False,False,,,,
2537327,A-2537368,4,2016-11-17 06:51:47,2016-11-17 12:51:47,41.03674,-73.67549,0.069,King St,R,,Westchester,6831,US/Eastern,KHPN,2016-11-17 06:56:00,39.0,36.8,89.0,29.97,8.0,WNW,3.5,,Clear,False,False,False,False,True,False,False,False,False,False,False,False,False,,,,
2543327,A-2543368,2,2016-09-16 07:21:23,2016-09-16 13:21:23,41.03674,-73.67549,3.271,King St,R,,Westchester,6831,US/Eastern,KHPN,2016-09-16 06:56:00,55.0,,89.0,30.31,10.0,Calm,,,Partly Cloudy,False,False,False,False,True,False,False,False,False,False,False,False,False,,,,
2545886,A-2545927,2,2016-09-30 15:28:51,2016-09-30 21:28:51,41.03674,-73.67549,3.271,King St,R,,Westchester,6831,US/Eastern,KHPN,2016-09-30 15:56:00,57.9,,87.0,30.26,8.0,NE,12.7,0.01,Overcast,False,False,False,False,True,False,False,False,False,False,False,False,False,,,,
2555509,A-2555550,2,2016-08-14 07:20:11,2016-08-14 13:20:11,41.03674,-73.67549,3.271,King St,R,,Westchester,6831,US/Eastern,KHPN,2016-08-14 06:56:00,75.0,,90.0,29.91,10.0,Calm,,,Partly Cloudy,False,False,False,False,True,False,False,False,False,False,False,False,False,,,,
2557744,A-2557785,2,2016-08-22 06:51:45,2016-08-22 12:51:45,41.03674,-73.67549,3.271,King St,R,,Westchester,6831,US/Eastern,KHPN,2016-08-22 06:56:00,66.9,,84.0,29.88,10.0,NNW,8.1,,Clear,False,False,False,False,True,False,False,False,False,False,False,False,False,,,,
2561234,A-2561275,2,2016-09-01 09:14:56,2016-09-01 15:14:56,41.03674,-73.67549,3.271,King St,R,,Westchester,6831,US/Eastern,KHPN,2016-09-01 08:56:00,72.0,,91.0,29.84,5.0,Calm,,0.02,Light Rain,False,False,False,False,True,False,False,False,False,False,False,False,False,,,,
2573992,A-2574033,2,2016-07-25 08:53:43,2016-07-25 14:53:43,41.03674,-73.67549,3.271,King St,R,,Westchester,6831,US/Eastern,KHPN,2016-07-25 08:56:00,78.1,,76.0,29.93,10.0,South,3.5,0.0,Light Rain,False,False,False,False,True,False,False,False,False,False,False,False,False,,,,
2585008,A-2585049,2,2016-05-04 06:02:34,2016-05-04 12:02:34,41.03674,-73.67549,3.271,King St,R,,Westchester,6831,US/Eastern,KHPN,2016-05-04 05:56:00,48.0,,93.0,29.81,9.0,Calm,,,Overcast,False,False,False,False,True,False,False,False,False,False,False,False,False,,,,
2587025,A-2587066,4,2016-06-04 19:55:02,2016-06-05 01:55:02,41.03674,-73.67549,3.271,King St,R,,Westchester,6831,US/Eastern,KHPN,2016-06-04 19:56:00,73.0,,73.0,29.93,10.0,South,8.1,,Mostly Cloudy,False,False,False,False,True,False,False,False,False,False,False,False,False,,,,


After some googling, it was found that these observations were actually in Greenwich CT and not NY. Therefore, we will drop these observations

### Dropping Zipcode 06831 (CT not NY)

In [32]:
df_cleaning.zipcode.astype(str)

194255     10974
194256     10901
194257     10605
194258     10509
194259     10509
           ...  
3513232    11373
3513236    10707
3513237    11566
3513244    10305
3513245    11375
Name: zipcode, Length: 160815, dtype: object

In [33]:
df_cleaning = df_cleaning[df_cleaning.zipcode != '06831']

## Zipcode

In [23]:
df_cleaning[df_cleaning.zipcode.isnull() == True]

Unnamed: 0,id,severity,start_time,end_time,start_lat,start_lng,distance(mi),street,side,city,county,zipcode,timezone,airport_code,weather_timestamp,temperature(f),wind_chill(f),humidity(%),pressure(in),visibility(mi),wind_direction,wind_speed(mph),precipitation(in),weather_condition,amenity,bump,crossing,give_way,junction,no_exit,railway,roundabout,station,stop,traffic_calming,traffic_signal,turning_loop,sunrise_sunset,civil_twilight,nautical_twilight,astronomical_twilight
1460598,A-1460630,2,2018-12-13 06:32:57,2018-12-13 07:02:32,40.573544,-73.884964,0.0,Marine Parkway Gil Hodges Memorial Brg,R,Queens,Queens,,,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Day,Day
1863330,A-1863369,2,2018-05-02 09:32:50,2018-05-02 10:02:03,40.573486,-73.884911,0.0,Marine Parkway Gil Hodges Memorial Brg,R,Queens,Queens,,,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day


It looks at though these values are missing values for many different columns. For this reason, we will drop these two observations

### Dropping No Information Observations

In [27]:
df_cleaning = df_cleaning[df_cleaning.id != 'A-1460630']

In [28]:
df_cleaning = df_cleaning[df_cleaning.id != 'A-1863369']

## Airport Code (still have to change to correct airport code)

In [35]:
df_cleaning[df_cleaning.airport_code.isnull() == True]

Unnamed: 0,id,severity,start_time,end_time,start_lat,start_lng,distance(mi),street,side,city,county,zipcode,timezone,airport_code,weather_timestamp,temperature(f),wind_chill(f),humidity(%),pressure(in),visibility(mi),wind_direction,wind_speed(mph),precipitation(in),weather_condition,amenity,bump,crossing,give_way,junction,no_exit,railway,roundabout,station,stop,traffic_calming,traffic_signal,turning_loop,sunrise_sunset,civil_twilight,nautical_twilight,astronomical_twilight
515134,A-515140,2,2020-05-27 17:50:13,2020-05-27 18:34:29,41.304852,-74.029488,0.0,Palisades Interstate Pkwy N,R,Fort Montgomery,Orange,10922,US/Eastern,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
556373,A-556381,2,2020-06-26 08:03:32,2020-06-26 09:37:44,41.305256,-74.031013,0.0,Palisades Interstate Pkwy S,R,Fort Montgomery,Orange,10922,US/Eastern,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
605568,A-605576,2,2020-03-24 10:42:15,2020-03-24 12:14:21,44.60088,-73.673042,0.0,Peasleeville Rd,L,Schuyler Falls,Clinton,12985-2601,US/Eastern,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
632392,A-632403,2,2020-04-27 08:09:48,2020-04-27 08:39:29,41.305256,-74.031013,0.0,Palisades Interstate Pkwy S,R,Fort Montgomery,Orange,10922,US/Eastern,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
632395,A-632406,2,2020-04-27 08:16:40,2020-04-27 09:32:54,41.305256,-74.031013,0.0,Palisades Interstate Pkwy S,R,Fort Montgomery,Orange,10922,US/Eastern,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
633578,A-633589,2,2020-04-28 18:58:19,2020-04-28 19:57:52,41.305256,-74.031013,0.0,Palisades Interstate Pkwy S,R,Fort Montgomery,Orange,10922,US/Eastern,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
659076,A-659089,2,2020-01-15 10:11:32,2020-01-15 11:42:48,41.34914,-74.047173,0.0,Route 293,R,Fort Montgomery,Orange,10922,US/Eastern,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
677995,A-678016,2,2020-01-25 13:29:01,2020-01-25 14:28:42,41.30558,-74.035385,0.0,Long Mountain Traffic Cir,R,Fort Montgomery,Orange,10922,US/Eastern,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
691330,A-691352,2,2020-02-03 07:17:51,2020-02-03 08:47:42,43.22781,-75.753593,0.0,State Route 49,L,North Bay,Oneida,13123-7700,US/Eastern,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
707129,A-707151,2,2020-02-11 16:22:10,2020-02-11 17:07:03,43.480492,-73.825844,0.0,Athol Rd,R,Athol,Warren,12810-1902,US/Eastern,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day


Looks like most of the observations are in Orange. Let's check all observations to check if we can guess the airport code from other observations in Orange county

In [36]:
df_cleaning[df_cleaning.county == "Orange"]

Unnamed: 0,id,severity,start_time,end_time,start_lat,start_lng,distance(mi),street,side,city,county,zipcode,timezone,airport_code,weather_timestamp,temperature(f),wind_chill(f),humidity(%),pressure(in),visibility(mi),wind_direction,wind_speed(mph),precipitation(in),weather_condition,amenity,bump,crossing,give_way,junction,no_exit,railway,roundabout,station,stop,traffic_calming,traffic_signal,turning_loop,sunrise_sunset,civil_twilight,nautical_twilight,astronomical_twilight
194471,A-194473,2,2016-12-01 11:58:27,2016-12-01 12:27:58,41.318748,-74.143044,0.010,US-6 W,R,Monroe,Orange,10950,US/Eastern,KSWF,2016-12-01 11:45:00,48.2,,71.0,29.63,20.0,NW,13.8,,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
194749,A-194751,3,2016-12-04 15:53:06,2016-12-04 16:38:06,41.373512,-74.615234,0.010,I-84 E,R,Port Jervis,Orange,12771,US/Eastern,KFWN,2016-12-04 15:53:00,42.1,39.2,41.0,30.18,10.0,North,4.6,,Clear,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
194961,A-194963,3,2016-12-05 14:19:43,2016-12-05 15:04:28,41.509865,-74.074226,0.010,New York Trwy N,R,Newburgh,Orange,12550,US/Eastern,KSWF,2016-12-05 13:45:00,44.6,40.2,81.0,29.94,20.0,WSW,8.1,,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
195161,A-195163,3,2016-12-06 15:51:58,2016-12-06 16:36:36,41.311024,-74.122162,0.010,I-87 S,R,Central Valley,Orange,10917,US/Eastern,KSWF,2016-12-06 15:45:00,41.0,38.0,70.0,29.96,20.0,Variable,4.6,,Overcast,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
195197,A-195199,3,2016-12-06 18:48:05,2016-12-06 19:32:48,41.509865,-74.074226,0.010,New York Trwy N,R,Newburgh,Orange,12550,US/Eastern,KSWF,2016-12-06 18:45:00,41.0,,65.0,29.99,10.0,Calm,,,Light Drizzle,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3499617,A-3499781,4,2019-07-25 13:29:14,2019-07-25 13:58:46,41.237894,-74.177395,0.500,Route 17,R,Southfields,Orange,10975-3113,US/Eastern,KSWF,2019-07-25 13:45:00,81.0,81.0,42.0,29.62,10.0,CALM,0.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
3500134,A-3500298,3,2019-07-26 03:51:30,2019-07-26 04:20:10,41.302790,-74.132710,10.497,I-87 S,R,Harriman,Orange,10926,US/Eastern,KSWF,2019-07-26 03:45:00,63.0,63.0,100.0,29.72,10.0,CALM,0.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Day
3500895,A-3501059,2,2019-07-28 18:56:46,2019-07-28 19:25:48,41.520820,-74.010190,1.249,Beacon Brg E,R,Newburgh,Orange,12550,US/Eastern,KSWF,2019-07-28 18:45:00,86.0,86.0,58.0,29.49,10.0,W,12.0,0.0,Partly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
3500899,A-3501063,3,2019-07-28 17:12:32,2019-07-28 17:40:48,41.519461,-74.024800,0.321,Beacon Brg E,R,Newburgh,Orange,12550,US/Eastern,KSWF,2019-07-28 16:45:00,88.0,88.0,55.0,29.51,10.0,WSW,12.0,0.0,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day


**Looks to be KSWF is the airport code for Orange County**

What about for Warren?

In [37]:
df_cleaning[df_cleaning.county == "Warren"]

Unnamed: 0,id,severity,start_time,end_time,start_lat,start_lng,distance(mi),street,side,city,county,zipcode,timezone,airport_code,weather_timestamp,temperature(f),wind_chill(f),humidity(%),pressure(in),visibility(mi),wind_direction,wind_speed(mph),precipitation(in),weather_condition,amenity,bump,crossing,give_way,junction,no_exit,railway,roundabout,station,stop,traffic_calming,traffic_signal,turning_loop,sunrise_sunset,civil_twilight,nautical_twilight,astronomical_twilight
512235,A-512241,3,2020-05-24 21:20:08,2020-05-24 21:49:48,43.486835,-73.753616,0.000,Adirondack Northway S,R,Lake George,Warren,12845,US/Eastern,KGFL,2020-05-24 20:53:00,66.0,66.0,40.0,29.78,10.0,S,7.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Day,Day
512575,A-512581,2,2020-05-25 11:03:16,2020-05-25 12:18:57,43.438061,-73.705429,0.000,Lake Shore Dr,L,Lake George,Warren,12845-6433,US/Eastern,KGFL,2020-05-25 10:53:00,64.0,64.0,70.0,29.82,10.0,S,9.0,0.0,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
513702,A-513708,2,2020-05-26 11:18:50,2020-05-26 12:19:38,43.332191,-73.652840,0.000,Quaker Rd,L,Queensbury,Warren,12804-1729,US/Eastern,KGFL,2020-05-26 10:53:00,73.0,73.0,66.0,29.82,10.0,VAR,5.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,True,False,Day,Day,Day,Day
513703,A-513709,2,2020-05-26 11:25:56,2020-05-26 12:26:53,43.318192,-73.615967,0.000,Quaker Rd,R,Queensbury,Warren,12804-3959,US/Eastern,KGFL,2020-05-26 11:53:00,77.0,77.0,62.0,29.81,10.0,SE,3.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,True,False,Day,Day,Day,Day
516512,A-516518,2,2020-05-28 04:59:57,2020-05-28 05:29:44,43.403519,-73.713982,0.000,Adirondack Northway S,R,Lake George,Warren,12845,US/Eastern,KGFL,2020-05-28 04:53:00,67.0,67.0,84.0,29.79,10.0,S,3.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495689,A-3495853,4,2019-07-17 15:53:16,2019-07-17 16:22:45,43.296619,-73.679206,0.333,Adirondack Northway N,R,Queensbury,Warren,12804,US/Eastern,KGFL,2019-07-17 15:53:00,77.0,77.0,82.0,29.52,10.0,SW,7.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
3495690,A-3495854,2,2019-07-17 22:34:28,2019-07-17 23:02:21,43.296619,-73.679206,0.291,Adirondack Northway N,R,Queensbury,Warren,12804,US/Eastern,KGFL,2019-07-17 22:31:00,70.0,70.0,93.0,29.58,3.0,CALM,0.0,0.0,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Day
3500647,A-3500811,2,2019-07-27 22:20:45,2019-07-27 22:49:48,43.361900,-73.703950,0.466,Adirondack Northway S,R,Lake George,Warren,12845,US/Eastern,KGFL,2019-07-27 21:53:00,70.0,70.0,81.0,29.77,10.0,CALM,0.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Day
3500864,A-3501028,2,2019-07-28 20:00:31,2019-07-28 20:29:46,43.721800,-73.818500,0.226,Adirondack Northway N,R,Pottersville,Warren,12860,US/Eastern,KGFL,2019-07-28 19:53:00,81.0,81.0,58.0,29.61,10.0,W,3.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day


**Looks like the airport code is KGFL**

What about Clinton?

In [40]:
df_cleaning[df_cleaning.county == "Clinton"].airport_code.value_counts()

KPBG    60
KFSO    10
Name: airport_code, dtype: int64

Since there are two we checked for value counts of both. **Looks like the airport code is most likely KPBG**

What about Albany?

In [41]:
df_cleaning[df_cleaning.county == "Albany"]

Unnamed: 0,id,severity,start_time,end_time,start_lat,start_lng,distance(mi),street,side,city,county,zipcode,timezone,airport_code,weather_timestamp,temperature(f),wind_chill(f),humidity(%),pressure(in),visibility(mi),wind_direction,wind_speed(mph),precipitation(in),weather_condition,amenity,bump,crossing,give_way,junction,no_exit,railway,roundabout,station,stop,traffic_calming,traffic_signal,turning_loop,sunrise_sunset,civil_twilight,nautical_twilight,astronomical_twilight
194276,A-194278,3,2016-11-30 17:25:30,2016-11-30 18:10:18,42.728401,-73.796188,0.010,I-87 S,R,Albany,Albany,12211,US/Eastern,KALB,2016-11-30 17:17:00,51.1,,89.0,29.79,10.0,South,5.8,0.00,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Day,Day
194277,A-194279,3,2016-11-30 17:21:50,2016-11-30 18:06:29,42.643890,-73.750336,0.010,Dunn Memorial Brg,R,Albany,Albany,12202,US/Eastern,KALB,2016-11-30 17:17:00,51.1,,89.0,29.79,10.0,South,5.8,0.00,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Day,Day
194283,A-194285,3,2016-11-30 17:42:28,2016-11-30 18:27:10,42.667244,-73.733116,0.010,I-787 N,R,Albany,Albany,12204,US/Eastern,KALB,2016-11-30 17:47:00,51.8,,82.0,29.75,10.0,SSE,11.5,0.00,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Day
194285,A-194287,3,2016-11-30 17:58:43,2016-11-30 18:43:34,42.753929,-73.933632,0.010,I-90 E,R,Schenectady,Albany,12303,US/Eastern,KALB,2016-11-30 17:51:00,51.1,,86.0,29.76,10.0,South,13.8,0.00,Light Rain,False,False,False,False,True,False,False,False,False,False,False,False,False,Night,Night,Night,Day
194298,A-194300,2,2016-11-30 19:03:00,2016-11-30 19:47:45,42.762939,-73.762451,0.010,Adirondack Northway N,R,Latham,Albany,12110,US/Eastern,KALB,2016-11-30 18:58:00,50.0,,89.0,29.76,10.0,South,15.0,0.00,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3508397,A-3508561,2,2019-08-13 19:09:33,2019-08-13 19:37:31,42.790131,-73.759691,0.253,Adirondack Northway N,R,Cohoes,Albany,12047,US/Eastern,KALB,2019-08-13 18:59:00,72.0,72.0,81.0,29.45,10.0,ESE,3.0,0.00,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
3509520,A-3509684,3,2019-08-15 09:20:13,2019-08-15 09:49:20,42.700390,-73.844540,6.137,I-90 W,R,Albany,Albany,12205,US/Eastern,KALB,2019-08-15 08:51:00,68.0,68.0,63.0,29.74,10.0,N,8.0,0.00,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
3509522,A-3509686,3,2019-08-15 12:51:33,2019-08-15 13:20:17,42.630760,-73.779420,6.680,New York Trwy N,R,Albany,Albany,12209,US/Eastern,KALB,2019-08-15 12:51:00,78.0,78.0,54.0,29.72,10.0,VAR,5.0,0.00,Partly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
3510089,A-3510253,2,2019-08-16 17:30:28,2019-08-16 18:00:11,42.745650,-73.704920,0.749,NY-7 E,R,Watervliet,Albany,12189,US/Eastern,KALB,2019-08-16 17:51:00,74.0,74.0,82.0,29.68,7.0,CALM,0.0,0.02,Light Rain with Thunder,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day


**Looks like the airport code is KALB**

Finally, what about Oneida?

In [42]:
df_cleaning[df_cleaning.county == "Oneida"]

Unnamed: 0,id,severity,start_time,end_time,start_lat,start_lng,distance(mi),street,side,city,county,zipcode,timezone,airport_code,weather_timestamp,temperature(f),wind_chill(f),humidity(%),pressure(in),visibility(mi),wind_direction,wind_speed(mph),precipitation(in),weather_condition,amenity,bump,crossing,give_way,junction,no_exit,railway,roundabout,station,stop,traffic_calming,traffic_signal,turning_loop,sunrise_sunset,civil_twilight,nautical_twilight,astronomical_twilight
512582,A-512588,2,2020-05-25 07:38:43,2020-05-25 08:23:32,43.190578,-75.434113,0.000,State Route 233,R,Rome,Oneida,13440-1322,US/Eastern,KRME,2020-05-25 07:53:00,57.0,57.0,83.0,29.64,6.0,E,8.0,0.04,Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
512584,A-512590,2,2020-05-25 09:38:31,2020-05-25 10:38:51,43.235062,-75.450111,0.000,W Chestnut St,L,Rome,Oneida,13440-2619,US/Eastern,KRME,2020-05-25 09:53:00,60.0,60.0,86.0,29.61,10.0,E,6.0,0.02,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
513710,A-513716,2,2020-05-26 04:06:21,2020-05-26 04:36:06,43.203793,-75.409828,0.000,NY-49 E,R,Rome,Oneida,13440,US/Eastern,KRME,2020-05-26 03:53:00,66.0,66.0,75.0,29.58,10.0,E,8.0,0.00,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Day
513711,A-513717,2,2020-05-26 06:08:42,2020-05-26 07:28:10,42.992592,-75.245949,0.000,Holman City Rd,L,Sauquoit,Oneida,13456-3202,US/Eastern,KRME,2020-05-26 05:53:00,63.0,63.0,84.0,29.61,10.0,CALM,0.0,0.00,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
513712,A-513718,2,2020-05-26 07:37:25,2020-05-26 09:47:47,43.122204,-75.583748,0.000,I-90 E,R,Verona,Oneida,13478,US/Eastern,KRME,2020-05-26 07:53:00,68.0,68.0,76.0,29.63,10.0,E,7.0,0.00,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478009,A-3478173,4,2019-06-24 18:04:27,2019-06-24 18:34:06,42.912251,-75.353876,1.229,Route 20,R,Waterville,Oneida,13480-2209,US/Eastern,KRME,2019-06-24 17:53:00,78.0,78.0,46.0,29.27,10.0,SE,5.0,0.00,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
3478995,A-3479159,4,2019-06-26 06:34:50,2019-06-26 07:04:16,43.247850,-75.522950,0.692,Rome Taberg Rd,R,Rome,Oneida,13440-1760,US/Eastern,KRME,2019-06-26 06:53:00,62.0,62.0,90.0,29.49,10.0,ESE,6.0,0.00,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
3478996,A-3479160,4,2019-06-26 06:34:50,2019-06-26 07:04:16,43.254115,-75.533667,0.692,Rome Taberg Rd,R,Rome,Oneida,13440-1729,US/Eastern,KRME,2019-06-26 06:53:00,62.0,62.0,90.0,29.49,10.0,ESE,6.0,0.00,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
3508942,A-3509106,3,2019-08-14 15:16:10,2019-08-14 15:44:36,43.174207,-75.528148,0.014,State Route 365,R,Rome,Oneida,13440,US/Eastern,KRME,2019-08-14 14:53:00,79.0,79.0,50.0,29.35,10.0,NNW,10.0,0.00,Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day


**Looks like the airport code is KRME**

In [43]:
df_cleaning.isnull().sum()

id                           0
severity                     0
start_time                   0
end_time                     0
start_lat                    0
start_lng                    0
distance(mi)                 0
street                       0
side                         0
city                         0
county                       0
zipcode                      0
timezone                     0
airport_code                28
weather_timestamp          679
temperature(f)            1082
wind_chill(f)            68524
humidity(%)               1119
pressure(in)              2068
visibility(mi)            1528
wind_direction            1685
wind_speed(mph)          15049
precipitation(in)        85166
weather_condition         1539
amenity                      0
bump                         0
crossing                     0
give_way                     0
junction                     0
no_exit                      0
railway                      0
roundabout                   0
station 

## Weather TImestamp

In [44]:
df_cleaning.weather_timestamp

194255     2016-11-30 16:53:00
194256     2016-11-30 15:53:00
194257     2016-11-30 15:56:00
194258     2016-11-30 15:53:00
194259     2016-11-30 15:53:00
                  ...         
3513232    2019-08-23 16:51:00
3513236    2019-08-23 15:56:00
3513237    2019-08-23 16:53:00
3513244    2019-08-23 18:51:00
3513245    2019-08-23 17:51:00
Name: weather_timestamp, Length: 160788, dtype: object

This column might be hard to work with given the allotted timeframe, so we will just drop this column

In [45]:
df_cleaning = df_cleaning.drop(columns='weather_timestamp')

## Temperature

In [50]:
df_cleaning.corr()

Unnamed: 0,severity,start_lat,start_lng,distance(mi),temperature(f),wind_chill(f),humidity(%),pressure(in),visibility(mi),wind_speed(mph),precipitation(in),amenity,bump,crossing,give_way,junction,no_exit,railway,roundabout,station,stop,traffic_calming,traffic_signal,turning_loop
severity,1.0,-0.023361,0.038438,0.124686,0.025193,-0.029514,-0.048518,0.054198,0.012324,0.021544,0.010987,-0.046742,-0.002634,-0.050211,-0.023883,0.012966,0.003256,-0.011369,-0.004928,-0.042458,-0.036652,-0.002634,-0.11605,
start_lat,-0.023361,1.0,-0.735384,0.107602,-0.161028,-0.152648,0.114395,-0.266708,-0.044662,-0.028125,-0.076444,-0.059704,-0.003556,-0.002317,0.003129,-0.097546,-0.019401,-0.055241,0.007384,-0.10114,-0.001511,-0.003556,0.065602,
start_lng,0.038438,-0.735384,1.0,-0.081637,0.122048,0.117334,-0.105332,0.256139,0.06218,-0.043129,0.038286,0.021831,0.001268,-0.010143,0.017746,0.058293,0.010133,0.027163,-0.004874,0.057636,-0.006956,0.001268,-0.097948,
distance(mi),0.124686,0.107602,-0.081637,1.0,-0.015482,-0.039031,0.030624,-0.017953,0.001458,-0.014271,-0.007344,-0.029949,-0.000788,-0.042792,-0.008352,0.006348,-0.000686,-0.014617,-0.001013,-0.02811,-0.019691,-0.000788,-0.079186,
temperature(f),0.025193,-0.161028,0.122048,-0.015482,1.0,0.992302,-0.152738,-0.135008,0.21782,-0.113819,0.01261,0.007852,-0.001318,0.015274,0.00718,0.021589,0.004154,0.002168,-0.002973,0.006962,0.001739,-0.001318,0.019491,
wind_chill(f),-0.029514,-0.152648,0.117334,-0.039031,0.992302,1.0,-0.121838,-0.33376,0.220805,-0.206862,-0.038129,0.025762,0.003395,0.026339,0.007592,-0.004682,0.00818,0.012374,-0.00137,0.01188,-0.008443,0.003395,0.036497,
humidity(%),-0.048518,0.114395,-0.105332,0.030624,-0.152738,-0.121838,1.0,-0.145577,-0.489657,-0.186176,0.009718,-0.01839,0.002392,-0.020473,-0.006241,-0.02721,-0.007234,-0.008712,-0.000683,-0.014963,0.002101,0.002392,-0.03006,
pressure(in),0.054198,-0.266708,0.256139,-0.017953,-0.135008,-0.33376,-0.145577,1.0,0.136229,-0.102036,0.110554,0.015693,0.003073,0.013174,-0.000736,0.040504,0.007115,0.018062,0.00113,0.041066,0.009956,0.003073,-0.025013,
visibility(mi),0.012324,-0.044662,0.06218,0.001458,0.21782,0.220805,-0.489657,0.136229,1.0,-0.008918,-0.006173,0.005988,-0.004099,0.018343,0.008939,-0.008665,0.002172,0.003045,0.002075,0.005312,-0.001562,-0.004099,0.0294,
wind_speed(mph),0.021544,-0.028125,-0.043129,-0.014271,-0.113819,-0.206862,-0.186176,-0.102036,-0.008918,1.0,-0.006718,-0.031319,-0.000279,-0.011267,-0.003615,0.022046,-0.007907,-0.01749,-0.002982,-0.001551,-0.001429,-0.000279,-0.00974,


Since temperature is so highly correlated with windchill, we will just set the temperature to windchill for relavant observations

In [73]:
def temp_to_chill(col1, col2):
    
    '''
    col1 is the column that possesses NaN values you want to fill
    col2 is the column that is most correlated with col1
    '''
    
    id_list = list(df_cleaning.id)
    
    for element in id_list:
        observation = df_cleaning[df_cleaning.id == element]
        if observation[col1] == 'NaN':
            try:
                observation[col1] = observation[col2]
            except:
                pass
        else:
            continue
        

In order to be able to run the code above, we must change the datatype of temperature and wind chill to strings

In [72]:
df_cleaning['temperature(f)'].astype(str)
df_cleaning['wind_chill(f)'].astype(str)

194255      nan
194256      nan
194257      nan
194258      nan
194259      nan
           ... 
3513232    75.0
3513236    75.0
3513237    73.0
3513244    75.0
3513245    75.0
Name: wind_chill(f), Length: 160788, dtype: object

In [74]:
temp_to_chill('temperature(f)', 'wind_chill(f)')

  result = method(y)


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [69]:
df_cleaning.dtypes

id                        object
severity                   int64
start_time                object
end_time                  object
start_lat                float64
start_lng                float64
distance(mi)             float64
street                    object
side                      object
city                      object
county                    object
zipcode                   object
timezone                  object
airport_code              object
temperature(f)           float64
wind_chill(f)            float64
humidity(%)              float64
pressure(in)             float64
visibility(mi)           float64
wind_direction            object
wind_speed(mph)          float64
precipitation(in)        float64
weather_condition         object
amenity                     bool
bump                        bool
crossing                    bool
give_way                    bool
junction                    bool
no_exit                     bool
railway                     bool
roundabout

# Datatypes