## Part 1. Data Processing

In [24]:
import math

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db

In [25]:
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
# add other constants to refer to any local data, e.g. uber & weather
UBER_CSV = "uber_rides_sample.csv"

NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

### Calculating Distance

In [26]:
from math import radians, cos, sin, asin, sqrt, atan, atan2
# longitude and latitude of pickup and dropoff locations
def calculate_distance(Pickup_Longitude, Pickup_Latitude, Dropoff_Longitude, Dropoff_Latitude):
    # transfer numbers to radius
    Pickup_Longitude = radians(Pickup_Longitude)
    Pickup_Latitude = radians(Pickup_Latitude)
    Dropoff_Longitude = radians(Dropoff_Longitude)
    Dropoff_Latitude = radians(Dropoff_Latitude)
    # the formulas of calculate_distance
    dlon = Dropoff_Longitude - Pickup_Longitude
    dlat = Dropoff_Latitude - Pickup_Latitude
    a = sin(dlat/2)**2 + cos(Pickup_Latitude) * cos(Dropoff_Latitude) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    r = 6371 # The Average Radius of Earth
    Distance = c*r
    return Distance

### Processing Uber Data
#### Import Uber Data

In [27]:
df_Uber = pd.read_csv(r'C:\Users\siqie\Desktop\uber_rides_sample.csv') # Load the data for Uber Rides

#### Clean Uber data by deleting unused columns

In [28]:
# Delete the unused columns from Uber data
del df_Uber["Unnamed: 0"]   
del df_Uber["key"]

#### Normalize the column names

In [29]:
df_Uber.rename(columns={
                        'fare_amount':'Fare_Amount', 
                        'pickup_datetime':'Pickup_Date_Time', 
                        'pickup_longitude':'Pickup_Longitude', 
                        'pickup_latitude':'Pickup_Latitude',
                        'dropoff_longitude':'Dropoff_Longitude',
                        'dropoff_latitude':'Dropoff_Latitude',
                        'passenger_count':'Passenger_Count'
                       }, inplace = True)

#### Only include rides that's in our circled area

In [30]:
# Circle the areas that we are considering: ((40.560445, -74.242330), (40.908524, -73.717047))
df_Uber=df_Uber[df_Uber["Pickup_Longitude"] <= -73.717047]  
df_Uber=df_Uber[df_Uber["Pickup_Longitude"] >= -74.242330]
df_Uber=df_Uber[df_Uber["Pickup_Latitude"] >= 40.560445]
df_Uber=df_Uber[df_Uber["Pickup_Latitude"] <= 40.908524]
df_Uber=df_Uber[df_Uber["Dropoff_Longitude"] <= -73.717047]
df_Uber=df_Uber[df_Uber["Dropoff_Longitude"] >= -74.242330]
df_Uber=df_Uber[df_Uber["Dropoff_Latitude"] >= 40.560445]
df_Uber=df_Uber[df_Uber["Dropoff_Latitude"] <= 40.908524]
# Remove all the rows that has cordinates with 0, which are considered as invalid value. 
df_Uber=df_Uber[df_Uber.Pickup_Longitude != 0]
df_Uber=df_Uber[df_Uber.Pickup_Latitude != 0]
df_Uber=df_Uber[df_Uber.Dropoff_Longitude != 0]
df_Uber=df_Uber[df_Uber.Dropoff_Latitude != 0]

#### Make Uber data a sample of 10,000 rides 

In [31]:
df_Uber = df_Uber.sample(n = 10000)

In [32]:
df_Uber

Unnamed: 0,Fare_Amount,Pickup_Date_Time,Pickup_Longitude,Pickup_Latitude,Dropoff_Longitude,Dropoff_Latitude,Passenger_Count
114380,7.30,2012-06-09 09:20:18 UTC,-74.015751,40.711285,-73.997996,40.716852,3
163354,12.90,2010-08-17 11:01:00 UTC,-73.990003,40.756530,-73.999115,40.726455,5
90063,19.50,2012-10-24 15:37:33 UTC,-73.996725,40.753029,-74.007264,40.707449,1
145828,12.10,2011-06-04 06:30:58 UTC,-73.982555,40.731299,-73.943402,40.708326,1
136922,5.50,2014-11-08 10:08:03 UTC,-73.973544,40.794967,-73.973126,40.785201,1
104774,16.10,2010-07-24 19:02:00 UTC,-74.003812,40.725432,-73.972303,40.782438,2
75123,9.50,2013-03-14 19:39:39 UTC,-73.979273,40.744381,-73.955213,40.769116,1
67721,8.10,2011-02-01 11:43:00 UTC,-73.989360,40.758510,-73.981727,40.752485,2
10683,13.30,2012-08-24 16:42:20 UTC,-73.986428,40.743365,-74.000053,40.761594,1
193060,10.90,2011-07-11 17:33:00 UTC,-73.980142,40.751867,-74.003455,40.743637,1


#### Insert a column for travel distance

### Processing Weather Data
#### Import weather data

In [33]:
# Loading all weather data from 2009 to 2015
df_Weather_2009 = pd.read_csv(r'C:\Users\siqie\Desktop\2009_weather.csv')
df_Weather_2010 = pd.read_csv(r'C:\Users\siqie\Desktop\2010_weather.csv')
df_Weather_2011 = pd.read_csv(r'C:\Users\siqie\Desktop\2011_weather.csv')
df_Weather_2012 = pd.read_csv(r'C:\Users\siqie\Desktop\2012_weather.csv')
df_Weather_2013 = pd.read_csv(r'C:\Users\siqie\Desktop\2013_weather.csv')
df_Weather_2014 = pd.read_csv(r'C:\Users\siqie\Desktop\2014_weather.csv')
df_Weather_2015 = pd.read_csv(r'C:\Users\siqie\Desktop\2015_weather.csv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


#### Remove unnecessary columns for each dataset

In [34]:
# Since there are too many columns we want to delete, we used "column number" method to clean the data
df_Weather_2009 = df_Weather_2009.drop(df_Weather_2009.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2010 = df_Weather_2010.drop(df_Weather_2010.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2011 = df_Weather_2011.drop(df_Weather_2011.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2012 = df_Weather_2012.drop(df_Weather_2012.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2013 = df_Weather_2013.drop(df_Weather_2013.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2014 = df_Weather_2014.drop(df_Weather_2014.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2015 = df_Weather_2015.drop(df_Weather_2015.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)

#### Put together hourly weather data

In [35]:
# Since we will only focus on the weather from 2012 to 2014, then we only clean up what we need.
## All the hourly data has REPORT_TYPE: FM-15, so we only select FM-15
df_Weather_2012_hourly = df_Weather_2012[df_Weather_2012['REPORT_TYPE'].str.contains('FM-15')]
df_Weather_2013_hourly = df_Weather_2013[df_Weather_2013['REPORT_TYPE'].str.contains('FM-15')]
df_Weather_2014_hourly = df_Weather_2014[df_Weather_2014['REPORT_TYPE'].str.contains('FM-15')]

## Put 3 hourly data into one hourly weather data
frames1 = [df_Weather_2012_hourly, df_Weather_2013_hourly, df_Weather_2014_hourly]
df_Weather_hourly = pd.concat(frames1)


## Later we will need Hourly Sustained Wind Speed information, which we will insert a new column for it. 

#### Clean up hourly weather data

In [36]:
# Delete unnecessary columns from the table (Most of the daily data). 
df_Weather_hourly = df_Weather_hourly.drop(df_Weather_hourly.columns[[9,10,11,12,13,14,15,16,17]], axis=1)


# Split Datatime to Date and Time
df_Weather_hourly['TIME']=df_Weather_hourly['DATE'].str.slice(11,19) # Create a new column that only has time

# Move TIME column right after DATE column
TIME = df_Weather_hourly["TIME"]
df_Weather_hourly.drop(labels=["TIME"], axis=1, inplace=True)
df_Weather_hourly.insert(1, "TIME", TIME)


# Only leave the date information in "DATE" column
df_Weather_hourly['DATE'] = df_Weather_hourly['DATE'].str[:-9]
df_Weather_hourly

# Add a new column that indicates days of a week
df_Weather_hourly['DATE'] = pd.to_datetime(df_Weather_hourly['DATE'])
df_Weather_hourly['DayofWeek'] = df_Weather_hourly['DATE'].dt.day_name()

# Move DayofWeek column right after TIME
DayofWeek = df_Weather_hourly["DayofWeek"]
df_Weather_hourly.drop(labels=["DayofWeek"], axis=1, inplace=True)
df_Weather_hourly.insert(2, "DayofWeek", DayofWeek)

#### Put together daily weather data

In [37]:
# For daily data we will also focus on 2012 to 2014
## All the daily data has REPORT_TYPE: SOD, so we only select SOD data.
df_Weather_2012_daily = df_Weather_2012[df_Weather_2012['REPORT_TYPE'].str.contains('SOD')]
df_Weather_2013_daily = df_Weather_2013[df_Weather_2013['REPORT_TYPE'].str.contains('SOD')]
df_Weather_2014_daily = df_Weather_2014[df_Weather_2014['REPORT_TYPE'].str.contains('SOD')]

## Put 3 daily data into one hourly weather data
frames2 = [df_Weather_2012_daily, df_Weather_2013_daily, df_Weather_2014_daily]
df_Weather_daily = pd.concat(frames2)

#### Clean up daily weather data

In [38]:
# Delete unnecessary columns (hourly data)
df_Weather_daily = df_Weather_daily.drop(df_Weather_daily.columns[[2,3,4,5,6]], axis=1)
df_Weather_daily

# Change the column name of "DailySustainedWindSpeed" to "SustainedWindSpeed"
df_Weather_daily.rename(columns={"DailySustainedWindSpeed":"SustainedWindSpeed"}, inplace = True)

# Only leave date information in DATE column
df_Weather_daily['DATE'] = df_Weather_daily['DATE'].str[:-9]
df_Weather_daily

# Add a new column that indicates days of a week
df_Weather_daily['DATE'] = pd.to_datetime(df_Weather_daily['DATE'])
df_Weather_daily['DayofWeek'] = df_Weather_daily['DATE'].dt.day_name()


# Move DayofWeek column right after TIME
DayofWeek = df_Weather_daily["DayofWeek"]
df_Weather_daily.drop(labels=["DayofWeek"], axis=1, inplace=True)
df_Weather_daily.insert(1, "DayofWeek", DayofWeek)
df_Weather_daily

# The column names are already normalized, no further action needed.
# Based on what is needed from this data, there's no more invalid data point that must be removed. 

Unnamed: 0,DATE,DayofWeek,REPORT_TYPE,Sunrise,Sunset,DailyAverageDewPointTemperature,DailyAverageDryBulbTemperature,DailyAverageWindSpeed,DailyMaximumDryBulbTemperature,DailyMinimumDryBulbTemperature,DailyPeakWindDirection,DailyPeakWindSpeed,DailyPrecipitation,SustainedWindSpeed
30,2012-01-01,Sunday,SOD,720.0,1639.0,,,,,,,,,
244,2012-01-10,Tuesday,SOD,720.0,1648.0,,,,,,,,,
274,2012-01-11,Wednesday,SOD,720.0,1649.0,,,,,,,,,
326,2012-01-12,Thursday,SOD,719.0,1650.0,,,,,,,,,
362,2012-01-13,Friday,SOD,719.0,1651.0,,,,,,,,,
435,2012-01-16,Monday,SOD,718.0,1654.0,,,,,,,,,
472,2012-01-17,Tuesday,SOD,717.0,1655.0,,,,,,,,,
590,2012-01-21,Saturday,SOD,715.0,1700.0,,,,,,,,,
669,2012-01-23,Monday,SOD,714.0,1702.0,,,,,,,,,
754,2012-01-26,Thursday,SOD,712.0,1706.0,,,,,,,,,


#### Generate values for hourly sustained wind speed

In [39]:
df_Weather_hourly = pd.merge(df_Weather_hourly, df_Weather_daily.loc[:,['DATE', 'SustainedWindSpeed']], how='left', on='DATE')
df_Weather_hourly

Unnamed: 0,DATE,TIME,DayofWeek,REPORT_TYPE,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,SustainedWindSpeed
0,2012-01-01,06:00:00,Sunday,FM-15,36,37,,,8.0,,,
1,2012-01-01,08:00:00,Sunday,FM-15,36,37,,,0.0,,,
2,2012-01-12,08:00:00,Thursday,FM-15,43,43,0.03,,0.0,,,
3,2012-01-17,15:00:00,Tuesday,FM-15,41,43,0.01,18.0,10.0,,,
4,2012-01-27,01:00:00,Friday,FM-15,39,41,,,3.0,,,
5,2012-01-27,03:00:00,Friday,FM-15,43,43,,,5.0,,,
6,2012-01-27,07:00:00,Friday,FM-15,48,48,T,,7.0,,,
7,2012-01-27,11:00:00,Friday,FM-15,55,55,0.01,,3.0,,,
8,2012-02-02,00:51:00,Thursday,FM-15,34,48,,,5.0,,,
9,2012-02-02,01:51:00,Thursday,FM-15,32,46,,,3.0,,,
