## Part 1. Data Processing

In [55]:
import math

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
import sqlite3

In [2]:
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
# add other constants to refer to any local data, e.g. uber & weather
UBER_CSV = "uber_rides_sample.csv"

NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

### Calculating Distance

In [3]:
from math import radians, cos, sin, asin, sqrt, atan, atan2
# longitude and latitude of pickup and dropoff locations
def calculate_distance(Pickup_Longitude, Pickup_Latitude, Dropoff_Longitude, Dropoff_Latitude):
    # transfer numbers to radius
    Pickup_Longitude = radians(Pickup_Longitude)
    Pickup_Latitude = radians(Pickup_Latitude)
    Dropoff_Longitude = radians(Dropoff_Longitude)
    Dropoff_Latitude = radians(Dropoff_Latitude)
    # the formulas of calculate_distance
    dlon = Dropoff_Longitude - Pickup_Longitude
    dlat = Dropoff_Latitude - Pickup_Latitude
    a = sin(dlat/2)**2 + cos(Pickup_Latitude) * cos(Dropoff_Latitude) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    r = 6371 # The Average Radius of Earth
    Distance = c*r
    return Distance

### Processing Uber Data
#### Import Uber Data

In [4]:
df_Uber = pd.read_csv(r'C:\Users\siqie\Desktop\uber_rides_sample.csv') # Load the data for Uber Rides

#### Clean Uber data by deleting unused columns

In [5]:
# Delete the unused columns from Uber data
del df_Uber["Unnamed: 0"]   
del df_Uber["key"]

#### Normalize the column names

In [6]:
df_Uber.rename(columns={
                        'fare_amount':'Fare_Amount', 
                        'pickup_datetime':'Pickup_Date_Time', 
                        'pickup_longitude':'Pickup_Longitude', 
                        'pickup_latitude':'Pickup_Latitude',
                        'dropoff_longitude':'Dropoff_Longitude',
                        'dropoff_latitude':'Dropoff_Latitude',
                        'passenger_count':'Passenger_Count'
                       }, inplace = True)

#### Only include rides that's in our circled area

In [7]:
# Circle the areas that we are considering: ((40.560445, -74.242330), (40.908524, -73.717047))
df_Uber=df_Uber[df_Uber["Pickup_Longitude"] <= -73.717047]  
df_Uber=df_Uber[df_Uber["Pickup_Longitude"] >= -74.242330]
df_Uber=df_Uber[df_Uber["Pickup_Latitude"] >= 40.560445]
df_Uber=df_Uber[df_Uber["Pickup_Latitude"] <= 40.908524]
df_Uber=df_Uber[df_Uber["Dropoff_Longitude"] <= -73.717047]
df_Uber=df_Uber[df_Uber["Dropoff_Longitude"] >= -74.242330]
df_Uber=df_Uber[df_Uber["Dropoff_Latitude"] >= 40.560445]
df_Uber=df_Uber[df_Uber["Dropoff_Latitude"] <= 40.908524]
# Remove all the rows that has cordinates with 0, which are considered as invalid value. 
df_Uber=df_Uber[df_Uber.Pickup_Longitude != 0]
df_Uber=df_Uber[df_Uber.Pickup_Latitude != 0]
df_Uber=df_Uber[df_Uber.Dropoff_Longitude != 0]
df_Uber=df_Uber[df_Uber.Dropoff_Latitude != 0]

#### Seperate Date and Time

In [8]:
# Create a column called "Pickup_Time"
df_Uber['Pickup_Time'] = df_Uber['Pickup_Date_Time'].str.slice(11,19)
## Move Pickup_Time column next to Pickup_Date
Pickup_Time = df_Uber['Pickup_Time']
df_Uber.drop(labels=['Pickup_Time'], axis=1, inplace=True)
df_Uber.insert(2, "Pickup_Time", Pickup_Time)

# Rename "Pickup_Date_Time" to "Pickup_Date", then remove the time part
df_Uber.rename(columns={"Pickup_Date_Time":"Pickup_Date"}, inplace = True)
df_Uber['Pickup_Date'] = df_Uber['Pickup_Date'].str[:11]


# Create a column called "Day_of_Week" which shows on which day of the week the rides happened. 
df_Uber['Pickup_Date'] = pd.to_datetime(df_Uber['Pickup_Date'])
df_Uber['DayofWeek'] = df_Uber['Pickup_Date'].dt.day_name()

## Move DayofWeek column next to Pickup_Time
DayofWeek = df_Uber["DayofWeek"]
df_Uber.drop(labels=["DayofWeek"], axis=1, inplace=True)
df_Uber.insert(3, "DayofWeek", DayofWeek)


#### Insert a column for travel distance

In [122]:
# This formula takes about 8 minutes to generate the Distance column in df_Uber dataframe
# Since the data frames will be exported to cvs files, don't run this if not necessary!
df_Uber['Distance'] = [calculate_distance(**df_Uber[['Pickup_Latitude', 'Pickup_Longitude', 'Dropoff_Latitude', 'Dropoff_Longitude']].iloc[i].to_dict()) for i in range(df_Uber.shape[0])]
df_Uber

Unnamed: 0,Fare_Amount,Pickup_Date,Pickup_Time,DayofWeek,Pickup_Longitude,Pickup_Latitude,Dropoff_Longitude,Dropoff_Latitude,Passenger_Count,Distance
0,7.50,2015-05-07,19:52:06,Thursday,-73.999817,40.738354,-73.999512,40.723217,1,1.683323
1,7.70,2009-07-17,20:04:56,Friday,-73.994355,40.728225,-73.994710,40.750325,1,2.457590
2,12.90,2009-08-24,21:45:00,Monday,-74.005043,40.740770,-73.962565,40.772647,1,5.036377
3,5.30,2009-06-26,08:22:21,Friday,-73.976124,40.790844,-73.965316,40.803349,3,1.661683
4,16.00,2014-08-28,17:47:00,Thursday,-73.925023,40.744085,-73.973082,40.761247,5,4.475450
5,4.90,2011-02-12,02:27:09,Saturday,-73.969019,40.755910,-73.969019,40.755910,1,0.000000
6,24.50,2014-10-12,07:04:00,Sunday,-73.961447,40.693965,-73.871195,40.774297,5,11.731015
8,9.70,2012-02-17,09:32:00,Friday,-73.975187,40.745767,-74.002720,40.743537,1,2.332711
9,12.50,2012-03-29,19:06:00,Thursday,-74.001065,40.741787,-73.963040,40.775012,1,4.889417
10,6.50,2015-05-22,17:32:27,Friday,-73.974388,40.746952,-73.988586,40.729805,1,2.250858


#### Export the data as csv

In [126]:
df_Uber.to_csv(r'D:\IEOR4501Project\Uber_Data.csv', index = None, header=True)

### Processing Weather Data
#### Import weather data

In [9]:
# Loading all weather data from 2009 to 2015
df_Weather_2009 = pd.read_csv(r'C:\Users\siqie\Desktop\2009_weather.csv')
df_Weather_2010 = pd.read_csv(r'C:\Users\siqie\Desktop\2010_weather.csv')
df_Weather_2011 = pd.read_csv(r'C:\Users\siqie\Desktop\2011_weather.csv')
df_Weather_2012 = pd.read_csv(r'C:\Users\siqie\Desktop\2012_weather.csv')
df_Weather_2013 = pd.read_csv(r'C:\Users\siqie\Desktop\2013_weather.csv')
df_Weather_2014 = pd.read_csv(r'C:\Users\siqie\Desktop\2014_weather.csv')
df_Weather_2015 = pd.read_csv(r'C:\Users\siqie\Desktop\2015_weather.csv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


#### Remove unnecessary columns for each dataset


In [10]:
# Since there are too many columns we want to delete, we used "column number" method to clean the data
df_Weather_2009 = df_Weather_2009.drop(df_Weather_2009.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2010 = df_Weather_2010.drop(df_Weather_2010.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2011 = df_Weather_2011.drop(df_Weather_2011.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2012 = df_Weather_2012.drop(df_Weather_2012.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2013 = df_Weather_2013.drop(df_Weather_2013.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2014 = df_Weather_2014.drop(df_Weather_2014.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)
df_Weather_2015 = df_Weather_2015.drop(df_Weather_2015.columns[[0,2,3,4,5,7,8,12,13,14,15,16,17,18,19,20,21,28,29,30,31,33,34,35,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]], axis=1)



#### Put together hourly weather data

In [11]:
# Since we will only focus on the weather from 2012 to 2014, then we only clean up what we need.
## All the hourly data has REPORT_TYPE: FM-15, so we only select FM-15
df_Weather_2012_hourly = df_Weather_2012[df_Weather_2012['REPORT_TYPE'].str.contains('FM-15')]
df_Weather_2013_hourly = df_Weather_2013[df_Weather_2013['REPORT_TYPE'].str.contains('FM-15')]
df_Weather_2014_hourly = df_Weather_2014[df_Weather_2014['REPORT_TYPE'].str.contains('FM-15')]

## Put 3 hourly data into one hourly weather data
frames1 = [df_Weather_2012_hourly, df_Weather_2013_hourly, df_Weather_2014_hourly]
df_Weather_hourly = pd.concat(frames1)



#### Clean up hourly weather data

In [12]:
# Delete unnecessary columns from the table (Most of the daily data). 
df_Weather_hourly = df_Weather_hourly.drop(df_Weather_hourly.columns[[9,10,11,12,13,14,15,16,17]], axis=1)


# Split Datatime to Date and Time
df_Weather_hourly['TIME']=df_Weather_hourly['DATE'].str.slice(11,19) # Create a new column that only has time

# Move TIME column right after DATE column
TIME = df_Weather_hourly["TIME"]
df_Weather_hourly.drop(labels=["TIME"], axis=1, inplace=True)
df_Weather_hourly.insert(1, "TIME", TIME)


# Only leave the date information in "DATE" column
df_Weather_hourly['DATE'] = df_Weather_hourly['DATE'].str[:-9]
df_Weather_hourly

# Add a new column that indicates days of a week
df_Weather_hourly['DATE'] = pd.to_datetime(df_Weather_hourly['DATE'])
df_Weather_hourly['DayofWeek'] = df_Weather_hourly['DATE'].dt.day_name()

# Move DayofWeek column right after TIME
DayofWeek = df_Weather_hourly["DayofWeek"]
df_Weather_hourly.drop(labels=["DayofWeek"], axis=1, inplace=True)
df_Weather_hourly.insert(2, "DayofWeek", DayofWeek)

# Replace T in Precipitation column with 0 since T means trace amount and we would like to have it as an integer
df_Weather_hourly['HourlyPrecipitation'].replace({"T": 0}, inplace=True)

#### Put together daily weather data

In [13]:
# For daily data we will also focus on 2012 to 2014
## All the daily data has REPORT_TYPE: SOD, so we only select SOD data.
df_Weather_2012_daily = df_Weather_2012[df_Weather_2012['REPORT_TYPE'].str.contains('SOD')]
df_Weather_2013_daily = df_Weather_2013[df_Weather_2013['REPORT_TYPE'].str.contains('SOD')]
df_Weather_2014_daily = df_Weather_2014[df_Weather_2014['REPORT_TYPE'].str.contains('SOD')]

## Put 3 daily data into one hourly weather data
frames2 = [df_Weather_2012_daily, df_Weather_2013_daily, df_Weather_2014_daily]
df_Weather_daily = pd.concat(frames2)

#### Clean up daily weather data

In [14]:
# Delete unnecessary columns (hourly data)
df_Weather_daily = df_Weather_daily.drop(df_Weather_daily.columns[[2,3,4,5,6]], axis=1)
df_Weather_daily

# Change the column name of "DailySustainedWindSpeed" to "SustainedWindSpeed"
df_Weather_daily.rename(columns={"DailySustainedWindSpeed":"SustainedWindSpeed"}, inplace = True)

# Only leave date information in DATE column
df_Weather_daily['DATE'] = df_Weather_daily['DATE'].str[:-9]
df_Weather_daily

# Add a new column that indicates days of a week
df_Weather_daily['DATE'] = pd.to_datetime(df_Weather_daily['DATE'])
df_Weather_daily['DayofWeek'] = df_Weather_daily['DATE'].dt.day_name()


# Move DayofWeek column right after TIME
DayofWeek = df_Weather_daily["DayofWeek"]
df_Weather_daily.drop(labels=["DayofWeek"], axis=1, inplace=True)
df_Weather_daily.insert(1, "DayofWeek", DayofWeek)
df_Weather_daily

# Replace T in Precipitation column with 0 since T means trace amount and we would like to have it as an integer
df_Weather_daily['DailyPrecipitation'].replace({"T": 0}, inplace=True)



# The column names are already normalized, no further action needed.
# Based on what is needed from this data, there's no more invalid data point that must be removed. 

#### Generate values for hourly sustained wind speed

In [15]:
df_Weather_hourly = pd.merge(df_Weather_hourly, df_Weather_daily.loc[:,['DATE', 'SustainedWindSpeed']], how='left', on='DATE')
df_Weather_hourly

Unnamed: 0,DATE,TIME,DayofWeek,REPORT_TYPE,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,SustainedWindSpeed
0,2012-01-01,06:00:00,Sunday,FM-15,36,37,,,8.0,,,
1,2012-01-01,08:00:00,Sunday,FM-15,36,37,,,0.0,,,
2,2012-01-12,08:00:00,Thursday,FM-15,43,43,0.03,,0.0,,,
3,2012-01-17,15:00:00,Tuesday,FM-15,41,43,0.01,18.0,10.0,,,
4,2012-01-27,01:00:00,Friday,FM-15,39,41,,,3.0,,,
5,2012-01-27,03:00:00,Friday,FM-15,43,43,,,5.0,,,
6,2012-01-27,07:00:00,Friday,FM-15,48,48,0,,7.0,,,
7,2012-01-27,11:00:00,Friday,FM-15,55,55,0.01,,3.0,,,
8,2012-02-02,00:51:00,Thursday,FM-15,34,48,,,5.0,,,
9,2012-02-02,01:51:00,Thursday,FM-15,32,46,,,3.0,,,


#### Export Weather data into csv files

In [11]:
df_Weather_hourly.to_csv(r'D:\IEOR4501Project\Weather_hourly_Data.csv', index = None, header=True)
df_Weather_daily.to_csv(r'D:\IEOR4501Project\Weather_daily_Data.csv', index = None, header=True)

### Yellow Taxi Data

In [2]:
import math

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db

import re

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# add other constants to refer to any local data, e.g. uber & weather
UBER_CSV = "uber_rides_sample.csv"

NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [4]:
# Get UBER data and count the number of rows in each month

def load_and_clean_uber_data(csv_file):
    
    df = pd.read_csv (csv_file)
    
    del df["Unnamed: 0"]
    del df["key"]
    df = df[df["pickup_longitude"] <= -73.717047]
    df = df[df["pickup_longitude"] >= -74.242330]
    df = df[df["dropoff_longitude"] <= -73.717047]
    df = df[df["dropoff_longitude"] >= -74.242330]
    df = df[df["pickup_latitude"] <= 40.908524]
    df = df[df["pickup_latitude"] >= 40.560445]
    df = df[df["dropoff_latitude"] <= 40.908524]
    df = df[df["dropoff_latitude"] >= 40.560445]
    
    # df.to_csv('uber_data.csv')
    return df
    
    # raise NotImplemented()
    
def get_uber_data():
    
    uber_dataframe = load_and_clean_uber_data(r'/Users/sandyfan/Desktop/project/uber_rides_sample.csv')
    # uber_dataframe = load_and_clean_uber_data(r'/content/gdrive/My Drive//uber_rides_sample.csv')
    
    # add_distance_column(uber_dataframe)
    
    return uber_dataframe

In [5]:
def find_taxi_csv_urls():
    
    #content = requests.get('https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page')
    content = requests.get(TAXI_URL)
        
    soup = bs4.BeautifulSoup(content.text, 'html.parser')
    
    links = []
    for x in [a['href'] for a in soup.find_all('a')]:
        string = x
        pattern = re.compile(r"https://s3.amazonaws.com/nyc-tlc/trip\+\bdata/yell.*[\W_]csv")
        match = pattern.findall(string)
        if match != []:
            a = match[0]
            links.append(a)
            
    yellow_taxi = links[54:]   # get the links of all years that we need 
    
    
    return yellow_taxi
    
    # raise NotImplemented()

In [6]:
def get_month_taxi_data(url,year,month):
    
    taxi_link = pd.read_csv(url)
    #taxi_link  = pd.read_csv(url,on_bad_lines='skip')
    
    # removing trips that start and/or end outside
    #taxi_link = taxi_link[taxi_link["pickup_longitude"] <= -73.717047]
    #taxi_link = taxi_link[taxi_link["pickup_longitude"] >= -74.242330]
    #taxi_link = taxi_link[taxi_link["dropoff_longitude"] <= -73.717047]
    #taxi_link = taxi_link[taxi_link["dropoff_longitude"] >= -74.242330]
    #taxi_link = taxi_link[taxi_link["pickup_latitude"] <= 40.908524]
    #taxi_link = taxi_link[taxi_link["pickup_latitude"] >= 40.560445]
    #taxi_link = taxi_link[taxi_link["dropoff_latitude"] <= 40.908524]
    #taxi_link = taxi_link[taxi_link["dropoff_latitude"] >= 40.560445]

    # FOR 2014 dataset: different format of columns' names
    # print(taxi_link_2014.columns.tolist()) -- check columns' names
    taxi_link = taxi_link[taxi_link[" pickup_longitude"] <= -73.717047]
    taxi_link = taxi_link[taxi_link[" pickup_longitude"] >= -74.242330]
    taxi_link = taxi_link[taxi_link[" dropoff_longitude"] <= -73.717047]
    taxi_link = taxi_link[taxi_link[" dropoff_longitude"] >= -74.242330]
    taxi_link = taxi_link[taxi_link[" pickup_latitude"] <= 40.908524]
    taxi_link = taxi_link[taxi_link[" pickup_latitude"] >= 40.560445]
    taxi_link = taxi_link[taxi_link[" dropoff_latitude"] <= 40.908524]
    taxi_link = taxi_link[taxi_link[" dropoff_latitude"] >= 40.560445]

    #taxi_link = taxi_link[taxi_link["Start_Lon"] <= -73.717047]
    #taxi_link = taxi_link[taxi_link["Start_Lon"] >= -74.242330]
    #taxi_link = taxi_link[taxi_link["End_Lon"] <= -73.717047]
    #taxi_link = taxi_link[taxi_link["End_Lon"] >= -74.242330]
    #taxi_link = taxi_link[taxi_link["Start_Lat"] <= 40.908524]
    #taxi_link = taxi_link[taxi_link["Start_Lat"] >= 40.560445]
    #taxi_link = taxi_link[taxi_link["End_Lat"] <= 40.908524]
    #taxi_link = taxi_link[taxi_link["End_Lat"] >= 40.560445]
    
    sample_data = taxi_link.sample(month_name_values[year][month])
    
    return sample_data

### Get & Clean Sample Data
### Too slow to run -- run in colab -- DO NOT RUN THIS PART 

In [None]:
def get_and_clean_taxi_data_2015():      
    
    taxi_dataframes_2015 = []
    
    all_csv_urls = find_taxi_csv_urls()
    
    for i in range(6):    # set as 6 b/c there are only first six months uber data in 2015

        dataframe = get_month_taxi_data(all_csv_urls[i],2015,i+1)
        
        # add_distance_column(dataframe)
        
        taxi_dataframes_2015.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data_2015 = pd.concat(taxi_dataframes_2015)
    return taxi_data_2015

In [None]:
yellow_taxi_2015 = get_and_clean_taxi_data_2015()
yellow_taxi_2015.to_csv('yellow_2015.csv')

In [None]:
def get_and_clean_taxi_data_2014():      
    
    taxi_dataframes_2014 = []
    
    all_csv_urls = find_taxi_csv_urls()
    
    for i in range(12,24):

        dataframe = get_month_taxi_data(all_csv_urls[i],2014,i-11)
        
        # add_distance_column(dataframe)
        
        taxi_dataframes_2014.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data_2014 = pd.concat(taxi_dataframes_2014)
    return taxi_data_2014

In [None]:
yellow_taxi_2014 = get_and_clean_taxi_data_2014()
yellow_taxi_2014.to_csv('yellow_2014.csv')

In [None]:
def get_and_clean_taxi_data_2013():      
    
    taxi_dataframes_2013 = []
    
    all_csv_urls = find_taxi_csv_urls()
    
    for i in range(24,36):

        dataframe = get_month_taxi_data(all_csv_urls[i],2013,i-23)
        
        # add_distance_column(dataframe)
        
        taxi_dataframes_2013.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data_2013 = pd.concat(taxi_dataframes_2013)
    return taxi_data_2013

In [None]:
yellow_taxi_2013 = get_and_clean_taxi_data_2013()
yellow_taxi_2013.to_csv('yellow_2013.csv')

In [None]:
def get_and_clean_taxi_data_2012():      
    
    taxi_dataframes_2012 = []
    
    all_csv_urls = find_taxi_csv_urls()
    
    for i in range(36,48):

        dataframe = get_month_taxi_data(all_csv_urls[i],2012,i-35)
        
        # add_distance_column(dataframe)
        
        taxi_dataframes_2012.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data_2012 = pd.concat(taxi_dataframes_2012)
    
    return taxi_data_2012

In [None]:
yellow_taxi_2012 = get_and_clean_taxi_data_2012()
yellow_taxi_2012.to_csv('/content/gdrive/My Drive/yellow_2012.csv')

In [None]:
def get_and_clean_taxi_data_2011():      
    
    taxi_dataframes_2011 = []
    
    all_csv_urls = find_taxi_csv_urls()
    
    for i in range(48,60):

        dataframe = get_month_taxi_data(all_csv_urls[i],2011,i-47)
        
        # add_distance_column(dataframe)
        
        taxi_dataframes_2011.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data_2011 = pd.concat(taxi_dataframes_2011)
    
    return taxi_data_2011

In [None]:
yellow_taxi_2011 = get_and_clean_taxi_data_2011()
yellow_taxi_2011.to_csv('/content/gdrive/My Drive/yellow_2011.csv')

In [None]:
def get_and_clean_taxi_data_2010():      
    
    taxi_dataframes_2010 = []
    
    all_csv_urls = find_taxi_csv_urls()
    
    for i in range(60,72):

        dataframe = get_month_taxi_data(all_csv_urls[i],2010,i-59)
        
        # add_distance_column(dataframe)
        
        taxi_dataframes_2010.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data_2010 = pd.concat(taxi_dataframes_2010)
    return taxi_data_2010

In [None]:
yellow_taxi_2010 = get_and_clean_taxi_data_2010()
yellow_taxi_2010.to_csv('/content/gdrive/My Drive/yellow_2010.csv')

In [None]:
def get_and_clean_taxi_data_2009():      
    
    taxi_dataframes_2009 = []
    
    all_csv_urls = find_taxi_csv_urls()
    
    for i in range(72,84):

        dataframe = get_month_taxi_data(all_csv_urls[i],2009,i-71)
        
        # add_distance_column(dataframe)
        
        taxi_dataframes_2009.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data_2009 = pd.concat(taxi_dataframes_2009)
    return taxi_data_2009

In [None]:
yellow_taxi_2009 = get_and_clean_taxi_data_2009()
yellow_taxi_2009.to_csv('/content/gdrive/My Drive/yellow_2009.csv')

### After get all Taxi Sample Data  -- 195472 Rows
### Load saved data from local location

In [7]:
data_taxi_2009 = pd.read_csv (r'/Users/sandyfan/Desktop/project/yellow_2009.csv')
data_taxi_2010 = pd.read_csv (r'/Users/sandyfan/Desktop/project/yellow_2010.csv')
data_taxi_2011 = pd.read_csv (r'/Users/sandyfan/Desktop/project/yellow_2011.csv')
data_taxi_2012 = pd.read_csv (r'/Users/sandyfan/Desktop/project/yellow_2012.csv')
data_taxi_2013 = pd.read_csv (r'/Users/sandyfan/Desktop/project/yellow_2013.csv')
data_taxi_2014 = pd.read_csv (r'/Users/sandyfan/Desktop/project/yellow_2014.csv')
data_taxi_2015 = pd.read_csv (r'/Users/sandyfan/Desktop/project/yellow_2015.csv')

In [8]:
# Rename the columns' names of data_taxi_2009

data_taxi_2009 = data_taxi_2009.rename(columns={"vendor_name": "vendor_id", "Trip_Pickup_DateTime": "pickup_datetime",
                               "Trip_Dropoff_DateTime": "dropoff_datetime","Passenger_Count": "passenger_count",
                               "Trip_Distance": "trip_distance","Start_Lon": "pickup_longitude",
                               "Start_Lat": "pickup_latitude","Rate_Code": "rate_code",
                               "store_and_forward": "store_and_fwd_flag","End_Lon": "dropoff_longitude",
                               "End_Lat": "dropoff_latitude","Payment_Type": "payment_type",
                               "Fare_Amt": "fare_amount","Tip_Amt": "tip_amount","surcharge":"extra",
                               "Tolls_Amt": "tolls_amount","Total_Amt": "total_amount"})

In [25]:
def combine_and_clean_all_data():
    
    all_origin_data = [data_taxi_2009,data_taxi_2010,data_taxi_2011,data_taxi_2012,data_taxi_2013,
                       data_taxi_2014,data_taxi_2015]
    
    all_cleaned_data = []
    
    for data in all_origin_data:
        
        # Select 13 useful columns
        data_cleaned = pd.DataFrame(data, columns = ["pickup_datetime","dropoff_datetime",
                                                            "passenger_count","trip_distance","pickup_longitude",
                                                            "pickup_latitude","dropoff_longitude","dropoff_latitude",
                                                            "fare_amount","extra","tip_amount","tolls_amount",
                                                            "total_amount"])
        
        
        
        # add_distance_column(dataframe)
        
        all_cleaned_data.append(data_cleaned)
    
    # create one gigantic dataframe with data from every month needed
    taxi_data = pd.concat(all_cleaned_data)
    # save data
    taxi_data.to_csv('final_taxi_data.csv')
    
    return taxi_data

## Part 2: Storing Cleaned Data

In [None]:
uber_data =  pd.read_csv (r'/Users/sandyfan/Desktop/project/final_uber_data.csv')
taxi_data =  pd.read_csv (r'/Users/sandyfan/Desktop/project/final_taxi_data.csv')
hourly_data = pd.read_csv (r'/Users/sandyfan/Desktop/project/Weather_hourly_Data.csv')
daily_data = pd.read_csv (r'/Users/sandyfan/Desktop/project/Weather_daily_Data.csv')

In [92]:
# first create a connection - we'll create a new database

DATABASE_URL = "sqlite:///project_1.db"

engine = db.create_engine(DATABASE_URL)

In [93]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA = """
CREATE TABLE IF NOT EXISTS HourlyWeather
(
    HourlyWeather_id INTEGER PRIMARY KEY AUTOINCREMENT,
    DATE DATE,
    TIME TIME,
    DayofWeek TEXT,
    REPORT_TYPE TEXT,
    HourlyDewPointTemperature INTEGER,
    HourlyDryBulbTemperature INTEGER,
    HourlyPrecipitation REAL,
    HourlyWindGustSpeed INTEGER,
    HourlyWindSpeed INTEGER,
    Sunrise INTEGER,
    Sunset INTEGER,
    SustainedWindSpeed INTEGER
)
"""

DAILY_WEATHER_SCHEMA = """
CREATE TABLE IF NOT EXISTS DailyWeather
(
    DailyWeather_id INTEGER PRIMARY KEY AUTOINCREMENT,
    DATE DATE,
    DayofWeek TEXT,
    REPORT_TYPE TEXT,
    Sunrise INTEGER,
    Sunset INTEGER,
    DailyAverageDewPointTemperature INTEGER,
    DailyAverageDryBulbTemperature INTEGER,
    DailyAverageWindSpeed REAL,
    DailyMaximumDryBulbTemperature INTEGER,
    DailyMinimumDryBulbTemperature INTEGER,
    DailyPeakWindDirection INTEGER,
    DailyPeakWindSpeed INTEGER,
    DailyPrecipitation REAL,
    SustainedWindSpeed INTEGER
)
"""

TAXI_TRIPS_SCHEMA = """
CREATE TABLE IF NOT EXISTS TaxiRides
(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    pickup_datetime DATETIME,
    dropoff_datetime DATETIME,
    passenger_count INTEGER,
    trip_distance REAL,
    pickup_longitude REAL,
    pickup_latitude REAL,
    dropoff_longitude REAL,
    dropoff_latitude REAL,
    fare_amount REAL,
    extra REAL,
    tip_amount REAL,
    tolls_amount REAL,
    total_amount REAL

"""


UBER_TRIPS_SCHEMA = """
CREATE TABLE IF NOT EXISTS UberRides
(
    UberRides_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    Pickup_Date DATE,
    Pickup_Time TIME,
    DayofWeek TEXT,
    Pickup_Longitude REAL,
    Pickup_Latitude REAL,
    Dropoff_Longitude REAL,
    Dropoff_Latitude REAL,
    Passenger_Count INTEGER,
    Distance REAL
)
"""

In [94]:
# create that required schema.sql file

with open("schema.sql", "w") as f: 
    
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [95]:
# create the tables with the schema files

with engine.connect() as connection:
    pass


#### Create sqlalchemy database

In [2]:
from sqlalchemy import Column, Integer, String, DateTime, Date, Time, Float, DateTime
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base
os.chdir(r'D:\IEOR4501Project')

engine = db.create_engine(DATABASE_URL)

Base = declarative_base()

class HOURLY_WEATHER_SCHEMA(Base):
    __tablename__ = 'HOURLY_WEATHER_SCHEMA'

    
    HourlyWeather_id = Column(Integer, primary_key = True)
    DATE = Column(Date)
    TIME = Column(Time)
    DayofWeek = Column(String)
    REPORT_TYPE = Column(String)
    HourlyDewPointTemperature = Column(Integer)
    HourlyDryBulbTemperature = Column(Integer)
    HourlyPrecipitation = Column(Float)
    HourlyWindGustSpeed = Column(Integer)
    HourlyWindSpeed = Column(Integer)
    Sunrise = Column(Integer)
    Sunset = Column(Integer)
    SustainedWindSpeed = Column(Integer)


Base.metadata.create_all(engine)


file_name = "Weather_hourly_Data.csv"

df_Weather_Hourly = pd.read_csv(file_name)

df_Weather_Hourly.to_sql(con=engine, name = HOURLY_WEATHER_SCHEMA.__tablename__, if_exists='append', index=False)

session = sessionmaker()
session.configure(bind=engine)
s = session()

result1 = s.query(HOURLY_WEATHER_SCHEMA).limit(10).all()
result1
    


FileNotFoundError: [Errno 2] No such file or directory: 'D:\\IEOR4501Project'

In [99]:
from sqlalchemy import Column, Integer, String, DateTime, Date, Time, Float, DateTime
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base
os.chdir(r'D:\IEOR4501Project')

engine = db.create_engine(DATABASE_URL)

Base = declarative_base()

class TAXI_TRIPS_SCHEMA(Base):
   __tablename__ = 'TAXI_TRIPS_SCHEMA'

    
    id = Column(Integer, Primary_Key = True)
    pickup_datetime = Column(DateTime)
    dropoff_datetime = Column(DateTime)
    passenger_count = Column(Integer)
    trip_distance = Column(Float)
    pickup_longitude = Column(Float)
    pickup_latitude = Column(Float)
    dropoff_longitude = Column(Float)
    dropoff_latitude = Column(Float)
    fare_amount = Column(Float)
    extra = Column(Float)
    tip_amount = Column(Float)
    tolls_amount = Column(Float)
    total_amount = Column(Float)


Base.metadata.create_all(engine)


file_name = "final_taxi_data.csv"

TAXI_TRIPS_SCHEMA = pd.read_csv(file_name)

TAXI_TRIPS_SCHEMA.to_sql(con=engine, name = TAXI_TRIPS_SCHEMA.__tablename__, if_exists='append', index=False)

session = sessionmaker()
session.configure(bind=engine)
t = session()

result2 = t.query(TAXI_TRIPS_SCHEMA).limit(10).all()
result2

IndentationError: unexpected indent (<ipython-input-99-c391a93c189d>, line 15)

### Add Data to Database

In [None]:
from sqlalchemy.orm import sessionmaker

def dataframes_to_table(table_name):
    
    table_name["taxi_trips"].to_sql(name = "TaxiRides",con = engine)
    table_name["uber_trips"].to_sql(name = "UberRides",con = engine)
    table_name["hourly_weather"].to_sql(name = "HourlyWeather",con = engine)
    table_name["daily_weather"].to_sql(name = "DailyWeather",con = engine)

In [None]:
table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_data,
    "daily_weather": daily_data,
}

dataframes_to_table(table_name_to_dataframe)

In [None]:
!sqlite3 project_1.db < schema.sql