# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1uAUJGEUzfNj6OsWNAimnYCw7eKaHhMUfU1MTj9YwYw4/edit?usp=sharing), [grading rubric](https://docs.google.com/document/d/1hKuRWqFcIdhOkow3Nljcm7PXzIkoa9c_aHkMKZDxWa0/edit?usp=sharing)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_**All code below should be consider "pseudo-code" - not functional by itself, and only an outline to help you with your own approach.**_

## Project Setup

In [1]:
# all import statements needed for the project, for example:

import math
import os

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
import math
import geopandas as gpd

In [6]:
# any constants you might need; some have been added for you, and 
# some you need to fill in

TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

TAXI_ZONES_DIR = "./taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
UBER_CSV = "uber_rides_sample.csv"
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [7]:
# Make sure the QUERY_DIRECTORY exists
try:
    os.mkdir(QUERY_DIRECTORY)
except Exception as e:
    if e.errno == 17:
        # the directory already exists
        pass
    else:
        raise

## Part 1: Data Preprocessing

### Load Taxi Zones

In [10]:
def get_and_clean_taxi_zones():
    '''
    This function is used for loading the taxi_zones.shp file and cleaning the file,
    including adding longitude and latitude columns by looking up the polygon,
    removing the duplicate LocationID, setting a new index, and removing useless columns
    '''
    
    #load taxi_zones shapefile
    taxi_zones_shapefile = gpd.read_file('./taxi_zones.shp')
    shp = taxi_zones_shapefile.to_crs(4326)
    
    #look up and get longitude column
    shp["longitude"]=shp.centroid.x
    #look up and get latitude column
    shp["latitude"]=shp.centroid.y
    
    #check if there are any duplicate values in LocationID 
    if shp.loc[shp['OBJECTID'] != shp['LocationID']]['LocationID'].count() != 0:
        #manually correct the duplicate values of LocationID
        shp.loc[shp['OBJECTID'] == 57, 'LocationID'] = 57
        shp.loc[shp['OBJECTID'] == 104, 'LocationID'] = 104
        shp.loc[shp['OBJECTID'] == 105, 'LocationID'] = 105
    
    #check if the index is 'OBJECTID'
    if shp.index.name != 'OBJECTID':
        #change the index to 'OBJECTID'
        shp = shp.set_index('OBJECTID')
    
    #only keep useful columns
    shp = shp[['LocationID','longitude','latitude']]
    
    #after cleaning, return the shp file
    return shp

In [11]:
get_and_clean_taxi_zones()

DriverError: ./taxi_zones.shp: No such file or directory

In [None]:
def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    raise NotImplementedError()

### Calculate distance

In [None]:
def calculate_distance_with_coords(from_coord, to_coord):
    
    R = 6371.0
    
    pickup_lat = math.radians(from_coord['pickup_latitude'])
    pickup_lon = math.radians(from_coord['pickup_longitude'])
    dropoff_lat = math.radians(to_coord['dropoff_latitude'])
    dropoff_lon = math.radians(to_coord['dropoff_longitude'])
    
    diff_lon = dropoff_lon - pickup_lon
    diff_lat = dropoff_lat - pickup_lat
    
    a = math.sin(diff_lat/2)**2 + math.cos(pickup_lat) * math.cos(dropoff_lat) * math.sin(diff_lon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    distance = R * c
    
    return distance

In [None]:
def add_distance_column(df):
    
    df['distance'] = df.apply(lambda x: calculate_distance_with_coords(
                            x[['pickup_latitude','pickup_longitude']], 
                            x[['dropoff_latitude','dropoff_longitude']]), axis=1)
    
    return df

### Process Taxi Data

In [None]:
def get_all_yellow_taxi_urls(TAXI_URL):
    """
    This function, get_parquet_files(), is used for collecting all urls of 
    yellow taxi data from 2009-01 to 2015-06 and save them into a list.
    """
    import re #using the re module
    from bs4 import BeautifulSoup
    import requests
    
    response = requests.get(TAXI_URL)
    soup = BeautifulSoup(response.content, 'html.parser')

    #write a regular expression to help pull out the desired links for Yellow Taxi Parquet files
    pattern = r"yellow_tripdata_(2009-(0[1-9]|1[0-2])|201[0-4]-(0[1-9]|1[0-2])|2015-(0[1-6]))"
    parquet_links_list = []
    
    for url in [a['href'] for a in soup.find_all('a')]:
        if re.search(pattern, url):
            parquet_links_list.append(url)
            
    return parquet_links_list

In [None]:
def get_and_clean_month(taxi_data_url):
    
    #give an unique file name by slicing the corresponding month 
    filename = taxi_data_url.split('/')[-1] 
    
    zones_file = get_and_clean_taxi_zones()
    
    #check if the file is already downloaded to the current path
    if not os.path.exists(filename):
        response = requests.get(taxi_data_url, stream=True)
        #download the monthly parquet file from correspond taxi url
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024): 
                if chunk:
                    f.write(chunk)
    
    
    #clean the parquet file for corresponding month
    if '2009' in filename:
        
        df = pd.read_parquet(filename)
        
        df = df[['Trip_Pickup_DateTime','Tip_Amt','Start_Lon','Start_Lat','End_Lon','End_Lat']]
        
        df.rename(columns={'Trip_Pickup_DateTime':'pickup_datetime','Tip_Amt':'tip_amount',
                           'Start_Lon':'pickup_longitude','Start_Lat':'pickup_latitude',
                           'End_Lon':'dropoff_longitude','End_Lat':'dropoff_latitude'}, inplace=True)
    
    elif '2010' in filename:
        
        df = pd.read_parquet(filename)
        df = df[['pickup_datetime','tip_amount','pickup_longitude',
                 'pickup_latitude','dropoff_longitude','dropoff_latitude']]
    
    elif '2011' in filename or '2012' in filename or '2013' in filename or '2014' in filename or '2015' in filename:
        
        df = pd.read_parquet(filename)
        df = df[['tpep_pickup_datetime','PULocationID','DOLocationID','tip_amount']]
        
        #zones_file = get_and_clean_taxi_zones()
        
        df = pd.merge(df, zones_file, left_on="PULocationID", right_on="LocationID", how="left")
        df = pd.merge(df, zones_file, left_on="DOLocationID", right_on="LocationID", 
                             suffixes=("_PU", "_DO"), how="left")
        
        df = df[['tpep_pickup_datetime','tip_amount',
                 'longitude_PU','latitude_PU','longitude_DO','latitude_DO']]
        
        df.rename(columns={'tpep_pickup_datetime':'pickup_datetime',
                           'longitude_PU':'pickup_longitude','latitude_PU':'pickup_latitude',
                           'longitude_DO':'dropoff_longitude','latitude_DO':'dropoff_latitude'},inplace=True)
    
    df = df[df['tip_amount']>=0]
    
    #removing all data outside of the New York Box range
    
    #for pickup_longitude and pickup_latitude
    df = df[(df['pickup_longitude']>=NEW_YORK_BOX_COORDS[0][1])
            &(df['pickup_longitude']<=NEW_YORK_BOX_COORDS[1][1])
            &(df['pickup_latitude']>=NEW_YORK_BOX_COORDS[0][0])
            &(df['pickup_latitude']<=NEW_YORK_BOX_COORDS[1][0])]
    
    #for dropoff_longitude and dropoff_latitude
    df = df[(df['dropoff_longitude']>=NEW_YORK_BOX_COORDS[0][1])
            &(df['dropoff_longitude']<=NEW_YORK_BOX_COORDS[1][1])
            &(df['dropoff_latitude']>=NEW_YORK_BOX_COORDS[0][0])
            &(df['dropoff_latitude']<=NEW_YORK_BOX_COORDS[1][0])]
    
    #remove all trips with the same pickup_longitude and dropoff_longitude
    #and the same pickup_latitude and dropoff_latitude
    #which may result distance = 0 
    #filter out these values
    df = df[(df['pickup_longitude']!=df['dropoff_longitude'])
           &(df['pickup_latitude']!=df['dropoff_latitude'])]
    
    #change the type of "pickup_datetime" into python datetime
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    
    #make sure all the data types are the desired ones
    df = df.astype({'tip_amount':'float64',
                    'pickup_longitude':'float64','pickup_latitude':'float64',
                    'dropoff_longitude':'float64','dropoff_latitude':'float64'})
    
    #since we need to match the amount of data from the taxi file with the one from the uber 
    #as we know that the cleaned uber data is about 195,000, 
    #we have total 78 months from 2009-01 to 2015-06, so 195,000/78 is about 2500 rows for each month
    df = df.sample(2500)
    
    #return the cleaned taxi data frame
    return df

In [None]:
def get_and_clean_taxi_data(parquet_urls):
    
    all_taxi_dataframes = []
    
    for parquet_url in parquet_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_month(parquet_url)
        add_distance_column(dataframe)
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function
        
        all_taxi_dataframes.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data = pd.contact(all_taxi_dataframes)
    
    return taxi_data

In [None]:
def get_taxi_data():
    
    all_urls = get_all_yellow_taxi_urls(TAXI_URL)
    taxi_data = get_and_clean_taxi_data(all_urls)
    
    return taxi_data

In [None]:
taxi_data = get_taxi_data()

In [None]:
taxi_data.head()

### Processing Uber Data

In [None]:
def load_and_clean_uber_data():
    
    #load in the uber csv file as a data frame by pandas, and set the first column as index
    df_uber = pd.read_csv(UBER_CSV, index_col=0)
    
    #only keeping useful columns
    df_uber = df_uber[['pickup_datetime','pickup_longitude','pickup_latitude',
                       'dropoff_longitude','dropoff_latitude','passenger_count' ]]
    
    #removing the rows whose passenger count is less than or equal to 0, which is unpractical
    df_uber = df_uber[df_uber['passenger_count']>0]
    
    #removing all data outside of the New York Box range
    
    #for pickup_longitude and pickup_latitude
    df_uber = df_uber[(df_uber['pickup_longitude']>=NEW_YORK_BOX_COORDS[0][1])
                  &(df_uber['pickup_longitude']<=NEW_YORK_BOX_COORDS[1][1])
                  &(df_uber['pickup_latitude']>=NEW_YORK_BOX_COORDS[0][0])
                  &(df_uber['pickup_latitude']<=NEW_YORK_BOX_COORDS[1][0])]
    
    #for dropoff_longitude and dropoff_latitude
    df_uber = df_uber[(df_uber['dropoff_longitude']>=NEW_YORK_BOX_COORDS[0][1])
                  &(df_uber['dropoff_longitude']<=NEW_YORK_BOX_COORDS[1][1])
                  &(df_uber['dropoff_latitude']>=NEW_YORK_BOX_COORDS[0][0])
                  &(df_uber['dropoff_latitude']<=NEW_YORK_BOX_COORDS[1][0])]
    
    #filter out distance = 0 
    df_uber = df_uber[(df_uber['pickup_longitude']!=df_uber['dropoff_longitude'])
                      &(df_uber['pickup_latitude']!=df_uber['dropoff_latitude'])]
    
    #change the type of "pickup_datetime" into python datetime
    df_uber['pickup_datetime'] = pd.to_datetime(df_uber['pickup_datetime'])
    
    #change all the data types into desired ones
    df_uber = df_uber.astype({'pickup_longitude':'float64','pickup_latitude':'float64',
                              'dropoff_longitude':'float64','dropoff_latitude':'float64',
                              'passenger_count':'int64'})
    
    #return the cleaned uber data frame
    return df_uber

In [None]:
def get_uber_data():
    uber_dataframe = load_and_clean_uber_data()
    add_distance_column(uber_dataframe)
    return uber_dataframe

In [None]:
uber_data = get_uber_data()

In [None]:
uber_data.head()

### Processing Weather Data

In [None]:
def get_all_weather_csvs():
    
    #initiate with an empty list 
    df_weathers = []
    
    #iterate the year from 2009 to 2015
    for year in range(2009, 2016, 1):
        df_weather = str(year) + '_weather.csv'
        df_weathers.append(df_weather)
    
    #return a list with all weathers files names from 2009 to 2015
    return df_weathers

In [None]:
def get_and_clean_weather_data_hourly(csv_file):
    
    #load a certain weather csv file into data frame
    df_weather = pd.read_csv(csv_file)
    
    #transform the data type of column 'Date' into a datetime format
    df_weather['DATE'] = pd.to_datetime(df_weather['DATE'])
    
    #split hour and minute of a datetime and create corresponding columns respectively
    df_weather['timestamp_hour'] = df_weather['DATE'].apply(lambda x:str(x.hour).zfill(2))
    df_weather['timestamp_minute'] = df_weather['DATE'].apply(lambda x:str(x.minute).zfill(2))

    #as I manually observe that the daily data is always collected at the last minute of that day
    #I create a new data frame by filtering the datetime as the last minute of a day
    df_weather_day_end = df_weather[(df_weather['timestamp_hour'] == '23')
                                & (df_weather['timestamp_minute'] == '59')]
    
    #use the index to drop all daily collected weather data from the total weather data
    #after that, we can get the pure hourly weather data 
    df_weather_hourly = df_weather.drop(df_weather_day_end.index)
    
    #reset the index
    df_weather_hourly.reset_index(inplace=True, drop=True)
    
    #only keep the useful columns
    df_weather_hourly = df_weather_hourly[['DATE','HourlyPrecipitation','HourlyWindSpeed']]
    
    # https://stackoverflow.com/questions/58807577/pandas-dataframe-extracting-float-values-from-string-in-a-column
    #since I found that there are some data with units, strip all the units and only keep the numeric values
    df_weather_hourly['HourlyPrecipitation'] = pd.to_numeric(
        df_weather_hourly['HourlyPrecipitation'].str.extract(r'(\d+\.?\d*)', expand=False), errors='coerce')
    
    #As the documentation mentions that the blank/null values indicate that
    #no precipitation was observed/reported for the hour ending at that time
    #also, the value of 'T' means trace amount of precipitation, 
    #therefore, we can directly fill all these values as 0
    df_weather_hourly['HourlyPrecipitation'] = df_weather_hourly['HourlyPrecipitation'].fillna(0)
    
    #since there are some rows whose hourly wind speed is null
    #as this is a very small amount, we can drop these values, which will not affect the whole trend
    #by the law of large numbers
    df_weather_hourly = df_weather_hourly.dropna(subset = 'HourlyWindSpeed')
    
    #make sure all the data types are the desired ones
    df_weather_hourly = df_weather_hourly.astype({'HourlyPrecipitation':'float64',
                                                  'HourlyWindSpeed':'float64'})
    
    #rename the essential columns
    df_weather_hourly.rename(columns={'DATE':'Date'}, inplace=True)
    
    #make sure the 'Date' columns follow datetime format
    df_weather_hourly['Date'] = pd.to_datetime(df_weather_hourly['Date'])
    
    #return the cleaned hourly weather data
    return df_weather_hourly

In [None]:
def get_and_clean_weather_data_daily(csv_file):
    
    #load a certain weather csv file into data frame
    df_weather = pd.read_csv(csv_file)
    
    #transform the data type of column 'Date' into a datetime format
    df_weather['DATE'] = pd.to_datetime(df_weather['DATE'])
    
    #split hour and minute of a datetime and create corresponding columns respectively
    df_weather['timestamp_hour'] = df_weather['DATE'].apply(lambda x:str(x.hour).zfill(2))
    df_weather['timestamp_minute'] = df_weather['DATE'].apply(lambda x:str(x.minute).zfill(2))

    #as I manually observe that the daily data is always collected at the last minute of that day
    #I create a new data frame by filtering the datetime as the last minute of a day
    df_weather_day_end = df_weather[(df_weather['timestamp_hour'] == '23')
                                & (df_weather['timestamp_minute'] == '59')]
    
    #use the index to drop all daily collected weather data from the total weather data
    #after that, we can get the pure hourly weather data 
    df_weather_hourly = df_weather.drop(df_weather_day_end.index)
    
    #rearrange the index, counting from zero and increment
    df_weather_hourly.reset_index(inplace=True, drop=True)
    
    #only keep the useful columns
    df_weather_hourly = df_weather_hourly[['DATE','HourlyPrecipitation','HourlyWindSpeed']]
    
    # https://stackoverflow.com/questions/58807577/pandas-dataframe-extracting-float-values-from-string-in-a-column
    #found some data with units, clear all the units and only keep the numeric values
    df_weather_hourly['HourlyPrecipitation'] = pd.to_numeric(
        df_weather_hourly['HourlyPrecipitation'].str.extract(r'(\d+\.?\d*)', expand=False), errors='coerce')
    
    #As the documentation mentions that the blank/null values indicate that
    #no precipitation was observed/reported for the hour ending at that time
    #also, the value of 'T' means trace amount of precipitation, 
    #therefore, we can directly fill all these values as 0
    df_weather_hourly['HourlyPrecipitation'] = df_weather_hourly['HourlyPrecipitation'].fillna(0)
    
    #since there are some rows whose hourly wind speed is null
    #as this is a very small amount, we can drop these values, which will not affect the whole trend
    #by the law of large numbers
    df_weather_hourly = df_weather_hourly.dropna(subset='HourlyWindSpeed')
    
    #since we can get mostly the hourly data from all weather files
    #however, there are too many missing values in a daily frequency, especially from 2009 to 2012
    #using hourly data to populate the daily data 
    #also, by observing the daily precipitation and daily average wind speed columns from 2013 to 2015
    #the way to calculate the daily precipitation is to take summation
    #the way to calculate the daily average wind speed is to take average of samples in a 24-hours interval
    df_weather_daily = df_weather_hourly.groupby(
        df_weather_hourly['DATE'].dt.date).agg({'HourlyPrecipitation': 'sum', 'HourlyWindSpeed': 'mean'})
    
    #since the index for daily df is the date, create another date column DATE with index values
    df_weather_daily['DATE'] = df_weather_daily.index
    
    #make sure the DATE as the datetime format
    df_weather_daily['DATE'] = pd.to_datetime(df_weather_daily['DATE'])
    
    #reset the default index as row number
    df_weather_daily.reset_index(inplace=True, drop=True)
    
    #rename the essential columns
    df_weather_daily.rename(columns={'DATE':'Date',
                                     'HourlyPrecipitation':'DailyPrecipitation',
                                     'HourlyWindSpeed':'DailyAverageWindSpeed'}, inplace=True)
    
    #make sure all the data types are the desired ones
    df_weather_daily = df_weather_daily.astype({'DailyPrecipitation':'float64',
                                                'DailyAverageWindSpeed':'float64'})
    
    #rearrange the order of all columns
    df_weather_daily = df_weather_daily.reindex(columns=['Date','DailyPrecipitation','DailyAverageWindSpeed'])
        
    #return the cleaned version of daily weather data
    return df_weather_daily

In [None]:
def get_and_clean_sunrise_sunset_data(csv_file):
    
    #load a certain weather csv file into data frame
    df_weather = pd.read_csv(csv_file)
    
    #transform the data type of column 'Date' into a datetime format
    df_weather['DATE'] = pd.to_datetime(df_weather['DATE'])
    
    #split hour and minute of a datetime and create corresponding columns respectively
    df_weather['timestamp_hour'] = df_weather['DATE'].apply(lambda x:str(x.hour).zfill(2))
    df_weather['timestamp_minute'] = df_weather['DATE'].apply(lambda x:str(x.minute).zfill(2))

    #as I manually observe that the daily data is always collected at the last minute of that day
    #create a new data frame with only the daily data
    df_weather_sun = df_weather[(df_weather['timestamp_hour'] == '23')
                                & (df_weather['timestamp_minute'] == '59')]
    
    #rearrange the index, counting from zero and increment
    df_weather_sun.reset_index(inplace=True, drop=True)
    
    #only keep the important columns
    df_weather_sun = df_weather_sun[['DATE','Sunrise','Sunset']]
    
    #since we can't use any other columns to populate the null values
    #directly drop the null values
    df_weather_sun = df_weather_sun.dropna(subset=['Sunrise','Sunset'])
    
    #rename the essential columns
    df_weather_sun.rename(columns={'DATE':'Date'}, inplace=True)
    
    #make sure the DATE as the datetime format
    df_weather_sun['Date'] = pd.to_datetime(df_weather_sun['Date'])
    
    return df_weather_sun

In [None]:
def load_and_clean_weather_data():
    
    weather_csv_files = get_all_weather_csvs()
    
    hourly_dataframes = []
    daily_dataframes = []
    sun_dataframes = []
        
    for csv_file in weather_csv_files:
        hourly_dataframe = get_and_clean_weather_data_hourly(csv_file)
        daily_dataframe = get_and_clean_weather_data_daily(csv_file)
        sun_dataframe = get_and_clean_sunrise_sunset_data(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        sun_dataframes.append(sun_dataframe)
        
    # create two dataframes with hourly & daily data from every year
    # create one dataframe with only sunrise and sunset data
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    sun_data = pd.concat(sun_dataframes)
    
    #make sure all index are counting the row number from 0 and gradually incrementing
    hourly_data.reset_index(inplace=True, drop=True)
    daily_data.reset_index(inplace=True, drop=True)
    sun_data.reset_index(inplace=True, drop=True)
    
    return hourly_data, daily_data, sun_data

In [None]:
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
hourly_weather_data.head()

In [None]:
daily_weather_data.head()

## Part 2: Storing Cleaned Data

In [None]:
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA = """
CREATE TABLE IF NOT EXISTS hourly_weather (
    hourly_Id INTEGER PRIMARY KEY,
    Date DATE,
    HourlyPrecipitation FLOAT,
    HourlyWindSpeed FLOAT
);
"""

DAILY_WEATHER_SCHEMA = """
CREATE TABLE IF NOT EXISTS daily_weather (
    daily_Id INTEGER PRIMARY KEY,
    Date DATE,
    DailyPrecipitation FLOAT,
    DailyAverageWindSpeed FLOAT
);
"""

TAXI_TRIPS_SCHEMA = """
CREATE TABLE IF NOT EXISTS taxi_trips (
    taxi_Id INTEGER PRIMARY KEY AUTOINCREMENT,
    pickup_datetime DATE,
    tip_amount FLOAT,
    pickup_longitude FLOAT,
    pickup_latitude FLOAT,
    dropoff_longitude FLOAT,
    dropoff_latitude FLOAT,
    distance FLOAT
);
"""

UBER_TRIPS_SCHEMA = """
CREATE TABLE IF NOT EXISTS uber_trips (
    uber_Id INTEGER PRIMARY KEY AUTOINCREMENT,
    pickup_datetime DATE,
    pickup_longitude FLOAT,
    pickup_latitude FLOAT,
    dropoff_longitude FLOAT,
    dropoff_latitude FLOAT,
    passenger_count INTEGER,
    distance FLOAT
    );
"""

In [None]:
# create that required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [None]:
# create the tables with the schema files
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    raise NotImplemented()

In [None]:
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_data,
    "daily_weather": daily_data,
}

In [None]:
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

### Query 1

In [None]:
QUERY_1_FILENAME = ""

QUERY_1 = """
TODO
"""

In [None]:
engine.execute(QUERY_1).fetchall()

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)