In [11]:
import math
import os

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
import geopandas as gpd

# any constants you might need; some have been added for you, and 
# some you need to fill in

TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

TAXI_ZONES_DIR = "data/taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
UBER_CSV = ""
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

# Make sure the QUERY_DIRECTORY exists
try:
    os.mkdir(QUERY_DIRECTORY)
except Exception as e:
    if e.errno == 17:
        # the directory already exists
        pass
    else:
        raise


# Load Taxi Zones
def load_taxi_zones(shapefile):
    taxi_zones = gpd.read_file(shapefile)
    return taxi_zones

def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    zone_coords = loaded_taxi_zones[loaded_taxi_zones['LocationID'] == zone_loc_id].iloc[0]['geometry'].centroid.coords[0]
    return zone_coords

# Calculate distance
def calculate_distance_with_coords(from_coord, to_coord):
    # convert coordinates to radians
    from_lat, from_lon = from_coord
    to_lat, to_lon = to_coord
    from_lat, from_lon, to_lat, to_lon = map(math.radians, [from_lat, from_lon, to_lat, to_lon])
    
    # haversine formula
    d_lat = to_lat - from_lat
    d_lon = to_lon - from_lon
    a = math.sin(d_lat/2)**2 + math.cos(from_lat) * math.cos(to_lat) * math.sin(d_lon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    radius = 6371 # radius of Earth in km
    distance = radius * c
    return distance

def calculate_distance_with_zones(from_zone, to_zone):
    from_coords = lookup_coords_for_taxi_zone_id(from_zone, taxi_zones)
    to_coords = lookup_coords_for_taxi_zone_id(to_zone, taxi_zones)
    distance = calculate_distance_with_coords(from_coords, to_coords)
    return distance

def add_distance_column(dataframe):
    dataframe['distance'] = dataframe.apply(lambda row: calculate_distance_with_zones(row['PULocationID'], row['DOLocationID']), axis=1)
    return dataframe


# Process Taxi Data
def get_all_urls_from_taxi_page(taxi_page):
    response = requests.get(taxi_page)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')



   


In [18]:
import pandas as pd

UBER_DATA = 'uber_rides_sample.csv'

def load_and_clean_uber_data(csv_file):
    # Load CSV file into a pandas dataframe
    df = pd.read_csv(csv_file)
   
    # Convert datetime column to datetime format
    df['datetime'] = pd.to_datetime(df['key'])
    df = add_distance_column(df)
    
    # Return the cleaned dataframe
    return df
    
# def add_distance_column(df):
#     # Compute the distance between pickup and dropoff locations using the haversine formula
#     # and add it as a new column to the dataframe
#     # (implementation details depend on the format of the data, which is not provided)
#     # ...
#     df['distance'] = (df['pickup_longitude'] -  df['dropoff_longitude'])**2 + (df['pickup_latitude'] -  df['dropoff_latitude'])**2
#     df = df[df['distance'] != 0]
    
#     # Return the dataframe with the new column added
#     return df

def add_distance_column(dataframe):
    dataframe['distance'] = dataframe.apply(lambda row: calculate_distance_with_coords((row['pickup_latitude'], row['pickup_longitude']),(row['dropoff_latitude'], row['dropoff_longitude']), axis=1))
    return dataframe

def get_uber_data():
    # Load and clean the Uber data from the CSV file
    uber_dataframe = load_and_clean_uber_data(UBER_DATA)
    
    # Add a distance column to the dataframe
    uber_dataframe = add_distance_column(uber_dataframe)

    
   
    
    # Return the cleaned dataframe
    return uber_dataframe

# Load and print the first few rows of the Uber data
uber_data = get_uber_data()
print(uber_data.head())


KeyError: 'pickup_latitude'

In [13]:
uber_data.head(5)

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,datetime,distance
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,2015-05-07 19:52:06.000000300,0.000229
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1,2009-07-17 20:04:56.000000200,0.000489
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1,2009-08-24 21:45:00.000000610,0.002821
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,2009-06-26 08:22:21.000000100,0.000273
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,2014-08-28 17:47:00.000000188,0.002604


In [None]:
import os
import pandas as pd

WEATHER_CSV_DIR = 'path/to/weather/csv/files'

def get_all_weather_csvs(directory):
    # Get a list of all CSV files in the specified directory
    csv_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]
    
    # Return the list of CSV files
    return csv_files

def clean_month_weather_data_hourly(csv_file):
    # Load CSV file into a pandas dataframe
    df = pd.read_csv(csv_file)
    
    # Convert datetime column to datetime format
    df['datetime'] = pd.to_datetime(df['datetime'])
    
    # Return the cleaned dataframe
    return df

def clean_month_weather_data_daily(csv_file):
    # Load CSV file into a pandas dataframe
    df = pd.read_csv(csv_file)
    
    # Convert date column to datetime format
    df['date'] = pd.to_datetime(df['date'])
    
    # Return the cleaned dataframe
    return df

def load_and_clean_weather_data():
    # Get a list of all weather CSV files in the specified directory
    weather_csv_files = get_all_weather_csvs(WEATHER_CSV_DIR)
    
    # Create two lists to store hourly and daily dataframes from each month
    hourly_dataframes = []
    daily_dataframes = []
        
    for csv_file in weather_csv_files:
        # Clean the weather data for the current month and append the hourly and daily dataframes
        hourly_dataframe = clean_month_weather_data_hourly(csv_file)
        daily_dataframe = clean_month_weather_data_daily(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # Concatenate all hourly and daily dataframes into two dataframes
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    
    # Return the cleaned hourly and daily dataframes
    return hourly_data, daily_data

# Load and print the first few rows of the hourly and daily weather data
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()
print(hourly_weather_data.head())
print(daily_weather_data.head())
