### Notebook for data load and EDA

In [3]:
# libs import
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
import lux
import tqdm
import requests
from typing import List, Optional
import plotly.express as px

# silence all Python warnings raised from here on
# NOTE(review): this is a blanket filter, so library warnings (pandas included) will not be shown either
warnings.filterwarnings('ignore')

# render inline figures in 'retina' format -> delete this line if you don't have a retina display
%config InlineBackend.figure_format = 'retina'

In [4]:
# data path and import 
import requests

def download_one_file_of_raw_data(year: int, month: int, timeout: float = 60.0) -> Optional[str]:
    """Download one month of raw yellow-taxi trip data and save it to the data/raw folder.

    The parquet file is requested from the TLC Trip Record Data URL and written to
    ``../data/raw/rides_{year}-{month:02d}.parquet`` (relative to the current working
    directory). The target folder is created when it does not exist yet.

    Args:
        year (int): The year of the data to download.
        month (int): The month of the data to download (zero-padded to two digits in the URL and file name).
        timeout (float): Timeout in seconds handed to ``requests.get`` so a stalled
            connection cannot block the notebook forever. Defaults to 60 seconds.

    Returns:
        Optional[str]: The path to the downloaded file as a string, or None when the
        server did not answer with HTTP status 200.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts) and by the
        file write are not caught here and propagate to the caller.
    """
    # create the url
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet"
    response = requests.get(url, timeout=timeout)

    # guard clause: anything other than 200 is reported and signalled with None
    if response.status_code != 200:
        print(f"Could not download file {url}")
        return None

    # create the path to the file and make sure its folder exists
    path = f'../data/raw/rides_{year}-{month:02d}.parquet'
    Path(path).parent.mkdir(parents=True, exist_ok=True)

    # the context manager closes the file handle even if the write fails
    with open(path, 'wb') as raw_file:
        raw_file.write(response.content)

    # kept as a plain string so existing callers and displayed outputs stay unchanged
    return path


In [5]:
# download the January 2022 file; the cell displays the value returned by the function
download_one_file_of_raw_data(2022, 1)

'../data/raw/rides_2022-01.parquet'

In [6]:
# load the raw January 2022 parquet file into a DataFrame and preview the first rows
raw_rides_file = '../data/raw/rides_2022-01.parquet'
rides = pd.read_parquet(raw_rides_file)
rides.head()

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [7]:
# we only need the pickup time and the pickup location (PULocationID)
rides = rides[['tpep_pickup_datetime', 'PULocationID']]

In [8]:
# rename the columns
# (re-assign instead of inplace=True: `rides` is a column subset of the loaded frame,
# and renaming it in place can raise pandas' SettingWithCopyWarning; re-assignment
# also keeps this cell safe to re-run)
rides = rides.rename(columns={
    'tpep_pickup_datetime': 'pickup_datetime',
    'PULocationID': 'pickup_location_id'})

In [9]:
# summary statistics of the pickup timestamps (count, range, most frequent value)
rides['pickup_datetime'].describe()

count                 2463931
unique                1423522
top       2022-01-26 07:57:00
freq                       12
first     2008-12-31 22:23:09
last      2022-05-18 20:41:57
Name: pickup_datetime, dtype: object


In [10]:
# keep only the rides picked up in January 2022:
# lower bound inclusive (2022-01-01), upper bound exclusive (2022-02-01)
in_january_2022 = (rides.pickup_datetime >= '2022-01-01') & (rides.pickup_datetime < '2022-02-01')
rides = rides[in_january_2022]
rides.pickup_datetime.describe()

count                 2463879
unique                1423471
top       2022-01-26 07:57:00
freq                       12
first     2022-01-01 00:00:08
last      2022-01-31 23:59:58
Name: pickup_datetime, dtype: object


In [11]:
# save the data
# the output folder is created first so the notebook also runs on a fresh checkout
validated_rides_file = Path('../data/transformed/validated_rides_2022-01.parquet')
validated_rides_file.parent.mkdir(parents=True, exist_ok=True)
rides.to_parquet(validated_rides_file)

In [12]:
# reload the validated rides from disk and preview the first rows
validated_rides_file = '../data/transformed/validated_rides_2022-01.parquet'
rides = pd.read_parquet(validated_rides_file)
rides.head()

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()