In [1]:
from pathlib import Path
import requests
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
from tqdm import tqdm
%matplotlib inline

In [2]:
# Record the interpreter version this notebook was executed with.
import sys
print(sys.version)

3.11.9 (main, Jul 27 2024, 15:25:39) [Clang 15.0.0 (clang-1500.3.9.4)]


In [3]:
# URL prefix under which the monthly trip-data parquet files are hosted (ends with "/").
taxi_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/"
# Local data directory as a relative path (ends with "/"); passed to fetch_data and save_validated_data below.
folder  = "../data/"

In [4]:
def fetch_data(base_url: str, year: int, month: int, download_dir: str):
    """Download one month of yellow-taxi trip data into ``<download_dir>/raw``.

    Args:
        base_url: URL prefix under which the parquet files are hosted. A
            trailing slash is optional.
        year: Year used in the remote file name.
        month: Month number; zero-padded to two digits in the file name.
        download_dir: Directory that holds (or will hold) the ``raw`` folder.

    The ``raw`` folder is created on demand. The transfer itself is delegated
    to ``download_file``.
    """
    file_name = f"yellow_tripdata_{year}-{month:02d}.parquet"
    # Normalise the separator so both ".../trip-data" and ".../trip-data/"
    # produce a valid URL instead of silently gluing the names together.
    url = f"{base_url.rstrip('/')}/{file_name}"
    raw_dir = Path(download_dir) / "raw"
    if not raw_dir.exists():
        raw_dir.mkdir(parents=True, exist_ok=True)  # Create the "raw" directory if it doesn't exist
        print(f'Folder "raw" created in "{download_dir}"')
    download_file(url, raw_dir / file_name)
    
def download_file(url: str, path: Path, timeout: float = 30.0):
    """Stream ``url`` to ``path`` while showing a tqdm progress bar.

    Args:
        url: Address of the file to download.
        path: Destination file. An existing file is replaced, but only after
            the new download has completed.
        timeout: Seconds to wait for the server (connect / between reads)
            before giving up.

    Request failures (HTTP errors, connection problems, timeouts) are reported
    with ``print`` rather than raised, so the caller keeps running. Other
    exceptions propagate. In every failure case the partially written
    temporary file is removed and any pre-existing ``path`` is left untouched.
    """
    block_size = 1024  # 1 Kibibyte
    # Download next to the target and rename at the end, so an interrupted
    # transfer never leaves a truncated parquet file under the final name.
    tmp_path = path.with_name(path.name + ".part")
    try:
        # The context manager releases the streamed connection even on errors.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # Without a content-length header let tqdm show a plain counter
            # instead of a bar with a bogus total of 0.
            total_size = int(response.headers.get('content-length', 0)) or None

            if path.exists():
                print("Overwriting existing file")

            with open(tmp_path, 'wb') as file, tqdm(
                desc=path.name,
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in response.iter_content(block_size):
                    file.write(data)
                    bar.update(len(data))

        # Replaces an existing destination in one step.
        tmp_path.replace(path)
        print(f"File {path.name} saved")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
    finally:
        # No-op after a successful rename; cleans up after any failure.
        tmp_path.unlink(missing_ok=True)

In [5]:
def save_validated_data(rides: pd.DataFrame, folder: str, file_name: str):
    """Write ``rides`` as a parquet file to ``<folder>/transformed/<file_name>``.

    Args:
        rides: DataFrame to persist.
        folder: Directory that holds (or will hold) the ``transformed`` folder.
        file_name: Name of the parquet file to write.

    The ``transformed`` folder is created on demand. The progress bar is
    cosmetic: ``to_parquet`` is a single call, so the bar jumps from 0 to the
    full row count once the write returns.
    """
    target_dir = Path(folder, "transformed")
    if not target_dir.exists():
        # First use: create the output folder and tell the user about it.
        target_dir.mkdir(parents=True, exist_ok=True)
        print(f'Folder "transformed" created in "{folder}"')

    destination = target_dir / file_name
    row_count = len(rides)

    progress = tqdm(total=row_count, desc="Saving data", unit="rows")
    with progress:
        rides.to_parquet(destination)
        progress.update(row_count)

    print(f'Validated data saved to "{destination}"')

In [6]:

# Download the January 2024 file into <folder>/raw.
fetch_data(base_url=taxi_url, year=2024, month=1, download_dir=folder)

Folder "raw" created in "../data/"


yellow_tripdata_2024-01.parquet: 100%|██████████| 47.6M/47.6M [02:25<00:00, 344kiB/s] 

File yellow_tripdata_2024-01.parquet saved





In [7]:
# Load the downloaded January 2024 file. Build the path with pathlib (as the
# helper functions do) so it stays valid whether or not `folder` ends in "/".
raw_path = Path(folder) / "raw" / "yellow_tripdata_2024-01.parquet"
rides = pd.read_parquet(raw_path)
rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964624 entries, 0 to 2964623
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [8]:
# Widen every int32 column to int64 with a single astype mapping.
int32_columns = rides.select_dtypes(include=['int32']).columns
rides = rides.astype({name: 'int64' for name in int32_columns})

# Show the resulting schema
rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964624 entries, 0 to 2964623
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [9]:
# Build a ydata-profiling report over the full `rides` frame and export it
# as HTML to the relative path "profile.html".
profile = ProfileReport(rides, title="Profiling Report")
profile.to_file("profile.html")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Select the rows whose store_and_fwd_flag is null and display them.
# NOTE(review): the variable is named after passenger_count, but the mask
# tests store_and_fwd_flag — confirm which column was intended.
nan_passenger_count = rides[rides['store_and_fwd_flag'].isnull()]
nan_passenger_count

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
2824462,2,2024-01-01 00:34:19,2024-01-01 00:51:22,,2.04,,,143,141,0,12.72,0.00,0.5,0.00,0.00,1.0,16.72,,
2824463,1,2024-01-01 00:14:31,2024-01-01 00:19:29,,1.60,,,236,238,0,9.30,1.00,0.5,2.86,0.00,1.0,17.16,,
2824464,1,2024-01-01 00:35:11,2024-01-01 01:13:40,,0.00,,,142,79,0,21.01,0.00,0.5,0.00,0.00,1.0,25.01,,
2824465,1,2024-01-01 00:33:37,2024-01-01 00:50:34,,0.00,,,237,4,0,17.79,0.00,0.5,0.00,0.00,1.0,21.79,,
2824466,1,2024-01-01 00:49:04,2024-01-01 01:01:16,,0.00,,,244,50,0,34.65,0.00,0.5,0.00,0.00,1.0,38.65,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2964619,2,2024-01-31 23:45:59,2024-01-31 23:54:36,,3.18,,,107,263,0,15.77,0.00,0.5,2.00,0.00,1.0,21.77,,
2964620,1,2024-01-31 23:13:07,2024-01-31 23:27:52,,4.00,,,114,236,0,18.40,1.00,0.5,2.34,0.00,1.0,25.74,,
2964621,2,2024-01-31 23:19:00,2024-01-31 23:38:00,,3.33,,,211,25,0,19.97,0.00,0.5,0.00,0.00,1.0,23.97,,
2964622,2,2024-01-31 23:07:23,2024-01-31 23:25:14,,3.06,,,107,13,0,23.88,0.00,0.5,5.58,0.00,1.0,33.46,,


In [11]:
# Keep only the pickup timestamp and pickup zone, under friendlier names, and
# drop rows whose pickup falls outside January 2024 (start inclusive, end
# exclusive). `pickup_time` is a copy of `pickup_datetime`.
# Note: this rebinds `rides`, so the cell cannot be re-run without reloading
# the raw frame first (the original column names are gone afterwards).
JAN_START = pd.Timestamp('2024-01-01 00:00:00')
FEB_START = pd.Timestamp('2024-02-01 00:00:00')
pickup_columns = {
    'tpep_pickup_datetime': 'pickup_datetime',
    'PULocationID': 'pickup_location_id',
}

rides = (
    rides[list(pickup_columns)]
    .rename(columns=pickup_columns)
    .assign(pickup_time=lambda df: df['pickup_datetime'])
    .loc[lambda df: df['pickup_datetime'].between(JAN_START, FEB_START, inclusive='left')]
)

In [12]:
# Persist the filtered pickups to <folder>/transformed.
save_validated_data(rides=rides, folder=folder, file_name="clean_rides_2024-01.parquet")

Folder "transformed" created in "../data/"


Saving data: 100%|██████████| 2964606/2964606 [00:00<00:00, 13239809.96rows/s]

Validated data saved to "../data/transformed/clean_rides_2024-01.parquet"



