In [1]:
import pandas as pd

In [2]:
# Print the kernel's current working directory (shell command run via IPython's `!`).
!pwd

/home/bri/zoomcamp/data_engineering_zoomcamp_workshop/01-docker-terraform/pipeline


In [None]:
# List the installed packages as reported by uv (shell command run via IPython's `!`).
!uv pip list

In [3]:
# Remote parquet file with the trip data (green_tripdata, 2025-11 per its file name).
url = (
    'https://d37ci6vzurychx.cloudfront.net/trip-data/'
    'green_tripdata_2025-11.parquet'
)

In [7]:
# Read the parquet file at `url` into `df_trip` and preview its first rows.
# Any failure is reported with print instead of being raised.
try:
    df_trip = pd.read_parquet(url)
    print(df_trip.head())
except ImportError as e:
    # The hint below targets a missing parquet engine.
    print(
        f"Error: {e}",
        "Please install pyarrow or fastparquet to read parquet files.",
        sep="\n",
    )
except Exception as e:
    print(f"An error occurred: {e}")

   VendorID lpep_pickup_datetime lpep_dropoff_datetime store_and_fwd_flag  \
0         2  2025-11-01 00:34:48   2025-11-01 00:41:39                  N   
1         2  2025-11-01 00:18:52   2025-11-01 00:24:27                  N   
2         2  2025-11-01 01:03:14   2025-11-01 01:15:24                  N   
3         2  2025-11-01 00:10:57   2025-11-01 00:24:53                  N   
4         1  2025-11-01 00:03:48   2025-11-01 00:19:38                  N   

   RatecodeID  PULocationID  DOLocationID  passenger_count  trip_distance  \
0         1.0            74            42              1.0           0.74   
1         1.0            74            42              2.0           0.95   
2         1.0            83           160              1.0           2.19   
3         1.0           166           127              1.0           5.44   
4         1.0           166           262              1.0           3.20   

   fare_amount  ...  mta_tax  tip_amount  tolls_amount  ehail_fee  \
0    

In [8]:
# Print the structural summary of the trip data (columns, non-null counts, dtypes).
df_trip.info()

<class 'pandas.DataFrame'>
RangeIndex: 46912 entries, 0 to 46911
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               46912 non-null  int32         
 1   lpep_pickup_datetime   46912 non-null  datetime64[us]
 2   lpep_dropoff_datetime  46912 non-null  datetime64[us]
 3   store_and_fwd_flag     41343 non-null  str           
 4   RatecodeID             41343 non-null  float64       
 5   PULocationID           46912 non-null  int32         
 6   DOLocationID           46912 non-null  int32         
 7   passenger_count        41343 non-null  float64       
 8   trip_distance          46912 non-null  float64       
 9   fare_amount            46912 non-null  float64       
 10  extra                  46912 non-null  float64       
 11  mta_tax                46912 non-null  float64       
 12  tip_amount             46912 non-null  float64       
 13  tolls_amount

In [9]:
# Remote CSV with the taxi zone lookup table (taxi_zone_lookup.csv).
url_zones = (
    'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/'
    'taxi_zone_lookup.csv'
)

In [10]:
# Read the CSV at `url_zones` into `df_zones` and preview its first rows.
# Any failure is reported with print instead of being raised.
try:
    df_zones = pd.read_csv(url_zones)
    print(df_zones.head())
except ImportError as e:
    # This cell reads a CSV, so the pyarrow/fastparquet hint that used to be
    # printed here (copied from the parquet cell) was misleading.
    print(f"Error: {e}")
    print("A dependency needed to read this CSV file is missing; install it and retry.")
except Exception as e:
    print(f"An error occurred: {e}")

   LocationID        Borough                     Zone service_zone
0           1            EWR           Newark Airport          EWR
1           2         Queens              Jamaica Bay    Boro Zone
2           3          Bronx  Allerton/Pelham Gardens    Boro Zone
3           4      Manhattan            Alphabet City  Yellow Zone
4           5  Staten Island            Arden Heights    Boro Zone


In [11]:
# Print the structural summary of the zone lookup (columns, non-null counts, dtypes).
df_zones.info()

<class 'pandas.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   LocationID    265 non-null    int64
 1   Borough       265 non-null    str  
 2   Zone          264 non-null    str  
 3   service_zone  263 non-null    str  
dtypes: int64(1), str(3)
memory usage: 16.9 KB


In [20]:
# For each zone-lookup column, print a banner, its value frequencies, and a spacer.
for col in df_zones.columns:
    print(
        f"--- Column: {col} ---",
        df_zones[col].value_counts(),
        "\n",
        sep="\n",
    )

--- Column: LocationID ---
LocationID
1      1
2      1
3      1
4      1
5      1
      ..
261    1
262    1
263    1
264    1
265    1
Name: count, Length: 265, dtype: int64


--- Column: Borough ---
Borough
Queens           69
Manhattan        69
Brooklyn         61
Bronx            43
Staten Island    20
Unknown           2
EWR               1
Name: count, dtype: int64


--- Column: Zone ---
Zone
Governor's Island/Ellis Island/Liberty Island    3
Corona                                           2
Newark Airport                                   1
Jamaica Bay                                      1
Allerton/Pelham Gardens                          1
                                                ..
Woodside                                         1
World Trade Center                               1
Yorkville East                                   1
Yorkville West                                   1
NV                                               1
Name: count, Length: 261, dtype: in

In [12]:
def multi_data_check_with_names(datasets, dataset_names=None):
    """
    Summarise data quality, column by column, for several dataframes at once.

    For every column of every dataframe the summary reports its dtype, the
    number and percentage of missing values, the dataframe's total row count
    and the dataframe's number of duplicated rows. The last two values are
    per-dataframe, so they repeat on each of that dataframe's column rows.

    :param datasets: A list of pandas dataframes to check.
    :param dataset_names: Optional list of names matched to ``datasets`` by
        position. A dataframe without a matching name (list omitted or too
        short) is labelled ``Dataset_<position>``, counting from 1.
    :return: A summary dataframe with one row per (dataset, column) pair.
        When ``datasets`` is empty, an empty dataframe with the same
        columns is returned.
    """
    summary_columns = [
        'Dataset',
        'Column',
        'Data Type',
        'Missing Values',
        'Percentage Missing',
        'Total Rows',
        'Duplicate Rows',
    ]

    # Treat a missing name list like an empty one so every frame gets a fallback label.
    names = list(dataset_names) if dataset_names is not None else []

    summary_list = []  # One per-dataset summary frame each

    for i, data in enumerate(datasets):
        dataset_name = names[i] if i < len(names) else f'Dataset_{i+1}'

        total_rows = len(data)
        n_columns = len(data.columns)
        null_counts = data.isnull().sum()
        # For a frame with zero rows this is 0/0, which pandas reports as NaN.
        null_percentage = (null_counts / total_rows) * 100
        dup_count = data.duplicated().sum()

        summary_list.append(pd.DataFrame({
            'Dataset': dataset_name,
            'Column': data.columns,
            'Data Type': data.dtypes.values,
            'Missing Values': null_counts.values,
            'Percentage Missing': null_percentage.values,
            # Frame-level figures, repeated on every column row.
            'Total Rows': [total_rows] * n_columns,
            'Duplicate Rows': [dup_count] * n_columns,
        }))

    # pd.concat raises ValueError on an empty list, so handle "nothing to check" explicitly.
    if not summary_list:
        return pd.DataFrame(columns=summary_columns)

    return pd.concat(summary_list, ignore_index=True)

In [13]:
# Frames to profile and, in the same order, the labels to attach to them.
names = ['df_trip', 'df_zones']
dataset = [df_trip, df_zones]

In [14]:
# Build the per-column summary for both frames; as the last expression it is shown as the cell output.
multi_data_check_with_names(dataset, names)

Unnamed: 0,Dataset,Column,Data Type,Missing Values,Percentage Missing,Total Rows,Duplicate Rows
0,df_trip,VendorID,int32,0,0.0,46912,0
1,df_trip,lpep_pickup_datetime,datetime64[us],0,0.0,46912,0
2,df_trip,lpep_dropoff_datetime,datetime64[us],0,0.0,46912,0
3,df_trip,store_and_fwd_flag,str,5569,11.871163,46912,0
4,df_trip,RatecodeID,float64,5569,11.871163,46912,0
5,df_trip,PULocationID,int32,0,0.0,46912,0
6,df_trip,DOLocationID,int32,0,0.0,46912,0
7,df_trip,passenger_count,float64,5569,11.871163,46912,0
8,df_trip,trip_distance,float64,0,0.0,46912,0
9,df_trip,fare_amount,float64,0,0.0,46912,0


## Insights from multiple data check

1. The 'ehail_fee' column in df_trip is 100% missing. After consulting with software engineers and business stakeholders, we confirmed that it is a valid feature for green taxis following a court decision in 2013, so we keep it as is.
2. Meanwhile, the 6 columns in df_trip listed below, each with 11% to 12% missing values, are kept as they are. We can analyze further whether any patterns emerge from those missing values, e.g. whether specific areas have a higher share of missing values. This could start a business process investigation.
   - store_and_fwd_flag
   - RatecodeID
   - passenger_count
   - payment_type
   - trip_type
   - congestion_surcharge
3. TODO: add further insights.

# Create dtypes and parse date for df_trip

These will be used when iterating over the data in chunks (batches).

In [25]:
# Show every column name of the trip data.
df_trip.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge',
       'cbd_congestion_fee'],
      dtype='str')

In [None]:
dtype = {
    'VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge',
       'cbd_congestion_fee'