In [4]:
# NYC Yellow Taxi Analytics - Data Quality Assessment
# This notebook performs comprehensive data quality checks on the taxi trip data

import pandas as pd
import numpy as np
import os
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

print("=" * 80)
print("NYC YELLOW TAXI DATA - COMPREHENSIVE QUALITY ASSESSMENT")
print("=" * 80)

NYC YELLOW TAXI DATA - COMPREHENSIVE QUALITY ASSESSMENT


In [5]:

# 1. DATA DISCOVERY - Find all parquet files

data_path = '/Users/yash/Documents/Projects/NYC_Yellow_Taxi_Analytics/data/raw/'

# Find all parquet files across all years
all_files = []
for year in ['2022', '2023', '2024', '2025']:
    year_path = os.path.join(data_path, year)
    if os.path.exists(year_path):
        files = glob(os.path.join(year_path, '*.parquet'))
        all_files.extend(files)

print(f"\nüìÅ DISCOVERED FILES")
print("=" * 80)
print(f"Total parquet files found: {len(all_files)}")

# Organize by year
files_by_year = {}
for file in sorted(all_files):
    year = file.split('/')[-2]
    if year not in files_by_year:
        files_by_year[year] = []
    files_by_year[year].append(os.path.basename(file))

for year, files in sorted(files_by_year.items()):
    print(f"\n{year}: {len(files)} files")
    for f in files[:3]:  # Show first 3 files
        print(f"  - {f}")
    if len(files) > 3:
        print(f"  ... and {len(files) - 3} more")




üìÅ DISCOVERED FILES
Total parquet files found: 44

2022: 12 files
  - yellow_tripdata_2022-01.parquet
  - yellow_tripdata_2022-02.parquet
  - yellow_tripdata_2022-03.parquet
  ... and 9 more

2023: 12 files
  - yellow_tripdata_2023-01.parquet
  - yellow_tripdata_2023-02.parquet
  - yellow_tripdata_2023-03.parquet
  ... and 9 more

2024: 12 files
  - yellow_tripdata_2024-01.parquet
  - yellow_tripdata_2024-02.parquet
  - yellow_tripdata_2024-03.parquet
  ... and 9 more

2025: 8 files
  - yellow_tripdata_2025-01.parquet
  - yellow_tripdata_2025-02.parquet
  - yellow_tripdata_2025-03.parquet
  ... and 5 more


In [7]:
# 2. LOAD SAMPLE DATA - Load one file from each year for quality check

print("\n\nüìä LOADING SAMPLE DATA FROM EACH YEAR")
print("=" * 80)

sample_data = {}
for year in files_by_year.keys():
    try:
        # Load first file from each year
        file_path = os.path.join(data_path, year, files_by_year[year][0])
        df_sample = pd.read_parquet(file_path)
        sample_data[year] = df_sample
        
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"\n{year}:")
        print(f"  File: {files_by_year[year][0]}")
        print(f"  Rows: {len(df_sample):,}")
        print(f"  Columns: {len(df_sample.columns)}")
        print(f"  File Size: {file_size_mb:.2f} MB")
        print(f"  Memory Usage: {df_sample.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
    except Exception as e:
        print(f"\n{year}: ‚ùå Error loading - {str(e)}")

# Use most recent year for detailed analysis
latest_year = max(sample_data.keys())
df = sample_data[latest_year]
print(f"\n‚úÖ Using {latest_year} data for detailed quality checks")



üìä LOADING SAMPLE DATA FROM EACH YEAR

2022:
  File: yellow_tripdata_2022-01.parquet
  Rows: 2,463,931
  Columns: 19
  File Size: 36.37 MB
  Memory Usage: 454.09 MB

2022:
  File: yellow_tripdata_2022-01.parquet
  Rows: 2,463,931
  Columns: 19
  File Size: 36.37 MB
  Memory Usage: 454.09 MB

2023:
  File: yellow_tripdata_2023-01.parquet
  Rows: 3,066,766
  Columns: 19
  File Size: 45.46 MB
  Memory Usage: 565.61 MB

2023:
  File: yellow_tripdata_2023-01.parquet
  Rows: 3,066,766
  Columns: 19
  File Size: 45.46 MB
  Memory Usage: 565.61 MB

2024:
  File: yellow_tripdata_2024-01.parquet
  Rows: 2,964,624
  Columns: 19
  File Size: 47.65 MB
  Memory Usage: 511.09 MB

2024:
  File: yellow_tripdata_2024-01.parquet
  Rows: 2,964,624
  Columns: 19
  File Size: 47.65 MB
  Memory Usage: 511.09 MB

2025:
  File: yellow_tripdata_2025-01.parquet
  Rows: 3,475,226
  Columns: 20
  File Size: 56.42 MB
  Memory Usage: 616.31 MB

‚úÖ Using 2025 data for detailed quality checks

2025:
  File: yello

In [8]:

# 3. SCHEMA VALIDATION - Check column structure

print("\n\nüìã SCHEMA VALIDATION")
print("=" * 80)

# Expected columns based on data dictionary
expected_columns = [
    'VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
    'passenger_count', 'trip_distance', 'RatecodeID',
    'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
    'payment_type', 'fare_amount', 'extra', 'mta_tax',
    'tip_amount', 'tolls_amount', 'improvement_surcharge',
    'total_amount', 'congestion_surcharge', 'Airport_fee'
]

print("\n‚úì Column Presence Check:")
actual_columns = df.columns.tolist()
missing_columns = set(expected_columns) - set(actual_columns)
extra_columns = set(actual_columns) - set(expected_columns)

if not missing_columns and not extra_columns:
    print("  ‚úÖ All expected columns present, no extra columns")
else:
    if missing_columns:
        print(f"  ‚ö†Ô∏è  Missing columns: {missing_columns}")
    if extra_columns:
        print(f"  ‚ÑπÔ∏è  Extra columns: {extra_columns}")

print(f"\n‚úì Column Count: {len(actual_columns)}")
print(f"\n‚úì Column Names and Data Types:")
for col in actual_columns:
    dtype = df[col].dtype
    null_pct = (df[col].isna().sum() / len(df)) * 100
    print(f"  {col:30s} | {str(dtype):15s} | Nulls: {null_pct:5.2f}%")




üìã SCHEMA VALIDATION

‚úì Column Presence Check:
  ‚ÑπÔ∏è  Extra columns: {'cbd_congestion_fee'}

‚úì Column Count: 20

‚úì Column Names and Data Types:
  VendorID                       | int32           | Nulls:  0.00%
  tpep_pickup_datetime           | datetime64[us]  | Nulls:  0.00%
  tpep_dropoff_datetime          | datetime64[us]  | Nulls:  0.00%
  passenger_count                | float64         | Nulls: 15.54%
  trip_distance                  | float64         | Nulls:  0.00%
  RatecodeID                     | float64         | Nulls: 15.54%
  store_and_fwd_flag             | object          | Nulls: 15.54%
  PULocationID                   | int32           | Nulls:  0.00%
  DOLocationID                   | int32           | Nulls:  0.00%
  payment_type                   | int64           | Nulls:  0.00%
  fare_amount                    | float64         | Nulls:  0.00%
  extra                          | float64         | Nulls:  0.00%
  mta_tax                        | floa

In [9]:

# 4. DATA QUALITY METRICS

print("\n\nüîç DATA QUALITY METRICS")
print("=" * 80)

# Basic statistics
print(f"\n‚úì Dataset Overview:")
print(f"  Total Records: {len(df):,}")
print(f"  Date Range: {df['tpep_pickup_datetime'].min()} to {df['tpep_pickup_datetime'].max()}")
print(f"  Unique Pickup Locations: {df['PULocationID'].nunique()}")
print(f"  Unique Dropoff Locations: {df['DOLocationID'].nunique()}")

# Missing values analysis
print(f"\n‚úì Missing Values Analysis:")
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isna().sum(),
    'Missing_Percentage': (df.isna().sum() / len(df)) * 100
}).sort_values('Missing_Percentage', ascending=False)

critical_missing = missing_summary[missing_summary['Missing_Percentage'] > 0]
if len(critical_missing) > 0:
    print(critical_missing.to_string(index=False))
else:
    print("  ‚úÖ No missing values detected")

# Duplicate records
duplicate_count = df.duplicated().sum()
print(f"\n‚úì Duplicate Records: {duplicate_count:,} ({(duplicate_count/len(df)*100):.2f}%)")




üîç DATA QUALITY METRICS

‚úì Dataset Overview:
  Total Records: 3,475,226
  Date Range: 2024-12-31 20:47:55 to 2025-02-01 00:00:44
  Unique Pickup Locations: 261
  Unique Dropoff Locations: 260

‚úì Missing Values Analysis:
              Column  Missing_Count  Missing_Percentage
         Airport_fee         540149               15.54
     passenger_count         540149               15.54
congestion_surcharge         540149               15.54
          RatecodeID         540149               15.54
  store_and_fwd_flag         540149               15.54
              Column  Missing_Count  Missing_Percentage
         Airport_fee         540149               15.54
     passenger_count         540149               15.54
congestion_surcharge         540149               15.54
          RatecodeID         540149               15.54
  store_and_fwd_flag         540149               15.54

‚úì Duplicate Records: 0 (0.00%)

‚úì Duplicate Records: 0 (0.00%)


In [10]:

# 5. DATA ANOMALY DETECTION

print("\n\n‚ö†Ô∏è  DATA ANOMALY DETECTION")
print("=" * 80)

anomalies = {}

# 1. Negative or zero values where they shouldn't be
print("\n1. Negative/Zero Value Checks:")
anomalies['negative_fare'] = (df['fare_amount'] <= 0).sum()
anomalies['negative_distance'] = (df['trip_distance'] < 0).sum()
anomalies['zero_distance'] = (df['trip_distance'] == 0).sum()
anomalies['negative_passenger'] = (df['passenger_count'] <= 0).sum()

print(f"  Negative/Zero Fares: {anomalies['negative_fare']:,}")
print(f"  Negative Distance: {anomalies['negative_distance']:,}")
print(f"  Zero Distance: {anomalies['zero_distance']:,} ({(anomalies['zero_distance']/len(df)*100):.2f}%)")
print(f"  Zero/Negative Passengers: {anomalies['negative_passenger']:,}")

# 2. Unrealistic values
print("\n2. Unrealistic Value Checks:")
anomalies['extreme_fare'] = (df['fare_amount'] > 500).sum()
anomalies['extreme_distance'] = (df['trip_distance'] > 100).sum()
anomalies['extreme_passengers'] = (df['passenger_count'] > 6).sum()

print(f"  Fares > $500: {anomalies['extreme_fare']:,}")
print(f"  Distance > 100 miles: {anomalies['extreme_distance']:,}")
print(f"  Passengers > 6: {anomalies['extreme_passengers']:,}")

# 3. Time-based anomalies
df['trip_duration_minutes'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
anomalies['negative_duration'] = (df['trip_duration_minutes'] < 0).sum()
anomalies['zero_duration'] = (df['trip_duration_minutes'] == 0).sum()
anomalies['extreme_duration'] = (df['trip_duration_minutes'] > 180).sum()  # > 3 hours

print(f"\n3. Time-Based Anomalies:")
print(f"  Negative Duration: {anomalies['negative_duration']:,}")
print(f"  Zero Duration: {anomalies['zero_duration']:,}")
print(f"  Duration > 3 hours: {anomalies['extreme_duration']:,}")

# 4. Total amount validation
df['calculated_total'] = (df['fare_amount'] + df['extra'] + df['mta_tax'] + 
                          df['tip_amount'] + df['tolls_amount'] + 
                          df['improvement_surcharge'] + df['congestion_surcharge'].fillna(0) + 
                          df['Airport_fee'].fillna(0))
df['total_mismatch'] = abs(df['total_amount'] - df['calculated_total']) > 0.10  # 10 cent tolerance
anomalies['total_mismatch'] = df['total_mismatch'].sum()

print(f"\n4. Total Amount Validation:")
print(f"  Total Amount Mismatches: {anomalies['total_mismatch']:,} ({(anomalies['total_mismatch']/len(df)*100):.2f}%)")

# Summary
total_anomalies = sum(anomalies.values())
print(f"\nüìä TOTAL ANOMALIES DETECTED: {total_anomalies:,} ({(total_anomalies/len(df)*100):.2f}% of records)")



‚ö†Ô∏è  DATA ANOMALY DETECTION

1. Negative/Zero Value Checks:
  Negative/Zero Fares: 145,516
  Negative Distance: 0
  Zero Distance: 90,893 (2.62%)
  Zero/Negative Passengers: 24,656

2. Unrealistic Value Checks:
  Fares > $500: 55
  Distance > 100 miles: 162
  Passengers > 6: 18

3. Time-Based Anomalies:
  Negative Duration: 124
  Zero Duration: 1,927
  Duration > 3 hours: 1,377

4. Total Amount Validation:
  Total Amount Mismatches: 2,564,189 (73.78%)

üìä TOTAL ANOMALIES DETECTED: 2,828,917 (81.40% of records)


In [11]:

# 6. STATISTICAL SUMMARY

print("\n\nüìà STATISTICAL SUMMARY - KEY METRICS")
print("=" * 80)

# Fare statistics
print("\nüí∞ Fare Amount:")
print(df['fare_amount'].describe())

# Distance statistics
print("\nüìè Trip Distance:")
print(df['trip_distance'].describe())

# Passenger count
print("\nüë• Passenger Count:")
print(df['passenger_count'].value_counts().sort_index())

# Payment type
print("\nüí≥ Payment Type Distribution:")
payment_labels = {1: 'Credit Card', 2: 'Cash', 3: 'No Charge', 4: 'Dispute', 5: 'Unknown', 6: 'Voided'}
payment_dist = df['payment_type'].value_counts()
for payment_id, count in payment_dist.items():
    label = payment_labels.get(payment_id, f'Unknown ({payment_id})')
    pct = (count / len(df)) * 100
    print(f"  {label}: {count:,} ({pct:.2f}%)")

# Rate code
print("\nüöï Rate Code Distribution:")
rate_labels = {1: 'Standard', 2: 'JFK', 3: 'Newark', 4: 'Nassau/Westchester', 5: 'Negotiated', 6: 'Group Ride'}
rate_dist = df['RatecodeID'].value_counts()
for rate_id, count in rate_dist.items():
    label = rate_labels.get(rate_id, f'Unknown ({rate_id})')
    pct = (count / len(df)) * 100
    print(f"  {label}: {count:,} ({pct:.2f}%)")

# Trip duration
print("\n‚è±Ô∏è  Trip Duration (minutes):")
print(df['trip_duration_minutes'].describe())



üìà STATISTICAL SUMMARY - KEY METRICS

üí∞ Fare Amount:
count   3475226.00
mean         17.08
std         463.47
min        -900.00
25%           8.60
50%          12.11
75%          19.50
max      863372.12
Name: fare_amount, dtype: float64

üìè Trip Distance:
count   3475226.00
mean          5.86
std         564.60
min           0.00
25%           0.98
50%           1.67
75%           3.10
max      276423.57
Name: trip_distance, dtype: float64

üë• Passenger Count:
passenger_count
0.00      24656
1.00    2322434
2.00     407761
3.00      91409
4.00      59009
5.00      17786
6.00      12004
7.00          4
8.00         11
9.00          3
Name: count, dtype: int64

üí≥ Payment Type Distribution:
  Credit Card: 2,444,393 (70.34%)
  Unknown (0): 540,149 (15.54%)
  Cash: 390,429 (11.23%)
  Dispute: 76,481 (2.20%)
  No Charge: 23,773 (0.68%)
  Unknown: 1 (0.00%)

üöï Rate Code Distribution:
  Standard: 2,756,472 (79.32%)
  JFK: 94,420 (2.72%)
  Unknown (99.0): 41,963 (1.21%)
  Neg

In [12]:

# 7. BUSINESS LOGIC VALIDATION

print("\n\n‚úÖ BUSINESS LOGIC VALIDATION")
print("=" * 80)

# Tip percentage analysis (credit card only)
credit_card_trips = df[df['payment_type'] == 1].copy()
credit_card_trips['tip_percentage'] = (credit_card_trips['tip_amount'] / credit_card_trips['fare_amount']) * 100

print("\nüí° Tip Analysis (Credit Card Payments Only):")
print(f"  Credit Card Trips: {len(credit_card_trips):,} ({(len(credit_card_trips)/len(df)*100):.2f}%)")
print(f"  Trips with Tips: {(credit_card_trips['tip_amount'] > 0).sum():,}")
print(f"  Average Tip %: {credit_card_trips['tip_percentage'].mean():.2f}%")
print(f"  Median Tip %: {credit_card_trips['tip_percentage'].median():.2f}%")

# Revenue per mile
df['revenue_per_mile'] = df['fare_amount'] / df['trip_distance'].replace(0, np.nan)
print(f"\nüíµ Revenue per Mile:")
print(f"  Mean: ${df['revenue_per_mile'].mean():.2f}")
print(f"  Median: ${df['revenue_per_mile'].median():.2f}")

# Revenue per minute
df['revenue_per_minute'] = df['fare_amount'] / df['trip_duration_minutes'].replace(0, np.nan)
print(f"\n‚è∞ Revenue per Minute:")
print(f"  Mean: ${df['revenue_per_minute'].mean():.2f}")
print(f"  Median: ${df['revenue_per_minute'].median():.2f}")

# Airport trips analysis
airport_trips = df[df['RatecodeID'].isin([2, 3])]
print(f"\n‚úàÔ∏è  Airport Trips:")
print(f"  Count: {len(airport_trips):,} ({(len(airport_trips)/len(df)*100):.2f}%)")
print(f"  Average Fare: ${airport_trips['fare_amount'].mean():.2f}")
print(f"  Average Distance: {airport_trips['trip_distance'].mean():.2f} miles")



‚úÖ BUSINESS LOGIC VALIDATION

üí° Tip Analysis (Credit Card Payments Only):
  Credit Card Trips: 2,444,393 (70.34%)
  Trips with Tips: 2,300,221
  Average Tip %: inf%
  Median Tip %: 27.04%

üíµ Revenue per Mile:
  Mean: $13.70
  Median: $7.10

‚è∞ Revenue per Minute:
  Mean: $3.24
  Median: $1.15

üí° Tip Analysis (Credit Card Payments Only):
  Credit Card Trips: 2,444,393 (70.34%)
  Trips with Tips: 2,300,221
  Average Tip %: inf%
  Median Tip %: 27.04%

üíµ Revenue per Mile:
  Mean: $13.70
  Median: $7.10

‚è∞ Revenue per Minute:
  Mean: $3.24
  Median: $1.15

‚úàÔ∏è  Airport Trips:
  Count: 103,042 (2.97%)
  Average Fare: $64.94
  Average Distance: 16.94 miles

‚úàÔ∏è  Airport Trips:
  Count: 103,042 (2.97%)
  Average Fare: $64.94
  Average Distance: 16.94 miles


In [13]:

# 8. DATA QUALITY SCORE CALCULATION

print("\n\n‚≠ê DATA QUALITY SCORE")
print("=" * 80)

quality_metrics = {
    'Completeness': 100 - (df.isna().sum().sum() / (len(df) * len(df.columns)) * 100),
    'Uniqueness': 100 - (duplicate_count / len(df) * 100),
    'Validity': 100 - (total_anomalies / len(df) * 100),
    'Consistency': 100 - (anomalies['total_mismatch'] / len(df) * 100),
}

print("\nQuality Dimensions:")
for metric, score in quality_metrics.items():
    status = "‚úÖ" if score >= 95 else "‚ö†Ô∏è " if score >= 90 else "‚ùå"
    print(f"  {status} {metric}: {score:.2f}%")

overall_quality = np.mean(list(quality_metrics.values()))
print(f"\nüéØ OVERALL DATA QUALITY SCORE: {overall_quality:.2f}%")

if overall_quality >= 95:
    print("   ‚úÖ Excellent - Data is ready for analysis")
elif overall_quality >= 90:
    print("   ‚ö†Ô∏è  Good - Minor cleaning recommended")
else:
    print("   ‚ùå Fair - Significant cleaning required")



‚≠ê DATA QUALITY SCORE

Quality Dimensions:
  ‚úÖ Completeness: 96.78%
  ‚úÖ Uniqueness: 100.00%
  ‚ùå Validity: 18.60%
  ‚ùå Consistency: 26.22%

üéØ OVERALL DATA QUALITY SCORE: 60.40%
   ‚ùå Fair - Significant cleaning required

Quality Dimensions:
  ‚úÖ Completeness: 96.78%
  ‚úÖ Uniqueness: 100.00%
  ‚ùå Validity: 18.60%
  ‚ùå Consistency: 26.22%

üéØ OVERALL DATA QUALITY SCORE: 60.40%
   ‚ùå Fair - Significant cleaning required


In [14]:
# 9. RECOMMENDATIONS & ACTION ITEMS

print("\n\nüìù RECOMMENDATIONS & ACTION ITEMS")
print("=" * 80)

recommendations = []

if anomalies['zero_distance'] > len(df) * 0.01:  # > 1%
    recommendations.append(f"‚Ä¢ Remove/investigate {anomalies['zero_distance']:,} trips with zero distance")

if anomalies['negative_fare'] > 0:
    recommendations.append(f"‚Ä¢ Remove {anomalies['negative_fare']:,} trips with negative fares")

if anomalies['extreme_duration'] > len(df) * 0.001:  # > 0.1%
    recommendations.append(f"‚Ä¢ Cap/investigate {anomalies['extreme_duration']:,} trips with duration > 3 hours")

if anomalies['total_mismatch'] > len(df) * 0.05:  # > 5%
    recommendations.append(f"‚Ä¢ Investigate {anomalies['total_mismatch']:,} total amount calculation mismatches")

if duplicate_count > 0:
    recommendations.append(f"‚Ä¢ Remove {duplicate_count:,} duplicate records")

if len(recommendations) == 0:
    print("\n‚úÖ No critical issues found! Data is in good shape for analysis.")
else:
    print("\nAction Items:")
    for rec in recommendations:
        print(rec)

print("\n" + "=" * 80)
print("‚úÖ DATA QUALITY ASSESSMENT COMPLETE")
print("=" * 80)



üìù RECOMMENDATIONS & ACTION ITEMS

Action Items:
‚Ä¢ Remove/investigate 90,893 trips with zero distance
‚Ä¢ Remove 145,516 trips with negative fares
‚Ä¢ Investigate 2,564,189 total amount calculation mismatches

‚úÖ DATA QUALITY ASSESSMENT COMPLETE


In [15]:
# 10. DETAILED ANOMALY INVESTIGATION

print("\n\nüî¨ DETAILED ANOMALY INVESTIGATION")
print("=" * 80)

# 1. ZERO DISTANCE TRIPS
print("\n1Ô∏è‚É£ ZERO DISTANCE TRIPS ANALYSIS")
print("-" * 80)

zero_distance_trips = df[df['trip_distance'] == 0].copy()
print(f"\nTotal Zero Distance Trips: {len(zero_distance_trips):,}")

print("\nüìä Sample of Zero Distance Trips (First 20):")
columns_to_show = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID',
                   'trip_distance', 'fare_amount', 'total_amount', 'payment_type', 
                   'trip_duration_minutes']
print(zero_distance_trips[columns_to_show].head(20).to_string(index=False))

print("\nüìà Zero Distance Trip Statistics:")
print(f"  Average Fare: ${zero_distance_trips['fare_amount'].mean():.2f}")
print(f"  Median Fare: ${zero_distance_trips['fare_amount'].median():.2f}")
print(f"  Average Total: ${zero_distance_trips['total_amount'].mean():.2f}")
print(f"  Average Duration: {zero_distance_trips['trip_duration_minutes'].mean():.2f} minutes")

print("\nüí≥ Payment Type Distribution (Zero Distance):")
zero_payment_dist = zero_distance_trips['payment_type'].value_counts()
payment_labels = {1: 'Credit Card', 2: 'Cash', 3: 'No Charge', 4: 'Dispute', 5: 'Unknown', 6: 'Voided'}
for payment_id, count in zero_payment_dist.items():
    label = payment_labels.get(payment_id, f'Unknown ({payment_id})')
    pct = (count / len(zero_distance_trips)) * 100
    print(f"  {label}: {count:,} ({pct:.2f}%)")

print("\nüîç Pickup/Dropoff Pattern (Same location?):")
zero_distance_trips['same_location'] = zero_distance_trips['PULocationID'] == zero_distance_trips['DOLocationID']
same_loc_count = zero_distance_trips['same_location'].sum()
print(f"  Trips with Same Pickup/Dropoff: {same_loc_count:,} ({(same_loc_count/len(zero_distance_trips)*100):.2f}%)")

# 2. NEGATIVE FARE TRIPS
print("\n\n2Ô∏è‚É£ NEGATIVE/ZERO FARE TRIPS ANALYSIS")
print("-" * 80)

negative_fare_trips = df[df['fare_amount'] <= 0].copy()
print(f"\nTotal Negative/Zero Fare Trips: {len(negative_fare_trips):,}")

print("\nüìä Sample of Negative/Zero Fare Trips (First 20):")
print(negative_fare_trips[columns_to_show].head(20).to_string(index=False))

print("\nüìà Negative Fare Trip Statistics:")
print(f"  Min Fare: ${negative_fare_trips['fare_amount'].min():.2f}")
print(f"  Max Fare: ${negative_fare_trips['fare_amount'].max():.2f}")
print(f"  Average Distance: {negative_fare_trips['trip_distance'].mean():.2f} miles")
print(f"  Average Duration: {negative_fare_trips['trip_duration_minutes'].mean():.2f} minutes")
print(f"  Average Total Amount: ${negative_fare_trips['total_amount'].mean():.2f}")

print("\nüí≥ Payment Type Distribution (Negative Fares):")
neg_payment_dist = negative_fare_trips['payment_type'].value_counts()
for payment_id, count in neg_payment_dist.items():
    label = payment_labels.get(payment_id, f'Unknown ({payment_id})')
    pct = (count / len(negative_fare_trips)) * 100
    print(f"  {label}: {count:,} ({pct:.2f}%)")

print("\nüöï Rate Code Distribution (Negative Fares):")
rate_labels = {1: 'Standard', 2: 'JFK', 3: 'Newark', 4: 'Nassau/Westchester', 5: 'Negotiated', 6: 'Group Ride'}
neg_rate_dist = negative_fare_trips['RatecodeID'].value_counts()
for rate_id, count in neg_rate_dist.items():
    label = rate_labels.get(rate_id, f'Unknown ({rate_id})')
    pct = (count / len(negative_fare_trips)) * 100
    print(f"  {label}: {count:,} ({pct:.2f}%)")

# 3. OVERLAP ANALYSIS
print("\n\n3Ô∏è‚É£ OVERLAP ANALYSIS")
print("-" * 80)

zero_and_negative = df[(df['trip_distance'] == 0) & (df['fare_amount'] <= 0)]
print(f"\nTrips with BOTH zero distance AND negative/zero fare: {len(zero_and_negative):,}")

if len(zero_and_negative) > 0:
    print("\nüìä Sample of Overlapping Issues (First 10):")
    print(zero_and_negative[columns_to_show].head(10).to_string(index=False))

# 4. EXPORT ANOMALIES TO CSV FOR FURTHER INVESTIGATION
print("\n\nüíæ EXPORTING ANOMALIES FOR DETAILED REVIEW")
print("-" * 80)

output_dir = '/Users/yash/Documents/Projects/NYC_Yellow_Taxi_Analytics/outputs/'
os.makedirs(output_dir, exist_ok=True)

# Export zero distance trips
zero_distance_trips.to_csv(f'{output_dir}zero_distance_trips.csv', index=False)
print(f"‚úÖ Exported {len(zero_distance_trips):,} zero distance trips to: {output_dir}zero_distance_trips.csv")

# Export negative fare trips
negative_fare_trips.to_csv(f'{output_dir}negative_fare_trips.csv', index=False)
print(f"‚úÖ Exported {len(negative_fare_trips):,} negative fare trips to: {output_dir}negative_fare_trips.csv")

# Export overlap trips
if len(zero_and_negative) > 0:
    zero_and_negative.to_csv(f'{output_dir}zero_distance_and_negative_fare.csv', index=False)
    print(f"‚úÖ Exported {len(zero_and_negative):,} overlapping anomalies to: {output_dir}zero_distance_and_negative_fare.csv")

print("\n" + "=" * 80)
print("‚úÖ ANOMALY INVESTIGATION COMPLETE")
print("=" * 80)



üî¨ DETAILED ANOMALY INVESTIGATION

1Ô∏è‚É£ ZERO DISTANCE TRIPS ANALYSIS
--------------------------------------------------------------------------------

Total Zero Distance Trips: 90,893

üìä Sample of Zero Distance Trips (First 20):
tpep_pickup_datetime tpep_dropoff_datetime  PULocationID  DOLocationID  trip_distance  fare_amount  total_amount  payment_type  trip_duration_minutes
 2025-01-01 00:49:48   2025-01-01 00:49:48            87           264           0.00        20.06         20.06             2                   0.00
 2025-01-01 00:37:43   2025-01-01 00:37:53           148           148           0.00        12.00         17.50             1                   0.17
 2025-01-01 00:57:08   2025-01-01 00:57:16           141           141           0.00        30.00         33.50             1                   0.13
 2025-01-01 00:27:40   2025-01-01 00:59:30           168            76           0.00        50.50         58.94             1                  31.83
 2025-01-0