# Taxi Trips Analysis Project

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

## 1: Introduction

In [None]:
# Read the May 2022 yellow-taxi trip records into `df` (fastparquet backend).
url = '/workspaces/AnyoneAI/Proyecto_Final/yellow_tripdata_2022-05.parquet'
df = pd.read_parquet(path=url, engine='fastparquet')

In [None]:
# Preview the first 5 rows of the trips table.
df.head(n=5)

In [None]:
# Report the (rows, columns) shape, then the dtype of every column.
print(df.shape, df.dtypes, sep='\n')

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Descriptive statistics of the numerical columns, one row per column.
df.describe().transpose()

------

In [None]:
# Rows where at least one of the checked monetary columns is negative.
df[df[['tip_amount', 'fare_amount', 'extra', 'mta_tax']].lt(0).any(axis=1)]

There are 20,692 records with a negative value in at least one of `tip_amount`, `fare_amount`, `extra` or `mta_tax`.

--------

`Total_amount:` The total amount charged to passengers. Does not include cash tips.

Can't be negative or extremely high.

In [None]:
# Frequency of each distinct total amount, rarest values first.
df.total_amount.value_counts().sort_values(ascending=True)

In [None]:
# Histogram of 'total_amount' in 10-wide bins with edges from 10 to 150.
# Amounts outside that range fall in no bin and are therefore not drawn.
plt.hist(df['total_amount'], bins=np.arange(10, 160, 10), edgecolor='black')
plt.xlabel("Total Amount")
plt.ylabel("Frequency")
plt.title("Histogram of Total Amount")
plt.show()

`Congestion_Surcharge:` Total amount collected in trip for NYS congestion surcharge.

The rate depends on the type of vehicle used to provide transportation in or through the congestion zone. The surcharge is generally:

- \$2.75 for each for-hire transportation trip in a vehicle that is not a medallion taxicab or a pool vehicle.
- \$2.50 per trip when the transportation is provided by a medallion taxicab vehicle.
- \$0.75 per pool trip.

This amount is required to be passed through to passengers and reported separately on any receipt given to the passenger.  

In [None]:
# Distinct congestion-surcharge values and how often each occurs.
df.congestion_surcharge.value_counts()

`Airport_fee:` $1.25 for pick up only at LaGuardia and John F. Kennedy Airports.

So it should be either \$1.25 or \$0; any other value should be treated as an error.

In [None]:
# Distinct airport-fee values and how often each occurs.
df.airport_fee.value_counts()

``Improvement_surcharge:`` \$0.30 improvement surcharge assessed on trips at the flag drop. The improvement surcharge began being levied in 2015.

So it should be either 0.30 or 0; any other value should be treated as an error.

In [None]:
# Distinct improvement-surcharge values and how often each occurs.
df.improvement_surcharge.value_counts()

``MTA_tax:`` $0.50 MTA tax that is automatically triggered based on the metered rate in use.

Should be either 0 or 0.5

In [None]:
# Distinct MTA-tax values and how often each occurs.
df.mta_tax.value_counts()

``Extra:`` Miscellaneous extras and surcharges. Currently, this only includes the \$0.50 and \$1 rush hour and overnight charges.

Should be either 0.5, 1 or 0

In [None]:
# Distinct 'extra' charge values and how often each occurs.
df.extra.value_counts()

`Fare_amount:` The time-and-distance fare calculated by the meter.

The fare can vary, but it can't be less than 0 or astronomically high.


In [None]:
# Fares in ascending order, so both extremes are visible in the truncated display.
df.fare_amount.sort_values(ascending=True)

`Tolls_amount:` Total amount of all tolls paid in trip.

Can't be negative

In [None]:
# Tolls in ascending order, so both extremes are visible in the truncated display.
df.tolls_amount.sort_values(ascending=True)

`Tip_amount:` This field is automatically populated for credit card tips. Cash tips are not included.

Can't be negative

In [None]:
# Tips in ascending order, so both extremes are visible in the truncated display.
df.tip_amount.sort_values(ascending=True)

`Payment_type:` A numeric code signifying how the passenger paid for the trip. 

- 1= Credit card
- 2= Cash
- 3= No charge
- 4= Dispute
- 5= Unknown
- 6= Voided trip

In [None]:
# Number of trips recorded under each payment-type code.
df['payment_type'].value_counts()

In [None]:
# Bar chart of trips per payment type.
# Each bar is labelled from the payment code it actually represents rather
# than from its position on the axis: positional labels are wrong as soon as
# the data contains a code outside 1-5 or is missing one of them.
payment_labels = {
    1: 'Credit Card',
    2: 'Cash',
    3: 'No Charge',
    4: 'Dispute',
    5: 'Unknown',
    6: 'Voided trip',
}
payment_codes = sorted(df['payment_type'].dropna().unique())

fig, ax = plt.subplots(figsize=(28, 5))
sns.countplot(data=df, y='payment_type', order=payment_codes, orient='h', ax=ax)
ax.set_yticks(range(len(payment_codes)))
# Codes that are not in the data dictionary stay visible as "Other (<code>)".
ax.set_yticklabels(
    [payment_labels.get(code, f'Other ({code})') for code in payment_codes]
)
ax.set_ylabel('Payment Type')
ax.set_xlabel(None)
plt.show()

`Trip_distance:` The elapsed trip distance in miles reported by the taximeter.

If measured correctly, it shouldn't be equal to 0 or astronomically high.

It can also look implausible when compared with the trip's start and end times (for example, a 1-hour trip covering 200 miles).


In [None]:
# Trip distances in ascending order, so both extremes are visible.
df['trip_distance'].sort_values(ascending=True)

In [None]:
# Trips reporting more than 1000 miles.
df.loc[df['trip_distance'].gt(1000)]

In [None]:
# Show the rows whose recorded trip distance is exactly 0.
df.loc[df['trip_distance'].eq(0)]

`Store_and_fwd_flag:` This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka “store and forward,” because the vehicle did not have a connection to the server.

- Y= store and forward trip
- N= not a store and forward trip

In [None]:
# Number of trips per store-and-forward flag value.
df['store_and_fwd_flag'].value_counts()

In [None]:
# Horizontal bar chart of trips per store-and-forward flag, on a wide, short canvas.
fig, ax = plt.subplots(figsize=(20, 2))
sns.countplot(data=df, y='store_and_fwd_flag', orient='h', ax=ax)
ax.set_title('Store and Fwd Flag')
ax.set_xlabel(None)
ax.set_ylabel(None)
plt.show()

`RateCodeID:` The final rate code in effect at the end of the trip.

1. Standard rate
2. JFK
3. Newark
4. Nassau or Westchester
5. Negotiated fare
6. Group ride

In [None]:
# Number of trips per final rate code.
df['RatecodeID'].value_counts()

The value 99 is not one of the rate codes listed in the description above.

`Passenger_count:` The number of passengers in the vehicle. This is a driver-entered value.

A value of 0 passengers is considered a mistake.

In [None]:
# Number of trips per driver-entered passenger count.
df.passenger_count.value_counts()

`VendorID:` A code indicating the TPEP provider that provided the record. 

- 1= Creative Mobile Technologies, LLC 
- 2= VeriFone Inc.

We only have two options, therefore, any other number is considered an error.

In [None]:
# Number of trips per vendor code.
df.VendorID.value_counts()

In [None]:
# Bar chart of trips per vendor code.
# seaborn already labels each bar with the VendorID found in the data.
# Forcing the ticks to ['1', '2'] would drop the label of any unexpected
# vendor code, which is exactly what this check is meant to reveal.
fig, ax = plt.subplots()
sns.countplot(data=df, x='VendorID', ax=ax)
ax.set_title('Vendor ID')
plt.show()

------------

Negative to non-negative

In [None]:
import folium
from folium.plugins import HeatMap

# Builds a folium map of the ten most frequent pickup and dropoff coordinates,
# drawn both as heatmap layers and as individual markers.
#
# NOTE(review): the file name below looks like a template placeholder rather
# than a real path; the trips were already loaded into `df` at the top of the
# notebook — confirm which source is intended.
# NOTE(review): this cell relies on `pickup_latitude` / `pickup_longitude` /
# `dropoff_latitude` / `dropoff_longitude` columns. No other cell in this
# notebook uses them — confirm they exist in the dataset before running.

# Load the Yellow Taxi Trip Records dataset
df_taxi = pd.read_parquet("path_to_yellow_taxi_dataset.parquet")

# Group by pickup and dropoff locations and count the number of occurrences
pickup_counts = df_taxi.groupby(["pickup_latitude", "pickup_longitude"]).size().reset_index(name="count")
dropoff_counts = df_taxi.groupby(["dropoff_latitude", "dropoff_longitude"]).size().reset_index(name="count")

# Get the most frequent pickup and dropoff locations
top_pickup_locations = pickup_counts.nlargest(10, "count")
top_dropoff_locations = dropoff_counts.nlargest(10, "count")

# Create a map centered around New York City
m = folium.Map(location=[40.7128, -74.0060], zoom_start=12)

# Add heatmaps for pickup and dropoff locations
# Each heatmap row is [latitude, longitude, count]; only the top-10 rows
# selected above are passed in, not the full set of locations.
pickup_heatmap_data = top_pickup_locations[["pickup_latitude", "pickup_longitude", "count"]].values
dropoff_heatmap_data = top_dropoff_locations[["dropoff_latitude", "dropoff_longitude", "count"]].values

HeatMap(pickup_heatmap_data, radius=15).add_to(m)
HeatMap(dropoff_heatmap_data, radius=15).add_to(m)

# Add markers for the most frequent pickup and dropoff locations
# (blue = pickup, red = dropoff; the popup shows the occurrence count)
for index, row in top_pickup_locations.iterrows():
    folium.Marker(
        location=[row["pickup_latitude"], row["pickup_longitude"]],
        popup=f"Pickup Count: {row['count']}",
        icon=folium.Icon(color="blue")
    ).add_to(m)

for index, row in top_dropoff_locations.iterrows():
    folium.Marker(
        location=[row["dropoff_latitude"], row["dropoff_longitude"]],
        popup=f"Dropoff Count: {row['count']}",
        icon=folium.Icon(color="red")
    ).add_to(m)

# Display the map
m

In [2]:
import geopandas as gpd

# Read the zipped taxi-zone shapefile into a GeoDataFrame
url_location = '/workspaces/AnyoneAI/Proyecto_Final/taxi_zones.zip'
gdf = gpd.read_file(url_location)

# Display the GeoDataFrame (the notebook shows a truncated view of all rows)
gdf

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.918 192536.086, 933091.011 19..."
1,2,0.433470,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((1033269.244 172126.008, 103343..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.770 256767.698, 1026495.593 ..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.467 203714.076, 992068.667 20..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((935843.310 144283.336, 936046.565 14..."
...,...,...,...,...,...,...,...
258,259,0.126750,0.000395,Woodlawn/Wakefield,259,Bronx,"POLYGON ((1025414.782 270986.139, 1025138.624 ..."
259,260,0.133514,0.000422,Woodside,260,Queens,"POLYGON ((1011466.966 216463.005, 1011545.889 ..."
260,261,0.027120,0.000034,World Trade Center,261,Manhattan,"POLYGON ((980555.204 196138.486, 980570.792 19..."
261,262,0.049064,0.000122,Yorkville East,262,Manhattan,"MULTIPOLYGON (((999804.795 224498.527, 999824...."


In [3]:
# Summarise the zone table: column names, non-null counts and dtypes.
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   OBJECTID    263 non-null    int64   
 1   Shape_Leng  263 non-null    float64 
 2   Shape_Area  263 non-null    float64 
 3   zone        263 non-null    object  
 4   LocationID  263 non-null    int64   
 5   borough     263 non-null    object  
 6   geometry    263 non-null    geometry
dtypes: float64(2), geometry(1), int64(2), object(2)
memory usage: 14.5+ KB


In [None]:
import plotly.express as px

# Map the median trip distance of each pickup zone.
#
# The zone table has no `location_id`, `latitude` or `longitude` columns (its
# key is `LocationID` and it only carries polygon geometry), so the trips are
# first summarised per zone and each zone is then placed at its centroid.
# Aggregating before the merge also avoids copying every polygon once per trip.
#
# NOTE(review): assumes the trips frame stores the pickup zone in a
# `PULocationID` column — confirm against df.dtypes.
zone_stats = (
    df.groupby('PULocationID', as_index=False)
      .agg(trip_count=('trip_distance', 'size'),
           trip_distance=('trip_distance', 'median'))
)

# Compute centroids in the shapefile's own projected CRS, then convert the
# points to WGS84 so they can be used as latitude/longitude on the map.
centroids = gdf.geometry.centroid.to_crs(epsg=4326)
zones = gdf.assign(latitude=centroids.y, longitude=centroids.x)

# Merge the per-zone statistics onto the zone table
merged_df = zones.merge(zone_stats, left_on='LocationID', right_on='PULocationID', how='left')

# Plot only the zones that have trips; the geometry column is not needed
# for a point map.
plot_df = merged_df.drop(columns='geometry').dropna(subset=['trip_distance'])

# Create a Plotly scatter mapbox plot
fig = px.scatter_mapbox(plot_df,
                        lat="latitude",
                        lon="longitude",
                        color="trip_distance",
                        hover_name="zone",
                        hover_data=["LocationID", "borough", "trip_count"],
                        labels={"trip_distance": "Median trip distance (mi)"},
                        mapbox_style="open-street-map",
                        zoom=10)

# Display the plot
fig.show()

In [None]:
import plotly.express as px
import pandas as pd

# Plots trips on an OpenStreetMap background, coloured by trip distance.
#
# NOTE(review): this cell rebinds `df` from a CSV under /workspaces/FinalProject,
# whereas the dataset loaded at the top of the notebook is a parquet file under
# /workspaces/AnyoneAI/Proyecto_Final — confirm which source is intended, since
# every later use of `df` will see the CSV contents.

# Read the yellow_tripdata_2022-05 dataset into a pandas DataFrame
df = pd.read_csv('/workspaces/FinalProject/yellow_tripdata_2022-05.csv')

# Merge the DataFrame with the GeoDataFrame
# NOTE(review): the gdf.info() output lists the zone key as `LocationID`;
# there is no `location_id` column in gdf, so this merge key needs checking.
merged_df = gdf.merge(df, left_on='location_id', right_on='location_id', how='left')

# Create a Plotly scatter mapbox plot
# NOTE(review): `latitude` / `longitude` are not columns of gdf per its info()
# output — verify that the CSV provides them.
fig = px.scatter_mapbox(merged_df, 
                        lat="latitude", 
                        lon="longitude", 
                        color="trip_distance", 
                        hover_name="location_id",
                        mapbox_style="open-street-map",
                        zoom=10)

# Display the plot
fig.show()