In [None]:
pip install numpy pandas seaborn matplotlib pillow

In [None]:
pip list

## Analyze data
We will analyze taxi ride data to exercise our skills and extract valuable insights.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

File Path

In [None]:
# Location of the trips CSV. The cell that reads it prepends os.getcwd(), so this
# path is relative to the notebook's working directory despite the leading slash.
trips_path = '/datasets/taxi_trip_data.csv'

Reading the CSV File:

In [None]:
import os

# trips_path starts with a slash, so appending it to the working directory
# yields the full location of the CSV file.
trips_csv = os.getcwd() + trips_path
trips = pd.read_csv(trips_csv)

In [None]:
from IPython.display import display

In [None]:
# Preview the first five rows to check that the CSV loaded as expected.
display(trips.head())

In [None]:
# Five randomly chosen rows; no random_state is passed, so the selection
# differs on every run.
display(trips.sample(5))

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
trips.info()

In [None]:
# Render floats with two decimals. This is a global pandas display option, so it
# also applies to every DataFrame shown after this cell.
pd.options.display.float_format = '{:.2f}'.format
# Summary statistics of the columns.
display(trips.describe())

##### Data preparation and cleaning
We will begin by removing duplicates and checking how many there were in the initial dataset.

In [None]:
# Remember the row count before de-duplication so the share of removed rows can be reported.
with_dups = trips.shape[0]
# Rebind the name to the de-duplicated frame rather than mutating it in place.
trips = trips.drop_duplicates()

print(f"{round((with_dups-trips.shape[0])/with_dups*100, 2)}% of rows were duplicates")
print(f"duplicates dropped with {trips.shape[0]} rows remaining")

Now we count the missing values; luckily, there are none this time.

In [None]:
# Number of missing values in each column.
trips.isna().sum()

In [None]:
#Some missing values can be zeroes so let's check this too

def zero_cnt(df):
    """Print, for every column of ``df``, how many values are zero or empty.

    Text columns (object or string dtype) are compared with the empty string;
    every other column is compared with ``0``.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. A header and one line per column (name, count, percentage of
        rows) are printed.
    """
    n_rows = df.shape[0]
    print("Count of zeroes and empty strings:")
    print("column                   count     percentage")
    for col in df.columns:
        values = df[col]
        # Dedicated string dtypes are text as well, not only 'object' columns.
        is_text = pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)
        # Summing the boolean mask counts the matches without copying the matching rows.
        cnt = int((values == ('' if is_text else 0)).sum())
        # An empty frame has no rows to take a share of; report 0 instead of dividing by zero.
        pct = round(cnt / n_rows * 100, 2) if n_rows else 0.0
        print(f"{col:<25}{cnt:<10}{pct:<5}")
        
# Report how many zeroes / empty strings each column of the trips data contains.
zero_cnt(trips)

The percentage of records with an invalid (zero) trip distance or fare amount is relatively low, so we can drop them without worrying too much.

In [None]:
# Keep only rides whose distance and fare are both non-zero, then print the
# zero report again to confirm those two columns are clean.
has_distance = trips['trip_distance'] != 0
has_fare = trips['fare_amount'] != 0
trips = trips[has_distance & has_fare]
zero_cnt(trips)

The last things left to do are converting the required columns to datetime format, dropping records that, for example, ended in a dispute over the charge, selecting only the columns that are relevant for our purposes, and finally seeing how many records are left for us to work with.

In [None]:
# Parse both timestamp columns into pandas datetimes.
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    trips[ts_col] = pd.to_datetime(trips[ts_col])

In [None]:
# Keep payment types 1, 2 and 3 only, then restrict to rides picked up in 2018.
trips = trips[trips['payment_type'].isin([1, 2, 3])]
picked_up_2018 = trips['pickup_datetime'].dt.year == 2018
trips = trips[picked_up_2018]
# Number of rows left after both filters.
len(trips)

In [None]:
# Columns needed for the rest of the analysis; everything else is dropped.
relevant_columns = [
    'pickup_datetime',
    'dropoff_datetime',
    'passenger_count',
    'trip_distance',
    'rate_code',
    'fare_amount',
    'pickup_location_id',
    'dropoff_location_id',
]
trips = trips.loc[:, relevant_columns]

### Explanation:
trips.loc[:, column_list]

.loc[]: Used for selecting data by labels.

: (before the comma) selects all rows.

column_list (after the comma) selects only the specified columns.

The selected columns are:

pickup_datetime → Time when the ride started.

dropoff_datetime → Time when the ride ended.

passenger_count → Number of passengers in the ride.

trip_distance → Distance traveled during the ride.

rate_code → Code indicating the fare type (e.g., standard, airport ride, etc.).

fare_amount → Total fare charged for the ride.

pickup_location_id → Location ID where the trip started.

dropoff_location_id → Location ID where the trip ended.

### Data analysis and visualization
So, before making any visualizations, we make our life easier by adding separate columns with the information we will need later on.

In [None]:
# Helper columns used by the charts below.
trips['day_of_week'] = trips['pickup_datetime'].dt.dayofweek  # 0 = Monday ... 6 = Sunday
trips['hour'] = trips['pickup_datetime'].dt.hour
# total_seconds() returns plain floats on every pandas version. On pandas >= 2.0,
# .astype('timedelta64[s]') keeps a timedelta dtype, so dividing by 3600 would not
# give float hours and the == 0 check below would never match.
trips['duration_hours'] = (trips['dropoff_datetime'] - trips['pickup_datetime']).dt.total_seconds() / 3600
# Zero-length rides become NaN so they cannot cause a division by zero when speed is computed.
trips.loc[trips['duration_hours'] == 0, 'duration_hours'] = np.nan
# Rows without a pickup or dropoff timestamp cannot be analysed.
trips = trips.dropna(subset=['pickup_datetime', 'dropoff_datetime'])

In [None]:
# Same dropna as the last line of the previous cell; it removes nothing further
# once that cell has run, and drops rows lacking either timestamp otherwise.
trips = trips.dropna(subset=['pickup_datetime', 'dropoff_datetime'])

In [None]:
# Ride length as a timedelta, converted to fractional hours.
trips['duration_hours'] = (trips['dropoff_datetime'] - trips['pickup_datetime']).dt.total_seconds() / 3600
# Recomputing the column discards the zero -> NaN replacement made when it was first
# created, so apply it again; otherwise zero-length rides yield infinite speeds.
trips.loc[trips['duration_hours'] == 0, 'duration_hours'] = np.nan

In [None]:
# Average speed of each ride: distance travelled divided by ride duration in hours.
trips['speed_mph'] = trips['trip_distance'].div(trips['duration_hours'])

And now let's make our first chart: a line plot of the taxis' average speed by hour. We can immediately see significant drops in average speed corresponding to rush hours. They are especially pronounced for rate codes 2 to 4, which correspond to areas outside the city center such as JFK airport or Nassau County.

These drops would be more pronounced if we used only workday data, but we will go over that later.

In [None]:
# Mean speed per (hour, rate code), using only rides longer than 0.01 h and rate codes below 5.
long_enough = trips['duration_hours'] > 0.01
low_rate_code = trips['rate_code'] < 5
avg_speed_by_h = (
    trips[long_enough & low_rate_code]
    .groupby(by=['hour', 'rate_code'], as_index=False)
    .agg(avg_speed=('speed_mph', 'mean'))
)
# One line per rate code.
sns.lineplot(data=avg_speed_by_h, x='hour', y='avg_speed', hue='rate_code');

Okay, now we will see differences in demand for taxi in different times of the day and how it changes between workdays and weekends.

It is important to remember that we shouldn't compare absolute values, because our bar chart shows counts of rides and a week has only two weekend days compared to five workdays.

So what can we see on this chart?

We can see that the hours of lowest demand differ between workdays and weekends: on workdays the highest number of people are sleeping (presumably) around 2-3 a.m., while on weekends it is around 5 a.m.

Also weekends have their own peaks of taxi demand: late afternoon to evening when people go out and around midnight when they return home

In [None]:
# Split rides by pickup day: day_of_week values below 5 are workdays, the rest the weekend.
workday_trips = trips.loc[trips['day_of_week'] < 5]
weekend_trips = trips.loc[trips['day_of_week'] >= 5]

fig, axes = plt.subplots(figsize=(15, 6), nrows=1, ncols=2)

# Both panels share the same styling, so they are drawn in one loop.
panels = zip(axes, (workday_trips, weekend_trips), ('Workdays', 'Weekends'))
for ax, panel_trips, panel_title in panels:
    sns.countplot(ax=ax, x=panel_trips['hour'])
    ax.set_title(panel_title, fontsize=18)
    ax.set_xlabel('hour', fontsize=15)
    ax.set_ylabel('number of pickups', fontsize=15)

fig.suptitle('Number of pickups on workdays and weekends', fontsize=20)
# Leave room above the panels for the figure title.
plt.subplots_adjust(top=0.85)
# plt.show has to be called; the bare attribute reference `plt.show;` did nothing.
plt.show()

Third and final chart before the grand finale is a heatmap of number of taxi rides by hour and day of week. I invite you to go through the code first and we will discuss results later.

In [None]:
# Count rides for every (day of week, hour) pair and turn the grouped Series
# back into a flat DataFrame.
dow_h_count = (
    trips.groupby(['day_of_week', 'hour'])['pickup_datetime']
    .count()
    .reset_index()
)
# The count column is still called pickup_datetime at this point; give it a clearer name.
dow_h_count.columns = ['day_of_week', 'hour', 'count']
dow_h_count.head()

This groups the trips DataFrame by day of the week (day_of_week) and hour (hour).

It then counts the number of trips (pickup_datetime.count()) in each group.

The result is a Pandas Series, where the index is ['day_of_week', 'hour'] and the values are the trip counts.

Since counting a single grouped column returns a Series, reset_index() converts it into a proper DataFrame with day_of_week and hour as separate columns.

After reset_index() the column holding the counts is still named pickup_datetime, which is misleading.

This explicitly renames them as:

day_of_week → Day of the week (0=Monday, 6=Sunday)

hour → Hour of the day (0-23)

count → Number of trips in that day-hour combination.

Finally, head() displays the first five rows of the result for inspection.


In [None]:
# Reshape to a grid with one row per day of week and one column per hour;
# each cell holds the ride count for that combination.
dow_h_count_piv = dow_h_count.pivot(
    index='day_of_week',
    columns='hour',
    values='count',
)
dow_h_count_piv.head()

In [None]:
# Replace the numeric weekday labels (0 = Monday ... 6 = Sunday) with names.
# rename() maps label by label, so it also works when a weekday has no rides
# (assigning a fixed 7-element index would raise a length mismatch) and leaves
# labels that are already names untouched if this cell is run again.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_h_count_piv = dow_h_count_piv.rename(index=dict(enumerate(day_names)))
dow_h_count_piv.index.name = 'day_of_week'

In [None]:
# Heatmap of ride counts: rows are days of the week, columns are hours.
heat_fig, heat_ax = plt.subplots(figsize=(15, 6))
sns.heatmap(data=dow_h_count_piv, cmap='viridis', square=True, ax=heat_ax);

Here it is: the heatmap that once again confirms what we saw in the other charts (for example, the different low-demand periods on weekends), while adding something new as well.

For example, we see that afternoon-evening weekend demand from previous bar chart mostly comes from Sunday as shown by corresponding horizontal stripe.

We also see vertical stripes, such as 9-10 p.m. (21:00-22:00), when people tend to get home from various social activities (dinner parties, dates). Although it may seem odd at first that the highest overall demand for rides is on Thursday rather than Friday, it is explained by the fact that on Thursdays people need to maintain their sleeping schedule because they have work the next day. On Fridays, on the other hand, they prefer getting a cab later, with that demand "spread more thinly" over the after-midnight hours of Saturday, as can be seen on the heatmap.

### Animated choropleth map!
The author had a great time doing this part and takes a certain pride in the outcome.

Short explanation of what happens here:

- A few additional imports
- Dropping rides from airports (for the purposes of map readability)
- Making an image for every hour of the day
- Combining these images into a GIF

In [None]:
!pip install geopandas imageio shapely

In [None]:
import geopandas as gpd
import imageio
from shapely import wkt

In [None]:
# Taxi zone table, located relative to the working directory like the trips CSV.
zones_path = "/datasets/taxi_zone_geo.csv"
zones = pd.read_csv(os.getcwd() + zones_path)

In [None]:
# Parse each zone's WKT text into a shapely geometry object.
zones['zone_geom'] = zones['zone_geom'].map(wkt.loads)

In [None]:
# Flag zones whose name mentions an airport (three capitalisations are covered;
# missing names count as "not an airport").
airports_mask = zones['zone_name'].str.contains(
    'Airport|airport|AIRPORT',
    regex=True,
    na=False,
)
# Everything except the airport zones.
zones_no_airports = zones.loc[~airports_mask]
# Wrap in a GeoDataFrame, using the parsed zone_geom column as geometry, and draw the zones.
zones_no_airports_geo = gpd.GeoDataFrame(zones_no_airports, geometry='zone_geom')
zones_no_airports_geo.plot();