In [1]:
# From your virtul environment, install the pandas package for working with data frames: 
# pip install pandas
# pandas is an open source Python library for data analysis
# A data frame allows us to read and access data stored in spreadsheet (.csv) format
# Good tutorial on pandas: http://pandas.pydata.org/pandas-docs/stable/10min.html
# Pandas Cookbook: http://pandas.pydata.org/pandas-docs/stable/tutorials.html
# Another good resource for pandas: http://chrisalbon.com/python/pandas_dataframe_importing_csv.htm

#Import the required packages
#Import package pandas for data analysis
import pandas as pd

# Import package numpy for numeric computing
import numpy as np

# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

# For saving multiple plots into a single pdf file
from matplotlib.backends.backend_pdf import PdfPages

In [3]:
# Reading from a csv file, into a data frame
# Can set parameters to remove white space from the beginning and end of column names
# Read more about .read_csv() here: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
df = pd.read_csv('/Users/jennifercasavantes/Downloads/2020_Yellow_Taxi_Trip_Data_20250523.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)
#How many rows should be displayed in full
pd.set_option('display.max_rows', 100)
# Show data frame first few rows
df.head(50)

  df = pd.read_csv('/Users/jennifercasavantes/Downloads/2020_Yellow_Taxi_Trip_Data_20250523.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,01/01/2020 12:28:15 AM,01/01/2020 12:33:03 AM,1.0,1.2,1.0,N,238,239,1.0,6.0,3.0,0.5,1.47,0.0,0.3,11.27,2.5
1,1.0,01/01/2020 12:35:39 AM,01/01/2020 12:43:04 AM,1.0,1.2,1.0,N,239,238,1.0,7.0,3.0,0.5,1.5,0.0,0.3,12.3,2.5
2,1.0,01/01/2020 12:47:41 AM,01/01/2020 12:53:52 AM,1.0,0.6,1.0,N,238,238,1.0,6.0,3.0,0.5,1.0,0.0,0.3,10.8,2.5
3,1.0,01/01/2020 12:55:23 AM,01/01/2020 01:00:14 AM,1.0,0.8,1.0,N,238,151,1.0,5.5,0.5,0.5,1.36,0.0,0.3,8.16,0.0
4,2.0,01/01/2020 12:01:58 AM,01/01/2020 12:04:16 AM,1.0,0.0,1.0,N,193,193,2.0,3.5,0.5,0.5,0.0,0.0,0.3,4.8,0.0
5,2.0,01/01/2020 12:09:44 AM,01/01/2020 12:10:37 AM,1.0,0.03,1.0,N,7,193,2.0,2.5,0.5,0.5,0.0,0.0,0.3,3.8,0.0
6,2.0,01/01/2020 12:39:25 AM,01/01/2020 12:39:29 AM,1.0,0.0,1.0,N,193,193,1.0,2.5,0.5,0.5,0.01,0.0,0.3,3.81,0.0
7,2.0,12/18/2019 03:27:49 PM,12/18/2019 03:28:59 PM,1.0,0.0,5.0,N,193,193,1.0,0.01,0.0,0.0,0.0,0.0,0.3,2.81,2.5
8,2.0,12/18/2019 03:30:35 PM,12/18/2019 03:31:35 PM,4.0,0.0,1.0,N,193,193,1.0,2.5,0.5,0.5,0.0,0.0,0.3,6.3,2.5
9,1.0,01/01/2020 12:29:01 AM,01/01/2020 12:40:28 AM,2.0,0.7,1.0,N,246,48,1.0,8.0,3.0,0.5,2.35,0.0,0.3,14.15,2.5


In [4]:
# Check how many rows and columns this dataframe has
df.shape

(24648499, 18)

In [5]:
# You can see below that some columns are wrongly set to continuous types, e.g., ID.
df.dtypes

VendorID                 float64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count          float64
trip_distance            float64
RatecodeID               float64
store_and_fwd_flag        object
PULocationID               int64
DOLocationID               int64
payment_type             float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
congestion_surcharge     float64
dtype: object

In [6]:
#another way to get a summary of columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24648499 entries, 0 to 24648498
Data columns (total 18 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   VendorID               float64
 1   tpep_pickup_datetime   object 
 2   tpep_dropoff_datetime  object 
 3   passenger_count        float64
 4   trip_distance          float64
 5   RatecodeID             float64
 6   store_and_fwd_flag     object 
 7   PULocationID           int64  
 8   DOLocationID           int64  
 9   payment_type           float64
 10  fare_amount            float64
 11  extra                  float64
 12  mta_tax                float64
 13  tip_amount             float64
 14  tolls_amount           float64
 15  improvement_surcharge  float64
 16  total_amount           float64
 17  congestion_surcharge   float64
dtypes: float64(13), int64(2), object(3)
memory usage: 3.3+ GB


In [7]:
# Look at column names. Some have spaces in or after the name, e.g., 'Insurance Type '.
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge'],
      dtype='object')

In [8]:
# Clean the column names to remove white space after the name or in the name
df.columns = df.columns.str.replace(' ', '')
#df.columns = df.columns.str.replace(' ', '_')

In [9]:
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge'],
      dtype='object')

In [10]:
# We compute them below.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VendorID,23838931.0,1.667327,0.47117,1.0,1.0,2.0,2.0,2.0
passenger_count,23838931.0,1.467986,1.112783,0.0,1.0,1.0,1.0,9.0
trip_distance,24648499.0,3.527061,325.035867,-30.62,0.99,1.65,3.0,350914.89
RatecodeID,23838931.0,1.048557,0.761086,1.0,1.0,1.0,1.0,99.0
PULocationID,24648499.0,163.969081,66.751541,1.0,114.0,162.0,234.0,265.0
DOLocationID,24648499.0,161.170975,70.956151,1.0,107.0,162.0,234.0,265.0
payment_type,23838931.0,1.280403,0.48439,1.0,1.0,1.0,2.0,5.0
fare_amount,24648499.0,12.667645,274.091443,-1259.0,6.5,9.0,14.0,998310.03
extra,24648499.0,1.071817,100.718118,-27.0,0.0,0.5,2.5,500000.8
mta_tax,24648499.0,0.512797,100.710533,-0.5,0.5,0.5,0.5,500000.5


In [12]:
df[df['trip_distance'] >= 1.4] 
#add more 

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
6412080,,01/28/2020 07:37:00 AM,01/28/2020 08:42:00 AM,,210240.07,,,76,231,,37.77,2.75,0.5,0.0,0.0,0.3,41.32,0.0
12217597,,07/09/2020 09:46:00 PM,07/09/2020 10:02:00 PM,,222795.31,,,165,11,,17.01,0.0,0.5,2.75,0.0,0.3,20.56,0.0
15713451,,03/03/2020 04:23:00 AM,03/03/2020 04:39:00 AM,,269803.73,,,256,90,,17.9,0.0,0.5,0.0,0.0,0.3,18.7,0.0
15715205,,03/05/2020 08:09:00 AM,03/05/2020 09:55:00 AM,,210148.62,,,76,230,,40.54,0.0,0.5,0.0,6.12,0.3,47.46,0.0
16274277,,05/02/2020 06:06:00 AM,05/02/2020 06:33:00 AM,,297004.51,,,89,137,,32.34,0.0,0.5,0.0,0.0,0.3,35.64,2.5
16871570,,06/24/2020 12:25:00 PM,06/24/2020 12:43:00 PM,,220386.23,,,236,69,,23.82,0.0,0.5,2.75,0.0,0.3,27.37,0.0
17629287,,07/26/2020 09:33:00 PM,07/26/2020 09:34:00 PM,,231147.49,,,213,213,,21.95,0.0,0.5,2.75,0.0,0.3,25.5,0.0
17645118,,07/18/2020 09:08:00 PM,07/18/2020 09:18:00 PM,,256069.13,,,83,82,,11.45,0.0,0.5,2.75,0.0,0.3,15.0,0.0
19948405,,09/16/2020 11:15:00 AM,09/16/2020 11:31:00 AM,,239679.02,,,10,131,,25.45,0.0,0.5,2.75,0.0,0.3,29.0,0.0
23173658,,11/29/2020 02:54:00 PM,11/29/2020 03:17:00 PM,,349692.3,,,19,78,,42.44,0.0,0.5,2.75,6.12,0.3,52.11,0.0


In [13]:
#Keep only the numeric features.
numeric_columns = df.select_dtypes(['int64', 'float64']).columns
numeric_columns

Index(['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge'],
      dtype='object')

In [14]:
# Select only the numeric features
df[numeric_columns].head()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,1.0,1.2,1.0,238,239,1.0,6.0,3.0,0.5,1.47,0.0,0.3,11.27,2.5
1,1.0,1.0,1.2,1.0,239,238,1.0,7.0,3.0,0.5,1.5,0.0,0.3,12.3,2.5
2,1.0,1.0,0.6,1.0,238,238,1.0,6.0,3.0,0.5,1.0,0.0,0.3,10.8,2.5
3,1.0,1.0,0.8,1.0,238,151,1.0,5.5,0.5,0.5,1.36,0.0,0.3,8.16,0.0
4,2.0,1.0,0.0,1.0,193,193,2.0,3.5,0.5,0.5,0.0,0.0,0.3,4.8,0.0
