In [40]:
import pandas as pd
from sqlalchemy import create_engine
pd.set_option('display.max_columns', None)

In [2]:
engine = create_engine('postgresql://root:root@localhost:5433/taxi')

In [3]:
engine.connect()


<sqlalchemy.engine.base.Connection at 0x117386a30>

In [4]:
query = """
SELECT 1 as number;
"""

pd.read_sql(query, con=engine)

Unnamed: 0,number
0,1


In [5]:
query = """
SELECT *
FROM public.green_taxi_data;"""
data = pd.read_sql(query, con=engine)

In [6]:
data.dtypes

index                      int64
VendorID                   int64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID                 int64
PULocationID               int64
DOLocationID               int64
passenger_count            int64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                 object
improvement_surcharge    float64
total_amount             float64
payment_type               int64
trip_type                  int64
congestion_surcharge     float64
dtype: object

In [7]:
data.lpep_pickup_datetime = pd.to_datetime(data.lpep_pickup_datetime)
data.lpep_dropoff_datetime = pd.to_datetime(data.lpep_dropoff_datetime)

In [8]:
data.dtypes

index                             int64
VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                        int64
PULocationID                      int64
DOLocationID                      int64
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                        object
improvement_surcharge           float64
total_amount                    float64
payment_type                      int64
trip_type                         int64
congestion_surcharge            float64
dtype: object

In [9]:
data

Unnamed: 0,index,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,...,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,0,2,2018-12-21 15:17:29,2018-12-21 15:18:57,N,1,264,264,5,0.00,...,0.5,0.5,0.00,0.0,,0.3,4.30,2,1,
1,1,2,2019-01-01 00:10:16,2019-01-01 00:16:32,N,1,97,49,2,0.86,...,0.5,0.5,0.00,0.0,,0.3,7.30,2,1,
2,2,2,2019-01-01 00:27:11,2019-01-01 00:31:38,N,1,49,189,2,0.66,...,0.5,0.5,0.00,0.0,,0.3,5.80,1,1,
3,3,2,2019-01-01 00:46:20,2019-01-01 01:04:54,N,1,189,17,2,2.68,...,0.5,0.5,2.96,0.0,,0.3,19.71,1,1,
4,4,2,2019-01-01 00:19:06,2019-01-01 00:39:43,N,1,82,258,1,4.53,...,0.5,0.5,0.00,0.0,,0.3,19.30,2,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
630913,630913,2,2019-01-31 23:08:27,2019-01-31 23:22:59,N,1,255,226,1,3.33,...,0.5,0.5,2.14,0.0,,0.3,18.39,1,1,0.0
630914,630914,2,2019-01-31 23:21:26,2019-01-31 23:23:05,N,1,75,151,1,0.72,...,0.5,0.5,1.06,0.0,,0.3,6.36,1,1,0.0
630915,630915,2,2019-01-31 23:30:05,2019-01-31 23:36:14,N,1,75,238,1,1.75,...,0.5,0.5,0.00,0.0,,0.3,8.30,1,1,0.0
630916,630916,2,2019-01-31 23:59:58,2019-02-01 00:04:18,N,1,74,74,1,0.57,...,0.5,0.5,1.00,0.0,,0.3,7.30,1,1,0.0


In [12]:
taxi_trips_jan = data[data['lpep_pickup_datetime'].dt.date.astype(str) == '2019-01-15']

In [13]:
len(taxi_trips_jan)

20689

In [38]:
largest_trip_distance = data.loc[data['trip_distance'] == data.trip_distance.max(),
                            ['lpep_pickup_datetime', 'trip_distance']]
largest_trip_distance

Unnamed: 0,lpep_pickup_datetime,trip_distance
297377,2019-01-15 19:27:58,117.99


In [36]:
trips_had_passengers_3 = data.loc[(data['lpep_pickup_datetime'].dt.date.astype(str) == '2019-01-01') & 
                          (data['passenger_count'] == 3) ,
                          ['lpep_pickup_datetime','passenger_count']]
trips_had_passengers_3

Unnamed: 0,lpep_pickup_datetime,passenger_count
31,2019-01-01 00:08:07,3
40,2019-01-01 00:31:28,3
67,2019-01-01 00:39:51,3
149,2019-01-01 00:35:40,3
185,2019-01-01 00:09:09,3
...,...,...
14666,2019-01-01 22:15:36,3
14753,2019-01-01 23:46:40,3
14830,2019-01-01 23:20:47,3
14959,2019-01-01 23:16:50,3


In [34]:
trips_had_passengers_2 = data.loc[(data['lpep_pickup_datetime'].dt.date.astype(str) == '2019-01-01') & 
                          (data['passenger_count'] == 2) ,
                          ['lpep_pickup_datetime','passenger_count']]
trips_had_passengers_2

Unnamed: 0,lpep_pickup_datetime,passenger_count
1,2019-01-01 00:10:16,2
2,2019-01-01 00:27:11,2
3,2019-01-01 00:46:20,2
11,2019-01-01 00:13:48,2
32,2019-01-01 00:40:23,2
...,...,...
14971,2019-01-01 23:33:40,2
14996,2019-01-01 23:10:16,2
14997,2019-01-01 23:34:15,2
15029,2019-01-01 23:08:14,2


In [41]:
data

Unnamed: 0,index,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,0,2,2018-12-21 15:17:29,2018-12-21 15:18:57,N,1,264,264,5,0.00,3.0,0.5,0.5,0.00,0.0,,0.3,4.30,2,1,
1,1,2,2019-01-01 00:10:16,2019-01-01 00:16:32,N,1,97,49,2,0.86,6.0,0.5,0.5,0.00,0.0,,0.3,7.30,2,1,
2,2,2,2019-01-01 00:27:11,2019-01-01 00:31:38,N,1,49,189,2,0.66,4.5,0.5,0.5,0.00,0.0,,0.3,5.80,1,1,
3,3,2,2019-01-01 00:46:20,2019-01-01 01:04:54,N,1,189,17,2,2.68,13.5,0.5,0.5,2.96,0.0,,0.3,19.71,1,1,
4,4,2,2019-01-01 00:19:06,2019-01-01 00:39:43,N,1,82,258,1,4.53,18.0,0.5,0.5,0.00,0.0,,0.3,19.30,2,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
630913,630913,2,2019-01-31 23:08:27,2019-01-31 23:22:59,N,1,255,226,1,3.33,13.0,0.5,0.5,2.14,0.0,,0.3,18.39,1,1,0.0
630914,630914,2,2019-01-31 23:21:26,2019-01-31 23:23:05,N,1,75,151,1,0.72,4.0,0.5,0.5,1.06,0.0,,0.3,6.36,1,1,0.0
630915,630915,2,2019-01-31 23:30:05,2019-01-31 23:36:14,N,1,75,238,1,1.75,7.0,0.5,0.5,0.00,0.0,,0.3,8.30,1,1,0.0
630916,630916,2,2019-01-31 23:59:58,2019-02-01 00:04:18,N,1,74,74,1,0.57,5.0,0.5,0.5,1.00,0.0,,0.3,7.30,1,1,0.0


In [44]:
len(data['PULocationID'].unique())

256

In [91]:
largest_tip = data.loc[(data['PULocationID'] == 7) & 
                          (data['tip_amount'] == data.tip_amount.max()) ,
                          ['PULocationID','tip_amount','DOLocationID']]
largest_tip

Unnamed: 0,PULocationID,tip_amount,DOLocationID


In [53]:
data.tip_amount.max()

100.0