# Optimizing Data Acquisition

Since this is a pretty extensive dataset, I'll explore just one subset of the data. Let's see what can be done to reduce its size.

In [1]:
import pandas as pd

# Load one partition of the January 2015 yellow-taxi data with pandas' default
# dtypes, parsing the two timestamp columns as datetimes.
# Forward slashes are accepted on Windows as well as POSIX systems, whereas the
# backslash-separated form of this relative path only resolves on Windows.
ny = pd.read_csv(
    '../data/raw/taxi data/yellow_tripdata_2015-01_00',
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
)
ny.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.993896,40.750111,1,N,-73.974785,40.750618,1,12.0,1.0,0.5,3.25,0.0,0.3,17.05
1,1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-74.001648,40.724243,1,N,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0.0,0.3,17.8
2,1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.8,-73.963341,40.802788,1,N,-73.95182,40.824413,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8
3,1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-74.009087,40.713818,1,N,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8
4,1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.0,-73.971176,40.762428,1,N,-74.004181,40.742653,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3


In [2]:
# Column dtypes and non-null counts as inferred by pandas when no dtype is given.
ny.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499999 entries, 0 to 499998
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   VendorID               499999 non-null  int64         
 1   tpep_pickup_datetime   499999 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  499999 non-null  datetime64[ns]
 3   passenger_count        499999 non-null  int64         
 4   trip_distance          499999 non-null  float64       
 5   pickup_longitude       499999 non-null  float64       
 6   pickup_latitude        499999 non-null  float64       
 7   RateCodeID             499999 non-null  int64         
 8   store_and_fwd_flag     499999 non-null  object        
 9   dropoff_longitude      499999 non-null  float64       
 10  dropoff_latitude       499999 non-null  float64       
 11  payment_type           499999 non-null  int64         
 12  fare_amount            499999 non-null  floa

I'll try to reduce the footprint by selecting the data types used for each field.

In [3]:
# Explicit per-column dtypes used to shrink the frame: code, flag and fixed-fee
# columns are read as categoricals, the passenger count as an 8-bit integer and
# the remaining numeric measurements as 32-bit floats.
dtype = {
    'VendorID': 'category',
    'passenger_count': 'int8',
    'trip_distance': 'float32',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'dropoff_latitude': 'float32',
    'RateCodeID': 'category',
    'store_and_fwd_flag': 'category',
    'payment_type': 'category',
    'fare_amount': 'float32',
    'extra': 'float32',
    'mta_tax': 'category',
    'tip_amount': 'float32',
    'tolls_amount': 'float32',
    'improvement_surcharge': 'category',
    'total_amount': 'float32'
}

# Re-read the same partition with the compact dtypes and report the new footprint.
# Forward slashes keep the relative path valid on POSIX systems as well as Windows;
# the backslash-separated form only resolves on Windows.
ny = pd.read_csv(
    '../data/raw/taxi data/yellow_tripdata_2015-01_00',
    dtype=dtype,
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
)
ny.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499999 entries, 0 to 499998
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   VendorID               499999 non-null  category      
 1   tpep_pickup_datetime   499999 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  499999 non-null  datetime64[ns]
 3   passenger_count        499999 non-null  int8          
 4   trip_distance          499999 non-null  float32       
 5   pickup_longitude       499999 non-null  float32       
 6   pickup_latitude        499999 non-null  float32       
 7   RateCodeID             499999 non-null  category      
 8   store_and_fwd_flag     499999 non-null  category      
 9   dropoff_longitude      499999 non-null  float32       
 10  dropoff_latitude       499999 non-null  float32       
 11  payment_type           499999 non-null  category      
 12  fare_amount            499999 non-null  floa

We have gone from 72.5 MB to 28.1 MB.

# Cleaning Data

Now we'll try to find inconsistencies in the data. Let's look at its description first.

In [4]:
# Preview the first rows of the frame before looking for inconsistencies.
ny.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.993896,40.750111,1,N,-73.974785,40.750618,1,12.0,1.0,0.5,3.25,0.0,0.3,17.049999
1,1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-74.001648,40.724243,1,N,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0.0,0.3,17.799999
2,1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.8,-73.963341,40.802788,1,N,-73.95182,40.824413,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8
3,1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-74.009087,40.713818,1,N,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8
4,1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.0,-73.971176,40.762428,1,N,-74.004181,40.742653,2,15.0,0.5,0.5,0.0,0.0,0.3,16.299999


In [5]:
# Row/column count, then summary statistics used below to spot implausible
# minimum and maximum values.
print(ny.shape)
ny.describe()

(499999, 19)


Unnamed: 0,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount,extra,tip_amount,tolls_amount,total_amount
count,499999.0,499999.0,499999.0,499999.0,499999.0,499999.0,499999.0,499999.0,499999.0,499999.0,499999.0
mean,1.678109,3.019222,-72.560242,39.97184,-72.605782,39.998131,11.872335,0.314752,1.543074,0.241704,14.768952
std,1.334983,136.926987,10.130702,5.581226,9.968216,5.491283,10.133603,0.367731,2.511312,1.222974,12.3003
min,0.0,0.0,-87.451874,0.0,-86.731705,0.0,-138.899994,-1.0,-81.0,-5.33,-139.699997
25%,1.0,1.0,-73.991669,40.735725,-73.991219,40.734528,6.5,0.0,0.0,0.0,8.16
50%,1.0,1.66,-73.981567,40.7533,-73.97982,40.753757,9.0,0.0,1.0,0.0,11.15
75%,2.0,3.0,-73.966606,40.767715,-73.962524,40.76894,13.5,0.5,2.06,0.0,16.299999
max,9.0,92000.898438,0.0,42.294155,0.0,49.194656,900.0,7.0,750.0,95.330002,900.299988


It seems there are negative values in the columns related to charges (total_amount, extra...).

In [6]:
# Boolean mask of rows whose total charge is negative; report how many there
# are and show the first ten.
idx = ny['total_amount'] < 0
print(ny[idx].shape)
ny[idx].head(10)

(158, 19)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
3831,2,2015-01-17 22:40:27,2015-01-17 22:43:04,1,0.11,-74.002357,40.739826,1,N,-74.001114,40.741108,4,-3.5,-0.5,-0.5,0.0,0.0,0.3,-4.8
4924,2,2015-01-15 17:33:24,2015-01-15 17:33:31,2,0.0,-73.982567,40.739799,1,N,-73.982567,40.739799,3,-2.5,-1.0,-0.5,-0.7,0.0,0.3,-5.0
10046,2,2015-01-16 16:00:45,2015-01-16 16:00:53,1,0.0,-73.937721,40.758194,1,N,-73.937721,40.758194,3,-2.5,-1.0,-0.5,0.0,0.0,0.3,-4.3
16703,2,2015-01-31 23:38:52,2015-01-31 23:38:54,2,0.0,0.0,0.0,2,N,0.0,0.0,2,-52.0,0.0,-0.5,0.0,0.0,0.3,-52.799999
19953,2,2015-01-10 02:23:53,2015-01-10 02:23:58,2,0.0,0.0,0.0,5,N,0.0,0.0,1,-6.8,0.0,0.0,-1.0,0.0,0.3,-8.1
22989,2,2015-01-14 11:52:09,2015-01-14 11:52:20,1,0.0,-73.789955,40.646946,2,N,0.0,0.0,3,-52.0,0.0,-0.5,-14.33,-5.33,0.3,-72.459999
23879,2,2015-01-03 02:01:25,2015-01-03 02:01:54,1,0.03,-73.953407,40.81115,1,N,-73.953751,40.811302,2,-2.5,-0.5,-0.5,0.0,0.0,0.3,-3.8
26582,2,2015-01-12 15:07:29,2015-01-12 15:07:35,1,0.0,0.0,0.0,2,N,0.0,0.0,2,-52.0,0.0,-0.5,0.0,0.0,0.3,-52.799999
27172,2,2015-01-06 14:07:25,2015-01-06 14:08:27,1,0.03,-73.99456,40.740318,1,N,-73.995331,40.740952,4,-2.5,0.0,-0.5,0.0,0.0,0.3,-3.3
30188,2,2015-01-10 21:10:20,2015-01-10 21:12:39,1,0.03,-73.986328,40.75528,1,N,-73.98542,40.755089,4,-3.5,-0.5,-0.5,0.0,0.0,0.3,-4.8


It doesn't make sense unless they are refunds. In that case, it means there are 2 records for the same pickup, one for the original trip and one for the refund.

The [data dictionary](https://storage.googleapis.com/hiring-test/data_dictionary_trip_records_yellow.pdf) defines the following categories of payment in the payment_type variable:

 - 1= Credit card 
 - 2= Cash
 - 3= No charge 
 - 4= Dispute
 - 5= Unknown 
 - 6= Voided trip

But from the small sample above it seems clear not all refunds are registered as 'Dispute', 'Voided trip'... So this field can't be trusted to identify refunds.

Moving forward, records with a negative total_amount should be discarded to avoid duplication.


In [7]:
# Keep only the rows whose total charge is zero or positive, then re-check the
# size and summary statistics of what is left.
ny = ny.loc[ny['total_amount'].ge(0)]
print(ny.shape)
ny.describe()

(499841, 19)


Unnamed: 0,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount,extra,tip_amount,tolls_amount,total_amount
count,499841.0,499841.0,499841.0,499841.0,499841.0,499841.0,499841.0,499841.0,499841.0,499841.0,499841.0
mean,1.678048,3.0201,-72.563202,39.973476,-72.609505,40.000172,11.879289,0.314949,1.543882,0.241802,14.777504
std,1.334955,136.948608,10.120315,5.575504,9.954946,5.483973,10.122419,0.367572,2.507314,1.223109,12.287228
min,0.0,0.0,-87.451874,0.0,-86.731705,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,-73.991669,40.735729,-73.991219,40.734531,6.5,0.0,0.0,0.0,8.16
50%,1.0,1.66,-73.981567,40.753304,-73.97982,40.753761,9.0,0.0,1.0,0.0,11.15
75%,2.0,3.0,-73.966614,40.767715,-73.962532,40.768944,13.5,0.5,2.06,0.0,16.299999
max,9.0,92000.898438,0.0,42.294155,0.0,49.194656,900.0,7.0,750.0,95.330002,900.299988


It also stands out that the minimum latitude for the pickups and the dropoffs is 0 (pretty far from NYC). The same happens with the max longitude.

In [8]:
# Boolean mask of rows whose pickup longitude is exactly 0; report how many
# there are and show a sample.
idx = ny['pickup_longitude'] == 0
print(ny[idx].shape)
ny[idx].head()

(9537, 19)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
31,2,2015-01-15 19:05:43,2015-01-15 19:05:44,2,0.01,0.0,0.0,5,N,0.0,0.0,1,60.0,0.0,0.0,0.0,0.0,0.3,60.299999
61,1,2015-01-04 13:44:52,2015-01-04 13:56:49,1,2.5,0.0,0.0,1,N,0.0,0.0,1,11.0,0.0,0.5,2.35,0.0,0.0,14.15
66,2,2015-01-04 13:44:52,2015-01-04 13:49:03,1,0.85,0.0,0.0,1,N,0.0,0.0,2,5.5,0.0,0.5,0.0,0.0,0.3,6.3
157,1,2015-01-15 09:47:00,2015-01-15 10:00:07,1,1.0,0.0,0.0,1,N,0.0,0.0,2,10.0,0.0,0.5,0.0,0.0,0.3,10.8
159,1,2015-01-15 09:47:02,2015-01-15 10:17:47,3,8.3,0.0,0.0,1,N,0.0,0.0,1,27.5,0.0,0.5,10.0,5.33,0.3,43.630001


In [9]:
# Discard trips whose recorded distance is exactly zero, then re-check the size
# and summary statistics.
ny = ny.loc[ny['trip_distance'].ne(0)]
print(ny.shape)
ny.describe()

(496622, 19)


Unnamed: 0,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount,extra,tip_amount,tolls_amount,total_amount
count,496622.0,496622.0,496622.0,496622.0,496622.0,496622.0,496622.0,496622.0,496622.0,496622.0,496622.0
mean,1.680282,3.039675,-72.656624,40.024918,-72.779053,40.093559,11.838174,0.315709,1.538152,0.240555,14.730853
std,1.336804,137.391525,9.786289,5.391513,9.327368,5.13823,9.739705,0.36771,2.225962,1.218816,11.885249
min,0.0,0.01,-87.451874,0.0,-86.731705,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,-73.991684,40.735886,-73.991241,40.73473,6.5,0.0,0.0,0.0,8.3
50%,1.0,1.68,-73.981606,40.75338,-73.979897,40.753876,9.0,0.0,1.0,0.0,11.16
75%,2.0,3.0,-73.966766,40.767754,-73.962791,40.76902,13.5,0.5,2.06,0.0,16.299999
max,6.0,92000.898438,0.0,42.294155,0.0,49.194656,420.0,7.0,200.0,95.330002,453.299988


It's clear that the location data is invalid for these records. This data will get discarded later when joining with the geojson data. For now I'll discard it manually.

In [10]:
# Keep only rows where neither the pickup nor the dropoff longitude is exactly 0,
# then re-check the size and summary statistics.
ny = ny.loc[ny['pickup_longitude'].ne(0) & ny['dropoff_longitude'].ne(0)]
print(ny.shape)
ny.describe()

(487557, 19)


Unnamed: 0,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount,extra,tip_amount,tolls_amount,total_amount
count,487557.0,487557.0,487557.0,487557.0,487557.0,487557.0,487557.0,487557.0,487557.0,487557.0,487557.0
mean,1.685159,3.045639,-73.974747,40.751041,-73.974266,40.751987,11.834809,0.31577,1.536997,0.240602,14.726496
std,1.3421,138.662109,0.043394,0.075289,0.115499,0.048327,9.712847,0.367778,2.217087,1.21936,11.85743
min,0.0,0.01,-87.451874,4.789132,-86.731705,18.625944,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,-73.991859,40.737282,-73.991379,40.736206,6.5,0.0,0.0,0.0,8.3
50%,1.0,1.68,-73.981918,40.754162,-73.980286,40.754509,9.0,0.0,1.0,0.0,11.16
75%,2.0,3.0,-73.967941,40.768108,-73.964012,40.76939,13.5,0.5,2.06,0.0,16.299999
max,6.0,92000.898438,-69.598526,42.294155,-0.116667,49.194656,420.0,7.0,200.0,95.330002,453.299988


In [11]:
# Sanity check: zero-distance trips were filtered out in an earlier cell, so no
# rows are expected to match here.
print(ny[ny['trip_distance'] == 0].shape)
ny[ny['trip_distance'] == 0].head()

(0, 19)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount


Either the pickup or the dropoff is wrong. Or maybe these records are additional charges made at the dropoff location after a mischarge. If that's the case this is not actually a pickup, so I'm inclined to delete these records (although I recognize an argument could be made towards keeping them).

There still seem to be wrong coordinates (max dropoff longitude = -0.11) but as I said, they will be discarded later.

There are also lines where the trip distance is 0.

Finally, there are rows where the passenger count is 0, but this can be attributed to a wrongful registry of the data and it doesn't really affect the scope of this analysis.

We started with 499999 records and after cleaning we have 487557. In the next step, we'll try to clean the whole dataset.

# Joining with GEOJSON data

We need to calculate the average (we assume daily) pickups by block. Blocks are defined in the nyc_cbg_geoms.geojson file. GeoPandas allows us to easily join coordinates with geojson polygon data.

In [32]:
import geopandas

# Wrap the cleaned trips in a GeoDataFrame whose geometry column holds one point
# per pickup, built from (longitude, latitude) and tagged with CRS code 4326
# (EPSG:4326, i.e. WGS 84 lon/lat).
nyg = geopandas.GeoDataFrame(
    ny, 
    geometry=geopandas.points_from_xy(
        ny['pickup_longitude'], ny['pickup_latitude'], crs=4326)
)
nyg.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,geometry
0,2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.993896,40.750111,1,N,-73.974785,40.750618,1,12.0,1.0,0.5,3.25,0.0,0.3,17.049999,POINT (-73.99390 40.75011)
1,1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-74.001648,40.724243,1,N,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0.0,0.3,17.799999,POINT (-74.00165 40.72424)
2,1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.8,-73.963341,40.802788,1,N,-73.95182,40.824413,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8,POINT (-73.96334 40.80279)
3,1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-74.009087,40.713818,1,N,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8,POINT (-74.00909 40.71382)
4,1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.0,-73.971176,40.762428,1,N,-74.004181,40.742653,2,15.0,0.5,0.5,0.0,0.0,0.3,16.299999,POINT (-73.97118 40.76243)


In [13]:
# Load the block polygons (one `geoid` plus geometry per row).
# Forward slashes keep the relative path valid on POSIX systems as well as Windows;
# the backslash-separated form only resolves on Windows.
geo = geopandas.read_file('../data/raw/nyc_cbg_geoms.geojson')
geo.head()

Unnamed: 0,geoid,geometry
0,360050001001,"POLYGON ((-73.89277 40.79284, -73.89261 40.792..."
1,360050002001,"POLYGON ((-73.86285 40.81267, -73.86191 40.812..."
2,360050002002,"POLYGON ((-73.86708 40.81444, -73.86332 40.812..."
3,360050002003,"POLYGON ((-73.85856 40.80665, -73.85848 40.806..."
4,360050004001,"POLYGON ((-73.85972 40.81527, -73.85956 40.815..."


In [14]:
%%time
nyg_join = geopandas.sjoin(nyg, geo, how='inner', op='within')

Wall time: 38.3 s


In [15]:
# Size and preview of the joined frame; the join adds the `index_right` and
# `geoid` columns from the block table.
print(nyg_join.shape)
nyg_join.head()

(487202, 22)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,...,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,geometry,index_right,geoid
0,2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.993896,40.750111,1,N,-73.974785,...,12.0,1.0,0.5,3.25,0.0,0.3,17.049999,POINT (-73.99390 40.75011),3575,360610101001
11,1,2015-01-10 20:33:41,2015-01-10 20:43:26,1,1.1,-73.993782,40.751419,1,N,-73.967407,...,7.5,0.5,0.5,1.0,0.0,0.3,9.8,POINT (-73.99378 40.75142),3575,360610101001
26,2,2015-01-15 19:05:42,2015-01-15 19:16:18,1,1.53,-73.991127,40.75008,1,N,-73.988609,...,9.0,1.0,0.5,0.0,0.0,0.3,10.8,POINT (-73.99113 40.75008),3575,360610101001
64,2,2015-01-04 13:44:52,2015-01-04 13:53:17,1,1.14,-73.990952,40.750809,1,N,-73.979408,...,7.0,0.0,0.5,1.4,0.0,0.3,9.2,POINT (-73.99095 40.75081),3575,360610101001
89,2,2015-01-15 14:00:45,2015-01-15 14:10:24,1,1.2,-73.994003,40.75135,1,N,-73.981354,...,8.5,0.0,0.5,0.0,0.0,0.3,9.3,POINT (-73.99400 40.75135),3575,360610101001


Since the spatial join is very time consuming and we only need the average number of pickups, we'll calculate from the provided partitioned data.

In [16]:
# Truncate each pickup timestamp to its calendar day.
nyg_join['pickup_date'] = nyg_join['tpep_pickup_datetime'].dt.floor('d')
# Count pickups per (day, block), then average those daily counts per block.
# The count column is selected explicitly before `.mean()`: otherwise the result
# depends on whether the installed pandas silently drops the datetime
# `pickup_date` column or averages it as well.
# NOTE(review): only days with at least one pickup in a block contribute to that
# block's mean - confirm this is the intended definition of "daily average".
nyg_join.groupby(['pickup_date', 'geoid']) \
    .size().reset_index(name='avg') \
    .groupby('geoid')[['avg']] \
    .mean()

Unnamed: 0_level_0,avg
geoid,Unnamed: 1_level_1
360050001001,2.000000
360050004002,1.000000
360050016004,1.000000
360050019001,1.642857
360050019002,1.000000
...,...
360850169012,1.000000
360850177003,1.000000
360850207001,1.000000
360850319011,1.000000


In [17]:
# Unbind the joined frame; it is not used again in the cells that follow here.
del nyg_join

# Using PostGIS

The ~40 seconds it takes to complete the spatial join multiplied by 75 files means almost an hour of computing time. We probably could use some alternative. There are resources online that claim faster spatial joins using PostGIS, so I created a test database to do some benchmarks.

In [18]:
import os

from sqlalchemy import create_engine
from geoalchemy2 import Geometry, WKTElement

# The connection URL embeds a user name and password, so let it be supplied
# through the CARTO_DB_URL environment variable instead of editing the notebook.
# The original local test database remains the default when the variable is unset.
engine = create_engine(
    os.environ.get('CARTO_DB_URL', 'postgresql://postgres:1234@localhost:5432/carto')
)

In [34]:
%%time
# Upload only the two columns the benchmark query needs (point geometry and
# pickup timestamp) to the `trip` table.
# NOTE(review): with if_exists='append', re-running this cell presumably inserts
# the same rows again - confirm the table is empty before benchmarking.
nyg_min = nyg[['geometry', 'tpep_pickup_datetime']]
nyg_min.to_postgis('trip', engine, if_exists='append', index=False)

Wall time: 30.4 s


In [26]:
%%time
# Upload the block polygons to the `block` table.
# NOTE(review): no if_exists is passed, so this presumably fails when the table
# already exists - verify against the to_postgis default.
geo.to_postgis('block', engine)

Wall time: 925 ms


In [39]:
%%time
# Benchmark query: join trips to the blocks that contain them and count trips
# per (block, pickup day). Only COUNT(*) is selected, so the returned frame
# carries the counts without the geoid/day they belong to.
sql =  """SELECT COUNT(*) FROM trip INNER JOIN block 
            ON ST_Within(trip.geometry, block.geometry) 
            GROUP BY geoid, date_trunc('day', trip.tpep_pickup_datetime)"""
pd.read_sql_query(sql, engine)

Wall time: 13 s


It seems PostGIS makes spatial joins a lot faster than GeoPandas. Unfortunately, uploading the data from GeoPandas is also very slow. I'll try to upload using the PostgreSQL COPY command to see if it is worth it.

This time I'll try to upload all the columns. To create the table I'll take advantage of the to_postgis method.

In [63]:
import numpy as np
from sqlalchemy import text

# Recreate an empty `trip` table carrying every column of `nyg`.
# IF EXISTS keeps the cell runnable when the table is not there yet, and running
# the statements through an explicit connection with text() avoids
# Engine.execute, which SQLAlchemy 2.0 removed.
with engine.begin() as conn:
    conn.execute(text('DROP TABLE IF EXISTS trip'))
# An all-False mask selects zero rows, so to_postgis is used here only to create
# the table.
nyg[np.full(nyg.shape[0], False)].to_postgis('trip', engine, if_exists='append', index=False)
# Drop the geometry column that to_postgis created.
with engine.begin() as conn:
    conn.execute(text('ALTER TABLE trip DROP COLUMN geometry'))