# Import

## packages

In [38]:
#import basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#imports for large-data handling, progress bars and timing
import dask.dataframe as dd
import os
from tqdm import tqdm
import time

#import pyarrow -> needed for feather
import pyarrow

## Importing the files directly (but see below for dealing with huge CSV files)

In [11]:
%%time

# Naive approach: read each CSV in full with a single read_csv call.
# train.csv is a big file, so this is slow (the recorded wall time is several minutes);
# the "Case of Huge CSV File" section shows how to handle that case.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

CPU times: user 1min 49s, sys: 2min 11s, total: 4min 1s
Wall time: 6min 48s


# Case of Huge CSV File

## readlines

In [10]:
%%time

# The %%time cell magic prints the CPU time and wall time taken by the whole cell.
print('hello')

hello
CPU times: user 177 µs, sys: 82 µs, total: 259 µs
Wall time: 222 µs


In [2]:
# Relative path to the training CSV; reused by the row count and read_csv cells.
train_path = '../NYCTaxi/train.csv'

In [12]:
%%time 

#want to know the exact number of rows from the large file -> this way is a lot shorter to find how big the data is. 

#method 1 : use file.readlines 
with open(train_path) as file:
    n_rows = len(file.readlines())

print(f'Exact number of rows: {n_rows}')

Exact number of rows: 55423857
CPU times: user 20.1 s, sys: 27.8 s, total: 47.9 s
Wall time: 1min 31s


In [13]:
%%time 

# Peek at the train dataset: nrows limits the read to the first 10 rows, so the
# large file is not loaded in full.
# NOTE: despite its name, test_temp holds rows from the *train* file (train_path).
test_temp = pd.read_csv(train_path, nrows= 10)
test_temp.head()

CPU times: user 8.9 ms, sys: 26.1 ms, total: 35 ms
Wall time: 68.2 ms


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [7]:
# Show the column dtypes pandas inferred for the 10-row sample, plus non-null counts.
test_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   key                10 non-null     object 
 1   fare_amount        10 non-null     float64
 2   pickup_datetime    10 non-null     object 
 3   pickup_longitude   10 non-null     float64
 4   pickup_latitude    10 non-null     float64
 5   dropoff_longitude  10 non-null     float64
 6   dropoff_latitude   10 non-null     float64
 7   passenger_count    10 non-null     int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 768.0+ bytes


## Using chunksize and loading the chunks into a list

In [19]:
# Explicit per-column dtypes to pass to read_csv, chosen to reduce memory usage:
# 32-bit floats for the fare and coordinates, an 8-bit unsigned integer for the
# passenger count. pickup_datetime is read as a plain string here.
traintypes = dict(
    fare_amount='float32',
    pickup_datetime='str',
    pickup_longitude='float32',
    pickup_latitude='float32',
    dropoff_longitude='float32',
    dropoff_latitude='float32',
    passenger_count='uint8',
)

# Unpacking a dict yields its keys in insertion order, which gives the list of
# column names to load.
cols = [*traintypes]

In [15]:
# Number of rows to read per chunk: 5 million.
chunksize = 5_000_000 
# The underscores are only visual digit separators; the value is 5000000.

In [20]:
%%time
#make an empty list to start with
df_list = []

#for loop using chunksize above. 
for df_chunk in tqdm(pd.read_csv(train_path, usecols = cols, dtype=traintypes, chunksize = chunksize)):
    df_chunk['pickup_datetime'] = df_chunk['pickup_datetime'].str.slice(0,16)
    df_chunk['pickup_datetime'] = pd.to_datetime(df_chunk['pickup_datetime'], utc = True, format = '%Y-%m-%d %H:%M')
    
    #append the chunk to list and merge all
    df_list.append(df_chunk)
    
#12 iterations. 

12it [01:49,  9.17s/it]

CPU times: user 1min 35s, sys: 9.68 s, total: 1min 45s
Wall time: 1min 50s





In [21]:
# Inspect the result of the chunked read: df_list is a list with one DataFrame
# per iteration, each holding up to 5M rows.
df_list

[         fare_amount           pickup_datetime  pickup_longitude  \
 0                4.5 2009-06-15 17:26:00+00:00        -73.844315   
 1               16.9 2010-01-05 16:52:00+00:00        -74.016045   
 2                5.7 2011-08-18 00:35:00+00:00        -73.982735   
 3                7.7 2012-04-21 04:30:00+00:00        -73.987129   
 4                5.3 2010-03-09 07:51:00+00:00        -73.968094   
 ...              ...                       ...               ...   
 4999995         16.5 2011-01-24 21:33:00+00:00        -74.003883   
 4999996          9.0 2013-10-11 12:12:00+00:00        -73.995102   
 4999997         10.5 2014-12-06 23:04:00+00:00        -73.981064   
 4999998         10.0 2015-05-30 19:01:00+00:00        -73.965401   
 4999999          4.9 2012-07-11 08:12:00+00:00        -73.972595   
 
          pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  
 0              40.721317         -73.841614         40.712276                1  
 1    

In [23]:
# Merge all the chunk DataFrames into one DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 RangeIndex regardless of how
# the individual chunks were indexed; DataFrame.to_feather() requires a default
# index, so this keeps the later Feather export safe.
trainm_df = pd.concat(df_list, ignore_index=True)

# Delete the list of chunks to release memory.
# NOTE: after this, re-running the cell requires re-running the chunk loop first.
del df_list

# Check the merged frame: row count, column dtypes and memory usage.
trainm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55423856 entries, 0 to 55423855
Data columns (total 7 columns):
 #   Column             Dtype              
---  ------             -----              
 0   fare_amount        float32            
 1   pickup_datetime    datetime64[ns, UTC]
 2   pickup_longitude   float32            
 3   pickup_latitude    float32            
 4   dropoff_longitude  float32            
 5   dropoff_latitude   float32            
 6   passenger_count    uint8              
dtypes: datetime64[ns, UTC](1), float32(5), uint8(1)
memory usage: 1.5 GB


In [29]:
# Show the first 5 and the last 10 rows of the merged frame; display() renders
# each argument in order.
display(trainm_df.head(), trainm_df.tail(10))

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:00+00:00,-73.844315,40.721317,-73.841614,40.712276,1
1,16.9,2010-01-05 16:52:00+00:00,-74.016045,40.711304,-73.979271,40.782005,1
2,5.7,2011-08-18 00:35:00+00:00,-73.982735,40.761269,-73.991241,40.750561,2
3,7.7,2012-04-21 04:30:00+00:00,-73.987129,40.733143,-73.99157,40.758091,1
4,5.3,2010-03-09 07:51:00+00:00,-73.968094,40.768009,-73.956657,40.783764,1


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
55423846,11.7,2010-05-28 07:49:00+00:00,-73.947159,40.780228,-73.976807,40.758865,2
55423847,6.1,2011-09-16 00:46:00+00:00,-73.990944,40.736618,-73.992294,40.740314,3
55423848,6.0,2013-05-24 00:13:00+00:00,-73.993484,40.747372,-73.998436,40.730461,1
55423849,12.0,2014-03-04 22:25:00+00:00,-73.983017,40.745083,-73.954178,40.767338,1
55423850,4.5,2015-03-22 16:37:00+00:00,-73.981056,40.737457,-73.985474,40.729298,1
55423851,14.0,2014-03-15 03:28:00+00:00,-74.005272,40.740028,-73.96328,40.762554,1
55423852,4.2,2009-03-24 20:46:00+00:00,-73.957787,40.76553,-73.951637,40.77396,1
55423853,14.1,2011-04-02 22:04:00+00:00,-73.970505,40.752323,-73.960541,40.79734,1
55423854,28.9,2011-10-26 05:57:00+00:00,-73.980904,40.764629,-73.870605,40.773964,1
55423855,7.5,2014-12-12 11:33:00+00:00,-73.969719,40.797668,-73.970886,40.783314,1


## Feather

In [33]:
%%time
# Save the merged frame in Feather format: a fast, lightweight, easy-to-use
# binary file format for storing dataframes.

trainm_df.to_feather('nyc_taxi_data_raw.feather')

CPU times: user 2.51 s, sys: 2.46 s, total: 4.97 s
Wall time: 6.28 s


In [34]:
%%time
# Next time, load the saved dataframe directly from the Feather file instead of
# reading the CSV again.
train_df_new = pd.read_feather('nyc_taxi_data_raw.feather')

CPU times: user 1.26 s, sys: 3.09 s, total: 4.35 s
Wall time: 5.67 s


In [36]:
%%time
# Verify that the reloaded dataframe has the full ~55 million rows and the
# expected column dtypes.
train_df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55423856 entries, 0 to 55423855
Data columns (total 7 columns):
 #   Column             Dtype              
---  ------             -----              
 0   fare_amount        float32            
 1   pickup_datetime    datetime64[ns, UTC]
 2   pickup_longitude   float32            
 3   pickup_latitude    float32            
 4   dropoff_longitude  float32            
 5   dropoff_latitude   float32            
 6   passenger_count    uint8              
dtypes: datetime64[ns, UTC](1), float32(5), uint8(1)
memory usage: 1.5 GB
CPU times: user 3.14 ms, sys: 867 µs, total: 4 ms
Wall time: 5.41 ms


# EDA