In [37]:
import pandas as pd
import numpy as np

In [2]:
def load_train(path='../input/train.csv.zip', chunksize=5_000_000, n_rows=55_423_857):
    """Read the training CSV in chunks and return it as a single DataFrame.

    Only the columns listed in ``traintypes`` are read, using compact dtypes
    to limit memory usage. ``pickup_datetime`` is truncated to minute
    resolution (first 16 characters) and parsed as a UTC timestamp.

    Parameters
    ----------
    path : str
        Location of the CSV file; compression is inferred from the extension.
    chunksize : int
        Number of rows read per batch.
    n_rows : int
        Expected total number of rows. Used only for the progress message.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order, keeping the row index produced
        by ``read_csv``. An empty frame with the expected columns is returned
        if the reader yields no chunks.
    """
    # Ceiling division: an exact multiple of chunksize must not count an extra batch.
    total_chunk = -(-n_rows // chunksize)

    # defining the columns datatypes (optimize memory usage)
    traintypes = {'fare_amount': 'float32',
                  'pickup_datetime': 'str',
                  'pickup_longitude': 'float32',
                  'pickup_latitude': 'float32',
                  'dropoff_longitude': 'float32',
                  'dropoff_latitude': 'float32',
                  'passenger_count': 'uint8'}

    cols = list(traintypes.keys())

    # loading the dataframe into list of small dataframes
    df_list = []
    reader = pd.read_csv(path, usecols=cols, dtype=traintypes,
                         chunksize=chunksize, compression='infer')

    for i, df_chunk in enumerate(reader, start=1):
        print(f'DataFrame Chunk {i:02d}/{total_chunk}')

        # Dropping everything past the minutes lets to_datetime use one fixed format.
        minute_text = df_chunk['pickup_datetime'].str.slice(0, 16)
        df_chunk['pickup_datetime'] = pd.to_datetime(minute_text, utc=True, format='%Y-%m-%d %H:%M')

        df_list.append(df_chunk)

    if not df_list:
        return pd.DataFrame(columns=cols)

    # Concatenate once, after the loop; doing it per chunk re-copies all
    # previously loaded rows on every iteration.
    return pd.concat(df_list)
    
# Load the training data with load_train() and bind it to `train`, the frame the later cells work on.
train = load_train()

DataFrame Chunk 01/12
DataFrame Chunk 02/12
DataFrame Chunk 03/12
DataFrame Chunk 04/12
DataFrame Chunk 05/12
DataFrame Chunk 06/12
DataFrame Chunk 07/12
DataFrame Chunk 08/12
DataFrame Chunk 09/12
DataFrame Chunk 10/12
DataFrame Chunk 11/12
DataFrame Chunk 12/12


In [3]:
def clean_train(df):
    """Remove unusable rows from ``df`` in place.

    A row is dropped when any of the following holds:

    * it contains a missing value in any column;
    * ``fare_amount`` is negative;
    * a pickup/dropoff latitude lies outside ``[38, 42]``;
    * a pickup/dropoff longitude lies outside ``[-76, -70]``;
    * ``passenger_count`` is 0 or greater than 6.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``fare_amount``, ``pickup_latitude``,
        ``pickup_longitude``, ``dropoff_latitude``, ``dropoff_longitude``
        and ``passenger_count``. It is modified in place.

    Returns
    -------
    None
    """
    # Operate on the frame that was passed in, not on a notebook-global name.
    df.dropna(axis=0, how='any', inplace=True)

    lat_intv = (38, 42)
    lon_intv = (-76, -70)

    def outside(column, bounds):
        # Both comparisons are parenthesised: `|` binds tighter than `<` / `>`.
        return (df[column] < bounds[0]) | (df[column] > bounds[1])

    invalid = (
        (df['fare_amount'] < 0)
        | outside('pickup_latitude', lat_intv)
        | outside('dropoff_latitude', lat_intv)
        | outside('pickup_longitude', lon_intv)
        | outside('dropoff_longitude', lon_intv)
        | (df['passenger_count'] > 6)
        | (df['passenger_count'] == 0)
    )

    # One label-based drop instead of one per rule; the set of removed labels is the same.
    df.drop(index=df.index[invalid.to_numpy()], inplace=True)

In [4]:
# Filter invalid rows out of `train`; clean_train drops rows in place and returns nothing.
clean_train(train)

In [9]:
# Preview the first rows of the training frame after cleaning.
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:00+00:00,-73.844315,40.721317,-73.841614,40.712276,1
1,16.9,2010-01-05 16:52:00+00:00,-74.016045,40.711304,-73.979271,40.782005,1
2,5.7,2011-08-18 00:35:00+00:00,-73.982735,40.761269,-73.991241,40.750561,2
3,7.7,2012-04-21 04:30:00+00:00,-73.987129,40.733143,-73.99157,40.758091,1
4,5.3,2010-03-09 07:51:00+00:00,-73.968094,40.768009,-73.956657,40.783764,1


In [10]:
# Derive calendar features from the pickup timestamp, one new column per
# (column name, datetime accessor attribute) pair, added in this order.
for feature_name, dt_attribute in (
    ('hour', 'hour'),
    ('day', 'day'),
    ('month', 'month'),
    ('year', 'year'),
    ('weekday', 'dayofweek'),
):
    train[feature_name] = getattr(train['pickup_datetime'].dt, dt_attribute)

In [14]:
# The raw timestamp column is removed in place once the calendar features exist.
train.drop(columns=['pickup_datetime'], inplace=True)

In [33]:
# Source column -> short working name, listed in the final column order.
column_map = {
    'fare_amount': 'fare',
    'passenger_count': 'num_pass',
    'year': 'year',
    'hour': 'hour',
    'day': 'day',
    'month': 'month',
    'pickup_longitude': 'x0',
    'dropoff_longitude': 'x1',
    'pickup_latitude': 'y0',
    'dropoff_latitude': 'y1',
}

# Keep only the mapped columns, in mapping order.
# NOTE(review): 'weekday' is not in the mapping, so it is discarded here — confirm this is intended.
train = train[list(column_map)]

# `cols` ends up holding the new names, which are then applied positionally.
cols = list(column_map.values())
train.columns = cols

In [34]:
# Preview the frame after the column selection and renaming.
train.head()

Unnamed: 0,fare,num_pass,year,hour,day,month,x0,x1,y0,y1
0,4.5,1,2009,17,15,6,-73.844315,-73.841614,40.721317,40.712276
1,16.9,1,2010,16,5,1,-74.016045,-73.979271,40.711304,40.782005
2,5.7,2,2011,0,18,8,-73.982735,-73.991241,40.761269,40.750561
3,7.7,1,2012,4,21,4,-73.987129,-73.99157,40.733143,40.758091
4,5.3,1,2010,7,9,3,-73.968094,-73.956657,40.768009,40.783764


In [39]:
def distances(df):
    """Add two coordinate-distance columns to ``df`` in place.

    ``dist_e`` is the straight-line (Euclidean) distance and ``dist_t`` the
    sum of absolute differences between the points ``(x0, y0)`` and
    ``(x1, y1)``. Both are expressed in the same units as the input columns.
    Nothing is returned.
    """
    delta_x = df['x1'] - df['x0']
    delta_y = df['y1'] - df['y0']

    df.loc[:, 'dist_e'] = np.sqrt(delta_x**2 + delta_y**2)
    df.loc[:, 'dist_t'] = delta_x.abs() + delta_y.abs()

In [40]:
# Add the dist_e / dist_t columns to `train`; distances() assigns them on the frame it receives.
distances(train)

  


In [43]:
# Preview the frame with the new distance columns.
train.head()

Unnamed: 0,fare,num_pass,year,hour,day,month,x0,x1,y0,y1,dist_e,dist_t
0,4.5,1,2009,17,15,6,-73.844315,-73.841614,40.721317,40.712276,0.009436,0.011742
1,16.9,1,2010,16,5,1,-74.016045,-73.979271,40.711304,40.782005,0.079693,0.107475
2,5.7,2,2011,0,18,8,-73.982735,-73.991241,40.761269,40.750561,0.013676,0.019215
3,7.7,1,2012,4,21,4,-73.987129,-73.99157,40.733143,40.758091,0.02534,0.029388
4,5.3,1,2010,7,9,3,-73.968094,-73.956657,40.768009,40.783764,0.019468,0.027191


In [45]:
import matplotlib.pyplot as plt

In [46]:
%matplotlib inline

In [None]:
# Largest dist_e value in the training frame (quick check for extreme trips).
train['dist_e'].max()

In [None]:
# Largest dist_t value in the training frame (quick check for extreme trips).
train['dist_t'].max()