In [None]:
import pandas as pd
import numpy as np

from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

from haversine import haversine

In [None]:
def process_train(missing_years):
    TRAIN_PATH = '../input/train.csv.zip'
    
    n_rows = 55_423_857 # total number of rows in the training dataset
    chunksize = 100_000 # batch size
    total_chunk = n_rows // chunksize + 1 # number of batches needed
    
    # defining the columns datatypes (optimize memory usage)
    traintypes = {
        'fare_amount': 'float32',
        'pickup_datetime': 'str', 
        'pickup_longitude': 'float32',
        'pickup_latitude': 'float32',
        'dropoff_longitude': 'float32',
        'dropoff_latitude': 'float32',
        'passenger_count': 'uint8'
    }

    cols = list(traintypes.keys())
    
    # loading the dataframe into list of small dataframes
    i = 0

    for df_chunk in pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize, compression='infer'):
        print('--------------------------------------')
        print('Iteration {}'.format(i))
        
        print('Rows with fare_amount < 2.5 dropped: {}'.format(len(df_chunk[df_chunk['fare_amount'] < 2.5].index)))
        df_chunk = df_chunk.drop(df_chunk[df_chunk['fare_amount'] < 2.5].index)
        
        print('Rows with NA values dropped: {}'.format(max(df_chunk.isna().sum())))
        df_chunk = df_chunk.dropna(axis=0, how='any')
        
        lat_intv = [40.56, 41.71]
        lon_intv = [-74.27, -72.98]  

        idx_zero_plat = df_chunk[(df_chunk['pickup_latitude'] < lat_intv[0]) | (df_chunk['pickup_latitude'] > lat_intv[1])].index
        df_chunk.drop(axis=0, index=idx_zero_plat, inplace=True)

        idx_zero_dlat = df_chunk[(df_chunk['dropoff_latitude'] < lat_intv[0]) | (df_chunk['dropoff_latitude'] > lat_intv[1])].index
        df_chunk.drop(axis=0, index=idx_zero_dlat, inplace=True)

        idx_zero_plon = df_chunk[(df_chunk['pickup_longitude'] > lon_intv[1]) | (df_chunk['pickup_longitude'] < lon_intv[0])].index
        df_chunk.drop(axis=0, index=idx_zero_plon, inplace=True)
        
        idx_zero_dlon = df_chunk[(df_chunk['dropoff_longitude'] > lon_intv[1]) | (df_chunk['dropoff_longitude'] < lon_intv[0])].index
        df_chunk.drop(axis=0, index=idx_zero_dlon, inplace=True)
        
        print('Rows with out of test coordinates dropped: {}'.format(
            len(idx_zero_plat) + len(idx_zero_dlat) + len(idx_zero_plon) + len(idx_zero_dlon)))
        
        del idx_zero_plat
        del idx_zero_dlat
        del idx_zero_plon
        del idx_zero_dlon
        
        print('Rows with out of test passengers dropped: {}'
              .format(len(df_chunk[(df_chunk['passenger_count'] > 6) | (df_chunk['passenger_count'] == 0)].index)))
        df_chunk = df_chunk.drop(df_chunk[(df_chunk['passenger_count'] > 6) | (df_chunk['passenger_count'] == 0)].index)
        
        # extract year, week, weekday, hour from datetime and delete datetime
        process_datetime(df_chunk)
        df_chunk['FROM'] = np.nan
        df_chunk['TO'] = np.nan
        
        # extract geo-locations from cooridnates
        df_chunk[['FROM', 'TO']] = df_chunk.apply(geo_location, axis=1)
        
        # filter dataframe based on missing years derived from test set
        idx = []
        for row in missing_years:
            idx.extend(
                df_chunk[(df_chunk['FROM'] == row[0]) & (df_chunk['TO'] == row[1]) & (df_chunk['year'] == row[2])].index
            )
            
        df_chunk = df_chunk.drop(idx)
        print('Rows out of test sample dropped: {} '.format(len(idx)))
        i = i+1
        print(df_chunk.shape)
        
        fileName = '../input/chunked/train_' + str(i) + '.csv'
        df_chunk.to_csv(fileName)

In [None]:
#process_train(missing_years)

In [1]:
import os, sys
import pandas as pd
import numpy as np

In [2]:
path = '../input/chunked/'
files = os.listdir(path)

df_list = []

for file in files:
    df_list.append(pd.read_csv(path+file))
    
train = pd.concat(df_list)
del df_list

train = train.drop('Unnamed: 0', axis=1)

In [3]:
train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,weekday,week,year,FROM,TO
0,4.5,-73.844315,40.721317,-73.84161,40.712276,1,17,0,25,2009,Queens,Queens
1,16.9,-74.016045,40.711304,-73.97927,40.782005,1,16,1,1,2010,Manhattan,Manhattan
2,5.7,-73.982735,40.76127,-73.99124,40.75056,2,0,3,33,2011,Manhattan,Manhattan
3,7.7,-73.98713,40.733143,-73.99157,40.75809,1,4,5,16,2012,Manhattan,Manhattan
4,5.3,-73.968094,40.76801,-73.95666,40.783764,1,7,1,10,2010,Manhattan,Manhattan


In [4]:
train.shape

(53690622, 12)

In [6]:
train.columns = [
    'fare', 'x0', 'y0', 'x1', 'y1', 'p_count', 'hour', 'weekday', 'week', 'year', 'from', 'to'
]

In [7]:
train.head()

Unnamed: 0,fare,x0,y0,x1,y1,p_count,hour,weekday,week,year,from,to
0,4.5,-73.844315,40.721317,-73.84161,40.712276,1,17,0,25,2009,Queens,Queens
1,16.9,-74.016045,40.711304,-73.97927,40.782005,1,16,1,1,2010,Manhattan,Manhattan
2,5.7,-73.982735,40.76127,-73.99124,40.75056,2,0,3,33,2011,Manhattan,Manhattan
3,7.7,-73.98713,40.733143,-73.99157,40.75809,1,4,5,16,2012,Manhattan,Manhattan
4,5.3,-73.968094,40.76801,-73.95666,40.783764,1,7,1,10,2010,Manhattan,Manhattan


In [9]:
train = train[
    [
    'fare',
    'from',
    'to',
    'hour',
    'weekday',
    'week',
    'year',
    'p_count',
    'x0',
    'x1',
    'y0',
    'y1'
    ]
]

In [10]:
train.head()

Unnamed: 0,fare,from,to,hour,weekday,week,year,p_count,x0,x1,y0,y1
0,4.5,Queens,Queens,17,0,25,2009,1,-73.844315,-73.84161,40.721317,40.712276
1,16.9,Manhattan,Manhattan,16,1,1,2010,1,-74.016045,-73.97927,40.711304,40.782005
2,5.7,Manhattan,Manhattan,0,3,33,2011,2,-73.982735,-73.99124,40.76127,40.75056
3,7.7,Manhattan,Manhattan,4,5,16,2012,1,-73.98713,-73.99157,40.733143,40.75809
4,5.3,Manhattan,Manhattan,7,1,10,2010,1,-73.968094,-73.95666,40.76801,40.783764


In [11]:
def distances(df):
    # euclidean distance
    df.loc[:, 'dist_e'] = np.sqrt(((df.loc[:,'x1'] - df.loc[:,'x0'])*50)**2 + ((df.loc[:,'y1'] - df.loc[:,'y0'])*69)**2)
    
    # manhattan distance
    df.loc[:, 'dist_t'] = abs(df.loc[:,'x1'] - df.loc[:,'x0'])*50 + abs(df.loc[:,'y1'] - df.loc[:,'y0'])*69

In [12]:
distances(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [13]:
train.head()

Unnamed: 0,fare,from,to,hour,weekday,week,year,p_count,x0,x1,y0,y1,dist_e,dist_t
0,4.5,Queens,Queens,17,0,25,2009,1,-73.844315,-73.84161,40.721317,40.712276,0.638322,0.759079
1,16.9,Manhattan,Manhattan,16,1,1,2010,1,-74.016045,-73.97927,40.711304,40.782005,5.213395,6.717119
2,5.7,Manhattan,Manhattan,0,3,33,2011,2,-73.982735,-73.99124,40.76127,40.75056,0.85261,1.16424
3,7.7,Manhattan,Manhattan,4,5,16,2012,1,-73.98713,-73.99157,40.733143,40.75809,1.7356,1.943343
4,5.3,Manhattan,Manhattan,7,1,10,2010,1,-73.968094,-73.95666,40.76801,40.783764,1.228196,1.658726


In [14]:
train = train[
    [
        'fare',
        'from',
        'to',
        'hour',
        'weekday',
        'week',
        'year',
        'p_count',
        'dist_e',
        'dist_t',
        'x0',
        'x1',
        'y0',
        'y1'
    ]
]

In [15]:
train.head()

Unnamed: 0,fare,from,to,hour,weekday,week,year,p_count,dist_e,dist_t,x0,x1,y0,y1
0,4.5,Queens,Queens,17,0,25,2009,1,0.638322,0.759079,-73.844315,-73.84161,40.721317,40.712276
1,16.9,Manhattan,Manhattan,16,1,1,2010,1,5.213395,6.717119,-74.016045,-73.97927,40.711304,40.782005
2,5.7,Manhattan,Manhattan,0,3,33,2011,2,0.85261,1.16424,-73.982735,-73.99124,40.76127,40.75056
3,7.7,Manhattan,Manhattan,4,5,16,2012,1,1.7356,1.943343,-73.98713,-73.99157,40.733143,40.75809
4,5.3,Manhattan,Manhattan,7,1,10,2010,1,1.228196,1.658726,-73.968094,-73.95666,40.76801,40.783764


In [16]:
train.to_csv('../input/train_cleaned_2.csv', index=False)