In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_train(path='../input/train.csv.zip', chunksize=5_000_000):
    """Read the training CSV in chunks and return it as one DataFrame.

    Only the columns listed in ``traintypes`` are read, using compact dtypes
    to limit memory usage. ``pickup_datetime`` is truncated to minute
    precision and parsed into timezone-aware (UTC) timestamps.

    Parameters
    ----------
    path : str, optional
        Location of the (optionally compressed) CSV file. Compression is
        inferred by pandas from the file extension.
    chunksize : int, optional
        Number of rows read per batch.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order.
    """
    # Column dtypes chosen to reduce memory usage.
    traintypes = {
        'fare_amount': 'float32',
        'pickup_datetime': 'str',
        'pickup_longitude': 'float32',
        'pickup_latitude': 'float32',
        'dropoff_longitude': 'float32',
        'dropoff_latitude': 'float32',
        'passenger_count': 'uint8'
    }
    cols = list(traintypes.keys())

    df_list = []
    reader = pd.read_csv(path, usecols=cols, dtype=traintypes,
                         chunksize=chunksize, compression='infer')
    for df_chunk in reader:
        # Keep only the first 16 characters ('YYYY-MM-DD HH:MM') so the value
        # matches the fixed format passed to to_datetime below.
        df_chunk['pickup_datetime'] = df_chunk['pickup_datetime'].str.slice(0, 16)
        df_chunk['pickup_datetime'] = pd.to_datetime(
            df_chunk['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
        df_list.append(df_chunk)

    # Concatenate once, after the loop: doing it per iteration re-copies every
    # row read so far on each pass.
    return pd.concat(df_list)
    
def clean_train(df):
    """Drop invalid rows from the raw training frame, in place.

    A row is removed when any of the following holds:

    * it contains a missing value in any column;
    * ``fare_amount`` is below 2.5 (this also covers negative fares) or
      above 250;
    * a pickup/dropoff latitude lies outside ``lat_intv`` or a
      pickup/dropoff longitude lies outside ``lon_intv``;
    * ``passenger_count`` is 0 or greater than 6.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``fare_amount``, ``pickup_latitude``,
        ``pickup_longitude``, ``dropoff_latitude``, ``dropoff_longitude``
        and ``passenger_count``. It is modified in place.

    Returns
    -------
    None
    """
    # Operate on the argument; the previous code called dropna on a global
    # named `train`, which only worked when the caller used that exact name.
    df.dropna(axis=0, how='any', inplace=True)

    # Accepted coordinate ranges (inclusive on both ends).
    lat_intv = [40.56, 41.71]
    lon_intv = [-74.27, -72.98]

    fare = df['fare_amount']
    passengers = df['passenger_count']

    # The upper fare cutoff is part of the mask that is actually dropped;
    # previously that drop was missing inplace=True and had no effect.
    invalid = (fare < 2.5) | (fare > 250)
    invalid = invalid | (passengers == 0) | (passengers > 6)

    for lat_col in ('pickup_latitude', 'dropoff_latitude'):
        invalid = invalid | ~df[lat_col].between(lat_intv[0], lat_intv[1])
    for lon_col in ('pickup_longitude', 'dropoff_longitude'):
        invalid = invalid | ~df[lon_col].between(lon_intv[0], lon_intv[1])

    # A single drop avoids rebuilding the frame once per rule.
    df.drop(axis=0, index=df[invalid].index, inplace=True)

In [7]:
def process_datetime(df):
    """Expand ``pickup_datetime`` into calendar feature columns, in place.

    Adds ``hour``, ``day``, ``month``, ``year`` (stored as an offset from
    2009) and ``weekday`` columns, then removes ``pickup_datetime``.
    Returns None; ``df`` itself is modified.
    """
    pickup = df['pickup_datetime'].dt

    df['hour'] = pickup.hour
    df['day'] = pickup.day
    df['month'] = pickup.month
    # Years are stored relative to 2009.
    df['year'] = pickup.year - 2009
    df['weekday'] = pickup.dayofweek

    # The raw timestamp is no longer needed once its parts are extracted.
    df.drop(columns=['pickup_datetime'], inplace=True)
        
def rename_columns(df):
    """Replace the column labels of ``df`` with short names, in place.

    Names are assigned by position, so ``df`` must already have exactly
    eleven columns in the matching order. Returns None.
    """
    fare_and_riders = ['fare', 'num_pass']
    calendar_parts = ['year', 'hour', 'day', 'month', 'weekday']
    coordinates = ['x0', 'x1', 'y0', 'y1']

    df.columns = fare_and_riders + calendar_parts + coordinates
    
def distances(df):
    """Add straight-line and axis-aligned distance columns to ``df``, in place.

    ``dist_e`` is sqrt((x1 - x0)^2 + (y1 - y0)^2) and ``dist_t`` is
    |x1 - x0| + |y1 - y0|, both computed directly on the coordinate values.
    Returns None.
    """
    delta_x = df['x1'] - df['x0']
    delta_y = df['y1'] - df['y0']

    df.loc[:, 'dist_e'] = np.sqrt(delta_x**2 + delta_y**2)
    df.loc[:, 'dist_t'] = delta_x.abs() + delta_y.abs()
    
def clean_train_2(df):
    """Drop rows whose fare is implausible for the distance covered, in place.

    Removed rows are those with a fare below 3 while either distance exceeds
    0.01, and those with a fare above 100 while either distance is below 0.1.
    Returns None.
    """
    fare = df['fare']
    euclid = df['dist_e']
    taxicab = df['dist_t']

    cheap_but_far = (fare < 3) & ((euclid > 0.01) | (taxicab > 0.01))
    pricey_but_near = (fare > 100) & ((euclid < 0.1) | (taxicab < 0.1))

    rejected = df[cheap_but_far | pricey_but_near].index
    df.drop(index=rejected, axis=0, inplace=True)

In [4]:
# Load the raw training data, then clean it and expand the pickup timestamp.
# clean_train and process_datetime modify `train` in place and return nothing,
# so their results are intentionally not assigned.
train = load_train()
clean_train(train)
process_datetime(train)

In [5]:
# Column order matters: rename_columns assigns its short names by position,
# so this list has to line up with the names used there.
cols = [
    'fare_amount',
    'passenger_count',
    'year',
    'hour',
    'day',
    'month',
    'weekday',
    'pickup_longitude',
    'dropoff_longitude',
    'pickup_latitude',
    'dropoff_latitude'
]

# Take an explicit copy: the following cells mutate `train` in place
# (column relabelling, new distance columns), and writing into the result of a
# column selection can raise SettingWithCopyWarning on pandas versions without
# copy-on-write.
train = train[cols].copy()

In [None]:
# Switch to the short column names first: distances reads the x0/x1/y0/y1
# columns that rename_columns creates. Both helpers modify `train` in place.
rename_columns(train)
distances(train)

In [8]:
# Second cleaning pass; it reads the 'fare', 'dist_e' and 'dist_t' columns, so
# it must run after the renaming and distance cell. Modifies `train` in place.
clean_train_2(train)

In [13]:
# Persist the cleaned frame. to_csv also writes the row index by default, and
# the index has gaps here because rows were dropped during cleaning.
# NOTE(review): pass index=False if the index should not be stored - confirm
# how downstream code reads this file before changing it.
train.to_csv('../input/train_cleaned.csv')