In [2]:
### minimize training data ###
import pandas as pd
import random as rndn

filename = r"../input/train.csv"
n = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)
s = 100000 #desired sample size
skip = sorted(rndn.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list
df = pd.read_csv(filename, skiprows=skip)
df.to_csv("../input/train_min.csv")


In [2]:
import pandas as pd

df_train = pd.read_csv("../input/train_min.csv")
df_train.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,0,2010-03-27 02:26:39.0000001,14.5,2010-03-27 02:26:39 UTC,-73.961598,40.716487,-73.960417,40.674963,2
1,1,2015-02-15 22:41:32.0000003,9.0,2015-02-15 22:41:32 UTC,-74.003448,40.733101,-74.008003,40.707214,1
2,2,2011-01-14 18:10:00.000000129,5.7,2011-01-14 18:10:00 UTC,-73.98637,40.74738,-73.975567,40.744572,2
3,3,2011-06-20 14:52:22.0000004,18.5,2011-06-20 14:52:22 UTC,-73.997892,40.724297,-73.977578,40.786941,1
4,4,2011-10-14 12:20:00.00000051,4.9,2011-10-14 12:20:00 UTC,-73.952797,40.780497,-73.955042,40.788527,5


In [3]:
df_test = pd.read_csv("../input/test.csv")
df_test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [4]:
### data preprocessing ###

### copy for submission ###
df_test_for_sub = df_test["key"]
df_test_for_sub

0       2015-01-27 13:08:24.0000002
1       2015-01-27 13:08:24.0000003
2       2011-10-08 11:53:44.0000002
3       2012-12-01 21:12:12.0000002
4       2012-12-01 21:12:12.0000003
                   ...             
9909    2015-05-10 12:37:51.0000002
9910    2015-01-12 17:05:51.0000001
9911    2015-04-19 20:44:15.0000001
9912    2015-01-31 01:05:19.0000005
9913    2015-01-18 14:06:23.0000006
Name: key, Length: 9914, dtype: object

In [5]:
# delete the fares that are less than 0
df_train = df_train.loc[df_train.fare_amount > 0]
df_train.fare_amount.min()

1.5

In [6]:
len(df_train)

99990

In [7]:
train_X = df_train.drop(columns=['fare_amount'])

Y = df_train['fare_amount']
#Y = np.log(df_train.fare_amount)

test_X = df_test

In [8]:
df = train_X.append(test_X, sort=False)
df.head()

Unnamed: 0.1,Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,0.0,2010-03-27 02:26:39.0000001,2010-03-27 02:26:39 UTC,-73.961598,40.716487,-73.960417,40.674963,2
1,1.0,2015-02-15 22:41:32.0000003,2015-02-15 22:41:32 UTC,-74.003448,40.733101,-74.008003,40.707214,1
2,2.0,2011-01-14 18:10:00.000000129,2011-01-14 18:10:00 UTC,-73.98637,40.74738,-73.975567,40.744572,2
3,3.0,2011-06-20 14:52:22.0000004,2011-06-20 14:52:22 UTC,-73.997892,40.724297,-73.977578,40.786941,1
4,4.0,2011-10-14 12:20:00.00000051,2011-10-14 12:20:00 UTC,-73.952797,40.780497,-73.955042,40.788527,5


In [11]:
### calculate the distance traveled ###

import numpy as np

def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers
       
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])
     
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers

    return d

In [12]:
df['dist_km'] = haversine_distance(
    df,
    'pickup_latitude', 
    'pickup_longitude', 
    'dropoff_latitude', 
    'dropoff_longitude'
)

df.head()

Unnamed: 0.1,Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,0.0,2010-03-27 02:26:39.0000001,2010-03-27 02:26:39 UTC,-73.961598,40.716487,-73.960417,40.674963,2,4.618332
1,1.0,2015-02-15 22:41:32.0000003,2015-02-15 22:41:32 UTC,-74.003448,40.733101,-74.008003,40.707214,1,2.903933
2,2.0,2011-01-14 18:10:00.000000129,2011-01-14 18:10:00 UTC,-73.98637,40.74738,-73.975567,40.744572,2,0.962144
3,3.0,2011-06-20 14:52:22.0000004,2011-06-20 14:52:22 UTC,-73.997892,40.724297,-73.977578,40.786941,1,7.172769
4,4.0,2011-10-14 12:20:00.00000051,2011-10-14 12:20:00 UTC,-73.952797,40.780497,-73.955042,40.788527,5,0.912682


In [13]:
### add a datetime column and derive useful statistics ###

df['EDTdate'] = pd.to_datetime(df['pickup_datetime'].str[:19]) - pd.Timedelta(hours=4)
df['Hour'] = df['EDTdate'].dt.hour
df['AMorPM'] = np.where(df['Hour']<12,'am','pm')
df['Weekday'] = df['EDTdate'].dt.strftime("%a")

df.head()

Unnamed: 0.1,Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,0.0,2010-03-27 02:26:39.0000001,2010-03-27 02:26:39 UTC,-73.961598,40.716487,-73.960417,40.674963,2,4.618332,2010-03-26 22:26:39,22,pm,Fri
1,1.0,2015-02-15 22:41:32.0000003,2015-02-15 22:41:32 UTC,-74.003448,40.733101,-74.008003,40.707214,1,2.903933,2015-02-15 18:41:32,18,pm,Sun
2,2.0,2011-01-14 18:10:00.000000129,2011-01-14 18:10:00 UTC,-73.98637,40.74738,-73.975567,40.744572,2,0.962144,2011-01-14 14:10:00,14,pm,Fri
3,3.0,2011-06-20 14:52:22.0000004,2011-06-20 14:52:22 UTC,-73.997892,40.724297,-73.977578,40.786941,1,7.172769,2011-06-20 10:52:22,10,am,Mon
4,4.0,2011-10-14 12:20:00.00000051,2011-10-14 12:20:00 UTC,-73.952797,40.780497,-73.955042,40.788527,5,0.912682,2011-10-14 08:20:00,8,am,Fri


In [15]:
### prepare the model ###

### define categorical and continuous columns ###
cat_cols = [
    'Hour', 
    'AMorPM', 
    'Weekday'
]

cont_cols = [
    'pickup_latitude', 
    'pickup_longitude', 
    'dropoff_latitude', 
    'dropoff_longitude', 
    'passenger_count', 
    'dist_km'
]

df.head()

Unnamed: 0.1,Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,0.0,2010-03-27 02:26:39.0000001,2010-03-27 02:26:39 UTC,-73.961598,40.716487,-73.960417,40.674963,2,4.618332,2010-03-26 22:26:39,22,pm,Fri
1,1.0,2015-02-15 22:41:32.0000003,2015-02-15 22:41:32 UTC,-74.003448,40.733101,-74.008003,40.707214,1,2.903933,2015-02-15 18:41:32,18,pm,Sun
2,2.0,2011-01-14 18:10:00.000000129,2011-01-14 18:10:00 UTC,-73.98637,40.74738,-73.975567,40.744572,2,0.962144,2011-01-14 14:10:00,14,pm,Fri
3,3.0,2011-06-20 14:52:22.0000004,2011-06-20 14:52:22 UTC,-73.997892,40.724297,-73.977578,40.786941,1,7.172769,2011-06-20 10:52:22,10,am,Mon
4,4.0,2011-10-14 12:20:00.00000051,2011-10-14 12:20:00 UTC,-73.952797,40.780497,-73.955042,40.788527,5,0.912682,2011-10-14 08:20:00,8,am,Fri


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109904 entries, 0 to 9913
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   Unnamed: 0         99990 non-null   float64       
 1   key                109904 non-null  object        
 2   pickup_datetime    109904 non-null  object        
 3   pickup_longitude   109904 non-null  float64       
 4   pickup_latitude    109904 non-null  float64       
 5   dropoff_longitude  109903 non-null  float64       
 6   dropoff_latitude   109903 non-null  float64       
 7   passenger_count    109904 non-null  int64         
 8   dist_km            109903 non-null  float64       
 9   EDTdate            109904 non-null  datetime64[ns]
 10  Hour               109904 non-null  int64         
 11  AMorPM             109904 non-null  object        
 12  Weekday            109904 non-null  object        
dtypes: datetime64[ns](1), float64(6), int64(2), ob

In [None]:
#make the cat categories as "category" and label encoder it
from sklearn import preprocessing

for col in df.columns:
  if col in cat_cols:
    df[col] = preprocessing.LabelEncoder().fit_transform(df[col])
    df[col]= df[col].astype('category')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
### take the relevent columns for the model and split back to train and test ###
df.columns

In [None]:
# keep only the cols for the model
df = df[
    [
        'Hour', 
        'AMorPM', 
        'Weekday',
        'pickup_latitude', 
        'pickup_longitude', 
        'dropoff_latitude', 
        'dropoff_longitude', 
        'passenger_count', 
        'dist_km'
    ]
]

df_train = df[:99990]
df_test = df[99990:]

In [None]:
### continuous values ###

from pandas.api.types import is_numeric_dtype

# Compute the means and stds of `self.cont_names` columns to normalize them
def Normalize(df):
  means, stds = {}, {}
  cont_names = [
      'pickup_latitude', 
      'pickup_longitude', 
      'dropoff_latitude', 
      'dropoff_longitude', 
      'passenger_count', 
      'dist_km'
  ]
  
  for n in cont_names:
    assert is_numeric_dtype(df[n]), (f"""Cannot normalize '{n}' column as it isn't numerical. Are you sure it doesn't belong in the categorical set of columns?""")
    means[n],stds[n] = df[n].mean(),df[n].std()
    df[n] = (df[n]-means[n]) / (1e-7 + stds[n])


Normalize(df_train)
Normalize(df_test)
X = df_train

In [None]:
df_train.head()

In [None]:
X.shape

In [None]:
X.info()

In [None]:
### train-valid split ###
from sklearn import model_selection

X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X, Y, test_size=0.20, random_state=42, shuffle=True
) 

X_train.head()

In [18]:
df.head()

Unnamed: 0.1,Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,0.0,2010-03-27 02:26:39.0000001,2010-03-27 02:26:39 UTC,-73.961598,40.716487,-73.960417,40.674963,2,4.618332,2010-03-26 22:26:39,22,1,0
1,1.0,2015-02-15 22:41:32.0000003,2015-02-15 22:41:32 UTC,-74.003448,40.733101,-74.008003,40.707214,1,2.903933,2015-02-15 18:41:32,18,1,3
2,2.0,2011-01-14 18:10:00.000000129,2011-01-14 18:10:00 UTC,-73.98637,40.74738,-73.975567,40.744572,2,0.962144,2011-01-14 14:10:00,14,1,0
3,3.0,2011-06-20 14:52:22.0000004,2011-06-20 14:52:22 UTC,-73.997892,40.724297,-73.977578,40.786941,1,7.172769,2011-06-20 10:52:22,10,0,1
4,4.0,2011-10-14 12:20:00.00000051,2011-10-14 12:20:00 UTC,-73.952797,40.780497,-73.955042,40.788527,5,0.912682,2011-10-14 08:20:00,8,0,0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109904 entries, 0 to 9913
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   Unnamed: 0         99990 non-null   float64       
 1   key                109904 non-null  object        
 2   pickup_datetime    109904 non-null  object        
 3   pickup_longitude   109904 non-null  float64       
 4   pickup_latitude    109904 non-null  float64       
 5   dropoff_longitude  109903 non-null  float64       
 6   dropoff_latitude   109903 non-null  float64       
 7   passenger_count    109904 non-null  int64         
 8   dist_km            109903 non-null  float64       
 9   EDTdate            109904 non-null  datetime64[ns]
 10  Hour               109904 non-null  category      
 11  AMorPM             109904 non-null  category      
 12  Weekday            109904 non-null  category      
dtypes: category(3), datetime64[ns](1), float64(6),

In [20]:
### take the relevent columns for the model and split back to train and test ###
df.columns

Index(['Unnamed: 0', 'key', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km', 'EDTdate', 'Hour', 'AMorPM', 'Weekday'],
      dtype='object')

In [21]:
# keep only the cols for the model
df = df[
    [
        'Hour', 
        'AMorPM', 
        'Weekday',
        'pickup_latitude', 
        'pickup_longitude', 
        'dropoff_latitude', 
        'dropoff_longitude', 
        'passenger_count', 
        'dist_km'
    ]
]

df_train = df[:99990]
df_test = df[99990:]

In [22]:
### continuous values ###

from pandas.api.types import is_numeric_dtype

# Compute the means and stds of `self.cont_names` columns to normalize them
def Normalize(df):
  means, stds = {}, {}
  cont_names = [
      'pickup_latitude', 
      'pickup_longitude', 
      'dropoff_latitude', 
      'dropoff_longitude', 
      'passenger_count', 
      'dist_km'
  ]
  
  for n in cont_names:
    assert is_numeric_dtype(df[n]), (f"""Cannot normalize '{n}' column as it isn't numerical. Are you sure it doesn't belong in the categorical set of columns?""")
    means[n],stds[n] = df[n].mean(),df[n].std()
    df[n] = (df[n]-means[n]) / (1e-7 + stds[n])


Normalize(df_train)
Normalize(df_test)
X = df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n] = (df[n]-means[n]) / (1e-7 + stds[n])


In [23]:
df_train.head()

Unnamed: 0,Hour,AMorPM,Weekday,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,passenger_count,dist_km
0,22,1,0,0.126016,-0.133438,0.125627,-0.141685,0.246944,-0.04035
1,18,1,3,0.128596,-0.137225,0.130781,-0.146169,-0.522203,-0.045066
2,14,1,0,0.130813,-0.13568,0.136751,-0.143113,0.246944,-0.050408
3,10,0,1,0.127229,-0.136722,0.143521,-0.143302,-0.522203,-0.033323
4,8,0,0,0.135955,-0.132642,0.143775,-0.141179,2.554387,-0.050544


In [24]:
X.shape

(99990, 9)

In [25]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99990 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Hour               99990 non-null  category
 1   AMorPM             99990 non-null  category
 2   Weekday            99990 non-null  category
 3   pickup_latitude    99990 non-null  float64 
 4   pickup_longitude   99990 non-null  float64 
 5   dropoff_latitude   99989 non-null  float64 
 6   dropoff_longitude  99989 non-null  float64 
 7   passenger_count    99990 non-null  float64 
 8   dist_km            99989 non-null  float64 
dtypes: category(3), float64(6)
memory usage: 5.6 MB


In [26]:
### train-valid split ###
from sklearn import model_selection

X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X, Y, test_size=0.20, random_state=42, shuffle=True
) 

X_train.head()

Unnamed: 0,Hour,AMorPM,Weekday,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,passenger_count,dist_km
58526,16,1,2,0.135145,-0.132729,0.135762,-0.143564,1.016092,-0.040195
38241,23,1,2,0.129786,-0.137378,0.132194,-0.134693,-0.522203,-0.024467
3806,23,1,4,0.129665,-0.13738,0.131407,-0.140626,-0.522203,-0.037359
27927,17,1,3,0.135288,-0.132924,0.143298,-0.141282,-0.522203,-0.050197
6006,19,1,2,0.133024,-0.135374,0.15377,-0.139646,-0.522203,-0.02384
