<a href="https://colab.research.google.com/github/ashmeenkhaira/ML-models/blob/main/taxi_fare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install opendatasets --quiet

In [2]:
import opendatasets as od
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error

In [3]:
dataset_url = 'https://www.kaggle.com/competitions/new-york-city-taxi-fare-prediction/data'

In [4]:
od.download(dataset_url)

Extracting archive ./new-york-city-taxi-fare-prediction/new-york-city-taxi-fare-prediction.zip to ./new-york-city-taxi-fare-prediction


In [5]:
data_dir = 'new-york-city-taxi-fare-prediction'

In [6]:
!head {data_dir}/train.csv

key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:45 UTC,-74.000964,40.73163,-73.972892,40.758233,1
2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1
2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1
2012-12-03 13:10:00.000000125,9,2012-12-03 13:10:00 UTC,-74.006462,40.7267



*   training data = 5.5GB in size
*   rows in traing data=5.5 million
*   unique columns
*   key
*   fare_amount
*   pickup_longitude
*   pickup_latitude
*   dropoff_longitude
*   passenger_count
*   dropoff_latitude
*   submission file should contain key and fare_amount

In [7]:
select_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')
select_cols

['fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']

In [8]:
sample_fraction = 0.01
dtypes={'fare_amount':'float32',
 'pickup_longitude':'float32',
 'pickup_latitude':'float32',
 'dropoff_longitude':'float32',
 'dropoff_latitude':'float32',
 'passenger_count':'uint8'}

def skip_row(row_idx):
  if row_idx==0:
    return False
  return random.random()> sample_fraction
random.seed(42)

In [10]:
df=pd.read_csv(data_dir+'/train.csv',usecols=select_cols,parse_dates =['pickup_datetime'], dtype=dtypes,skiprows=skip_row)

In [12]:
test_df = pd.read_csv(data_dir+'/test.csv',dtype=dtypes)

In [13]:
df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,13.500000,2013-07-06 17:18:00+00:00,-73.950653,40.783283,-73.984367,40.769802,1
1,9.000000,2015-02-15 22:41:32+00:00,-74.003448,40.733101,-74.008003,40.707214,1
2,5.700000,2011-12-13 05:56:00+00:00,-73.990494,40.755981,-73.974205,40.754211,4
3,5.300000,2012-04-02 08:02:16+00:00,-73.947998,40.778542,-73.958565,40.778484,2
4,13.000000,2013-11-07 10:19:35+00:00,-73.985802,40.777477,-73.987411,40.754368,1
...,...,...,...,...,...,...,...
553883,23.299999,2012-03-25 00:55:00+00:00,-73.888145,40.849079,-73.865898,40.849117,1
553884,6.000000,2014-10-15 17:36:07+00:00,-73.988045,40.759235,-73.985985,40.749264,1
553885,13.500000,2015-03-06 10:16:40+00:00,-73.951881,40.809135,-73.942055,40.841862,2
553886,5.500000,2014-02-26 08:46:00+00:00,-74.004311,40.752426,-74.008064,40.742065,1


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553888 entries, 0 to 553887
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   fare_amount        553888 non-null  float32            
 1   pickup_datetime    553888 non-null  datetime64[ns, UTC]
 2   pickup_longitude   553888 non-null  float32            
 3   pickup_latitude    553888 non-null  float32            
 4   dropoff_longitude  553884 non-null  float32            
 5   dropoff_latitude   553884 non-null  float32            
 6   passenger_count    553888 non-null  uint8              
dtypes: datetime64[ns, UTC](1), float32(5), uint8(1)
memory usage: 15.3 MB


In [15]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,553888.0,553888.0,553888.0,553884.0,553884.0,553888.0
mean,11.325466,-72.539421,39.910149,-72.531006,39.897808,1.687171
std,9.752986,15.070677,8.741974,13.201148,8.927755,1.33993
min,-52.0,-3442.05957,-2145.906006,-3370.710449,-2530.357422,0.0
25%,6.0,-73.992081,40.734909,-73.991379,40.733974,1.0
50%,8.5,-73.981804,40.752659,-73.98011,40.753166,1.0
75%,12.5,-73.967087,40.767143,-73.963676,40.768139,2.0
max,444.440002,40.813038,3334.677734,40.827824,458.649994,208.0


In [16]:
df['pickup_datetime'].min(),df['pickup_datetime'].max()

(Timestamp('2009-01-01 00:04:54+0000', tz='UTC'),
 Timestamp('2015-06-30 23:56:42+0000', tz='UTC'))

In [17]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   key                9914 non-null   object 
 1   pickup_datetime    9914 non-null   object 
 2   pickup_longitude   9914 non-null   float32
 3   pickup_latitude    9914 non-null   float32
 4   dropoff_longitude  9914 non-null   float32
 5   dropoff_latitude   9914 non-null   float32
 6   passenger_count    9914 non-null   uint8  
dtypes: float32(4), object(2), uint8(1)
memory usage: 319.6+ KB


In [18]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [19]:
len(train_df),len(val_df)

(443110, 110778)

In [20]:
train_df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [21]:
input_cols = ['pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

In [22]:
target_cols = 'fare_amount'

In [23]:
train_input = train_df[input_cols]
train_target = train_df[target_cols]

In [24]:
val_input = val_df[input_cols]
val_target = val_df[target_cols]

In [25]:
test_input=test_df[input_cols]

In [36]:
class MeanRegressor:
  def fit(self, inputs, targets):
    self.mean = targets.mean()
  def predict(self, inputs):
    return np.full(inputs.shape[0],self.mean)

In [37]:
mean_model = MeanRegressor()

In [38]:
mean_model.fit(train_input, train_target)

In [39]:
mean_model.mean

np.float32(11.327026)

In [40]:
train_preds = mean_model.predict(train_input)

In [41]:
train_preds

array([11.327026, 11.327026, 11.327026, ..., 11.327026, 11.327026,
       11.327026], dtype=float32)

In [42]:
val_preds=mean_model.predict(val_input)

In [43]:
val_preds

array([11.327026, 11.327026, 11.327026, ..., 11.327026, 11.327026,
       11.327026], dtype=float32)