# the NYC Taxi Fare dataset 
(often seen as train.csv or NYCTaxiFares.csv on Kaggle and other data portals) is a well-known dataset typically used to predict the fare amount for taxi rides in New York City based on various features such as pickup coordinates, drop-off coordinates, datetime, passenger count, and so on.

## Step 1: Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch libraries
import torch
import torch.nn as nn
import torch.optim as optim

# Set some display options (optional)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)


## Step 2: Load the Data
In this step, we’ll load the taxi fare data into a Pandas DataFrame. Replace the path/file name with your actual file location.

In [2]:
# Adjust file path as needed
df = pd.read_csv('Data/NYCTaxiFares.csv')
print(df.shape)
df.head()


(120000, 8)


Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


## Step 3: Data Cleaning and Exploration

In [3]:
df.isnull().sum()


pickup_datetime      0
fare_amount          0
fare_class           0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

3.2. Check for Outliers or Invalid Coordinates
Coordinates can sometimes be out of the normal NYC range:

In [4]:
# NYC coordinate bounds (approx):
# lat ~ [40.5, 41.8], lon ~ [-74.3, -73.0]

valid_lon_range = (-75, -72)
valid_lat_range = (40, 42)

df = df[(df['pickup_longitude'].between(*valid_lon_range)) & 
        (df['pickup_latitude'].between(*valid_lat_range)) & 
        (df['dropoff_longitude'].between(*valid_lon_range)) & 
        (df['dropoff_latitude'].between(*valid_lat_range))]

# Also remove obviously wrong fares or passenger counts
df = df[df['fare_amount'] > 0]
df = df[df['passenger_count'] > 0]


In [5]:
df.shape

(120000, 8)

3.3. Parse Datetime\
If your dataset has a pickup_datetime column, convert it to a proper datetime object:

In [6]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
df.dropna(subset=['pickup_datetime'], inplace=True)  # drop rows where datetime is invalid


In [7]:
df.shape

(120000, 8)

In [8]:
df['hour'] = df['pickup_datetime'].dt.hour
df['day'] = df['pickup_datetime'].dt.day
df['weekday'] = df['pickup_datetime'].dt.weekday
df['month'] = df['pickup_datetime'].dt.month

In [9]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,weekday,month
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,8,19,0,4
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,15,17,5,4
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,11,17,5,4
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,21,11,6,4
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,2,17,5,4


## Step 4: Feature Engineering
We need numeric features suitable for feeding into a neural network. Common features for taxi fare prediction include:

pickup_longitude
pickup_latitude
dropoff_longitude
dropoff_latitude
passenger_count
hour, day, weekday, month (extracted from pickup_datetime) \
distance between pickup and dropoff \
A common approach to measure distance is the Haversine formula (great-circle distance). Let’s define a small helper function:

In [10]:
def haversine_distance(lat1, lon1, lat2, lon2):
    # radius of earth in kilometers
    R = 6371

    # convert degrees to radians
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    # Haversine formula
    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c  # final distance in km

df['distance_km'] = haversine_distance(df['pickup_latitude'],
                                       df['pickup_longitude'],
                                       df['dropoff_latitude'],
                                       df['dropoff_longitude'])


In [11]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,weekday,month,distance_km
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,8,19,0,4,2.126312
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,15,17,5,4,1.392307
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,11,17,5,4,3.326763
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,21,11,6,4,1.864129
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,2,17,5,4,7.231321


In [12]:
# We can then drop columns we don’t need. Also, if you want to keep key or pickup_datetime for reference, that’s fine, but they’re often not used directly as features.
# Drop columns not used as features
df.drop(['pickup_datetime'], axis=1, inplace=True)

df.head()


Unnamed: 0,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,weekday,month,distance_km
0,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,8,19,0,4,2.126312
1,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,15,17,5,4,1.392307
2,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,11,17,5,4,3.326763
3,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,21,11,6,4,1.864129
4,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,2,17,5,4,7.231321


## Step 5: Train/Test Split
Since we’re doing a supervised learning task, we need to separate our dataset into training and test sets.

In [13]:
from sklearn.model_selection import train_test_split

# Our target is the fare_amount
X = df.drop('fare_amount', axis=1)
y = df['fare_amount']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape)


(96000, 11) (24000, 11)


## Step 6: Scaling the Features (Optional but Common)
Neural networks often work better when features are on a similar scale. You can use something like StandardScaler or MinMaxScaler. Here, we’ll demonstrate StandardScaler.

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [15]:
X_train_scaled

array([[-0.7071565 ,  0.05857342,  1.60289278, ...,  0.49345861,
         0.        , -0.50025633],
       [-0.7071565 , -0.14707899, -0.29595306, ..., -0.53157713,
         0.        , -0.64673776],
       [ 1.41411413, -0.14919067,  0.08188991, ...,  0.49345861,
         0.        ,  0.06615894],
       ...,
       [ 1.41411413, -0.85436571, -1.70192339, ...,  1.51849435,
         0.        ,  0.29078752],
       [-0.7071565 , -0.38809417, -0.79657188, ...,  1.00597648,
         0.        , -0.58727432],
       [-0.7071565 , -0.89546468, -0.38170739, ...,  1.00597648,
         0.        , -0.82155914]])

## Step 7: Build the PyTorch Dataset and Dataloader
PyTorch models typically expect data in the form of Dataset and DataLoader. Let’s define a simple dataset class:

In [16]:
from torch.utils.data import Dataset, DataLoader

class TaxiFareDataset(Dataset):
    def __init__(self, X, y):
        # Convert X and y to torch tensors
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)  # shape (N, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create dataset objects
train_dataset = TaxiFareDataset(X_train_scaled, y_train)
test_dataset = TaxiFareDataset(X_test_scaled, y_test)

# Create data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


## Step 8: Define the Model
We’ll build a simple feedforward network with a few fully connected (linear) layers. For a regression problem, the final layer will have 1 output neuron (predicting the fare). We can tweak the number of hidden layers and neurons as desired.

In [17]:
import torch
import torch.nn as nn

input_dim = X_train.shape[1]  # number of features
print(f'{input_dim=}')

class TaxiFareModel(nn.Module):
    def __init__(self, input_dim):
        super(TaxiFareModel, self).__init__()
        
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.LayerNorm(128),    # Layer normalization
            nn.ReLU(),
            nn.Dropout(0.2),      # Dropout

            nn.Linear(128, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(64, 32),
            nn.LayerNorm(32),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(32, 1)      # 1 output neuron for regression
        )
        
    def forward(self, x):
        return self.net(x)

model = TaxiFareModel(input_dim)



input_dim=11


## Step 9: Define Loss Function and Optimizer
For a regression task, a common choice is Mean Squared Error (MSE) or Smooth L1 (Huber loss). Let’s use MSELoss.

In [18]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Step 10: Training Loop
We’ll now define a function to train the model for a certain number of epochs. During each epoch, we’ll iterate over the training DataLoader, perform a forward pass, compute loss, backpropagate, and update weights.

In [19]:
num_epochs = 10  # increase this for better results

for epoch in range(num_epochs):
    model.train()  # set model to training mode
    
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * batch_X.size(0)
    
    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")


Epoch [1/10], Loss: 35.7064
Epoch [2/10], Loss: 10.6449
Epoch [3/10], Loss: 8.9599
Epoch [4/10], Loss: 8.5718
Epoch [5/10], Loss: 8.2612
Epoch [6/10], Loss: 7.9473
Epoch [7/10], Loss: 7.7492
Epoch [8/10], Loss: 7.5872
Epoch [9/10], Loss: 7.4423
Epoch [10/10], Loss: 7.2873


## Step 11: Evaluate the Model
Once trained, we can evaluate on the test set. We’ll switch to model.eval() mode and disable gradient calculation with torch.no_grad().

In [22]:
model.eval()
test_loss = 0.0

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        test_loss += loss.item() * batch_X.size(0)

test_loss /= len(test_dataset)
print(f"Test MSE: {test_loss:.4f}")
print(f"Test RMSE: {np.sqrt(test_loss):.4f}")


Test MSE: 5.2426
Test RMSE: 2.2897


## Step 12: Make Predictions
If you want to get predictions for some subset of your data (e.g., the test set), you can do:

In [23]:
model.eval()
with torch.no_grad():
    predicted_fares = []
    actual_fares = []
    
    for batch_X, batch_y in test_loader:
        pred = model(batch_X)
        predicted_fares.extend(pred.squeeze().tolist())
        actual_fares.extend(batch_y.squeeze().tolist())

# Let's look at the first 10 predictions vs actual
for i in range(10):
    print(f"Predicted: {predicted_fares[i]:.2f}, Actual: {actual_fares[i]:.2f}")


Predicted: 4.85, Actual: 4.10
Predicted: 6.82, Actual: 6.90
Predicted: 5.87, Actual: 5.70
Predicted: 19.32, Actual: 20.50
Predicted: 4.80, Actual: 4.50
Predicted: 7.72, Actual: 6.90
Predicted: 30.26, Actual: 29.20
Predicted: 4.60, Actual: 3.30
Predicted: 7.49, Actual: 7.70
Predicted: 8.10, Actual: 7.70


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        120000 non-null  float64
 1   fare_class         120000 non-null  int64  
 2   pickup_longitude   120000 non-null  float64
 3   pickup_latitude    120000 non-null  float64
 4   dropoff_longitude  120000 non-null  float64
 5   dropoff_latitude   120000 non-null  float64
 6   passenger_count    120000 non-null  int64  
 7   hour               120000 non-null  int32  
 8   day                120000 non-null  int32  
 9   weekday            120000 non-null  int32  
 10  month              120000 non-null  int32  
 11  distance_km        120000 non-null  float64
dtypes: float64(6), int32(4), int64(2)
memory usage: 9.2 MB
