In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Straight-line distance helper used to build the trip-distance feature.
def euclidean_distance(lat1, lon1, lat2, lon2):
    """Return the Euclidean distance between two (lat, lon) points.

    The value is the square root of the summed squared coordinate
    differences, in the same units as the inputs. Only element-wise
    arithmetic and ``np.sqrt`` are used, so scalars, NumPy arrays and
    pandas Series are handled alike.

    Args:
        lat1, lon1: Coordinates of the first point.
        lat2, lon2: Coordinates of the second point.

    Returns:
        The distance, with the shape that results from broadcasting the inputs.
    """
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1
    return np.sqrt(delta_lat ** 2 + delta_lon ** 2)

# Read the raw taxi-fare records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='NYC_taxi_fares.csv')

# Preprocessing and feature engineering happen in the cells below:
# null handling, coordinate filtering, and the distance feature.




In [None]:
# Display the column labels of the loaded frame.
df.columns


Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count'],
      dtype='object')

In [None]:
# Count missing values per column before imputing.
null_counts = df.isnull().sum()

# Replace missing numeric values with the mean of their column.
# numeric_only=True restricts the mean to numeric columns. Without it pandas
# also tries to average the non-numeric columns, which emits a FutureWarning
# on older versions and raises TypeError on pandas >= 2.0.
df = df.fillna(df.mean(numeric_only=True))

# Recount to confirm the imputation removed the nulls.
null_counts_after_fillna = df.isnull().sum()

# Print counts of nulls before and after replacement
print('Null counts before replacement:')
print(null_counts)

print('\nNull counts after replacement:')
print(null_counts_after_fillna)

Null counts before replacement:
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     1
pickup_latitude      1
dropoff_longitude    3
dropoff_latitude     3
passenger_count      1
dtype: int64

Null counts after replacement:
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64


  df.fillna(df.mean(), inplace=True)


In [None]:
# Drop rows whose pickup or drop-off coordinates are missing or zero.
# dropna() alone never removes zero-valued rows, so the zero filter is applied
# explicitly; otherwise the report printed below would describe a step that
# never ran.
coordinate_columns = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
df = df.dropna(subset=coordinate_columns, how='any')
df = df[(df[coordinate_columns] != 0).all(axis=1)]

# Identify null values after dropping rows
null_counts_after_drop = df.isnull().sum()

# Print counts of nulls after dropping rows
print('Null counts after dropping rows with zero values:')
print(null_counts_after_drop)

Null counts after dropping rows with zero values:
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64


In [None]:
# Keep only the trips whose four pickup/drop-off coordinates are all non-zero.
df = df[
    df[['pickup_latitude', 'pickup_longitude',
        'dropoff_latitude', 'dropoff_longitude']].ne(0).all(axis=1)
]

In [None]:
# Recount the nulls per column after the zero-coordinate rows were removed.
null_counts_after_drop = df.isna().sum()

# Emit the heading and the counts on separate lines in a single call.
print('Null counts after dropping rows with zero values:', null_counts_after_drop, sep='\n')

Null counts after dropping rows with zero values:
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64


In [None]:
# NOTE(review): in the visible notebook, X is first assigned in the
# "Split the data into features (X) and labels (y)" cell further down, so this
# cell raises NameError on a fresh kernel unless that cell has been run first.
# Preview the first 14 rows of the feature frame.
X.head(14)

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,-73.844311,40.721319,-73.84161,40.712278,1.0
1,-74.016048,40.711303,-73.979268,40.782004,1.0
2,-73.982738,40.76127,-73.991242,40.750562,2.0
3,-73.98713,40.733143,-73.991567,40.758092,1.0
4,-73.968095,40.768008,-73.956655,40.783762,1.0
5,-74.000964,40.73163,-73.972892,40.758233,1.0
6,-73.980002,40.751662,-73.973802,40.764842,1.0
7,-73.9513,40.774138,-73.990095,40.751048,1.0
8,-74.006462,40.726713,-73.993078,40.731628,1.0
9,-73.980658,40.733873,-73.99154,40.758138,2.0


In [None]:
# NOTE(review): in the visible notebook, y is first assigned in the
# "Split the data into features (X) and labels (y)" cell further down, so this
# cell raises NameError on a fresh kernel unless that cell has been run first.
# Preview the first rows of the label series.
y.head()

0     4.5
1    16.9
2     5.7
3     7.7
4     5.3
Name: fare_amount, dtype: float64

In [None]:
# Split the data into features (X) and labels (y).
# Every non-feature column is dropped in a single call. Dropping them one at a
# time from `df` and reassigning X each time keeps only the last drop, which
# leaves the target `fare_amount` (label leakage) and the `key` column in X.
X = df.drop(columns=['fare_amount', 'key', 'pickup_datetime'])  # Features
y = df['fare_amount']  # Labels

In [None]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Add the pickup -> drop-off Euclidean distance as an extra feature.
# assign() builds new frames rather than writing a column into the frames
# returned by train_test_split, which can trigger SettingWithCopyWarning.
X_train = X_train.assign(
    euclidean_distance=euclidean_distance(X_train['pickup_latitude'], X_train['pickup_longitude'],
                                          X_train['dropoff_latitude'], X_train['dropoff_longitude'])
)
X_test = X_test.assign(
    euclidean_distance=euclidean_distance(X_test['pickup_latitude'], X_test['pickup_longitude'],
                                          X_test['dropoff_latitude'], X_test['dropoff_longitude'])
)

# Standardize the features. The scaler is fit on the training split only and
# then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape to the 3-D (samples, timesteps, features) layout the LSTM expects.
# With timesteps = 1, every row becomes a one-step sequence holding all features.
timesteps = 1  # Adjust based on the actual number of timesteps
X_train_reshaped = X_train_scaled.reshape(-1, timesteps, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(-1, timesteps, X_test_scaled.shape[1])

# Define the LSTM model: one recurrent layer followed by a single-unit output.
model = Sequential()
model.add(LSTM(100, input_shape=(timesteps, X_train_reshaped.shape[2])))
model.add(Dense(1))  # Single neuron for regression


In [None]:

# Configure training: mean-squared-error loss, Adam optimizer, MAE as a tracked metric.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Fit on the training sequences, holding out 10% of them for validation.
model.fit(
    x=X_train_reshaped,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)

# Score the held-out test split; the result unpacks into the loss and the MAE metric.
loss, mae = model.evaluate(x=X_test_reshaped, y=y_test)
print('Mean Absolute Error on Test Data:', mae)

# Make predictions
predictions = model.predict(x=X_test_reshaped)

# The target was never scaled (the scaler was fit on the feature matrix only),
# so the predictions need no inverse transform before being compared to y_test.

# Recompute the MAE from the raw predictions as a cross-check of the metric above.
mae_predictions = mean_absolute_error(y_true=y_test, y_pred=predictions)
print('Mean Absolute Error on Predictions:', mae_predictions)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean Absolute Error on Test Data: 2.3455440998077393
Mean Absolute Error on Predictions: 2.345543069387414
