In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd
from tqdm.notebook import tqdm

# --- Load the raw trip records ---
lyft_data = pd.read_csv('lyft_data.csv')

# --- Time-based features ---
# Parse both timestamps, then derive the trip length in minutes together with
# the weekday index and clock hour of the pickup. `assign` evaluates the
# keyword arguments in order, so later entries see the parsed timestamps.
lyft_data = lyft_data.assign(
    tpep_pickup_datetime=lambda trips: pd.to_datetime(trips['tpep_pickup_datetime']),
    tpep_dropoff_datetime=lambda trips: pd.to_datetime(trips['tpep_dropoff_datetime']),
    trip_duration=lambda trips: (
        trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    ).dt.total_seconds() / 60,
    pickup_day_of_week=lambda trips: trips['tpep_pickup_datetime'].dt.dayofweek,
    pickup_hour=lambda trips: trips['tpep_pickup_datetime'].dt.hour,
)

# Peak-hour flag: pickup hour is 7, 8 or 9, or 16 through 19 (both ends
# inclusive), and the day-of-week index is below 5.
pickup_hours = lyft_data['pickup_hour']
in_morning_window = pickup_hours.between(7, 9)
in_evening_window = pickup_hours.between(16, 19)
on_weekday = lyft_data['pickup_day_of_week'] < 5
lyft_data['is_peak_hour'] = ((in_morning_window | in_evening_window) & on_weekday).astype(int)

# --- Filter implausible trips ---
# Keep only rows whose duration is strictly positive and at most 120 minutes.
duration_minutes = lyft_data['trip_duration']
lyft_data = lyft_data[duration_minutes.gt(0) & duration_minutes.le(120)]

# --- Assemble the design matrix and the target ---
feature_columns = ['trip_distance', 'pickup_day_of_week', 'pickup_hour', 'is_peak_hour']
X = lyft_data[feature_columns]
y = lyft_data['total_amount']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Fit and evaluate ---
# `fit` returns the estimator itself, so `lr` is the fitted model
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Root of the mean squared error on the held-out rows
test_mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(test_mse)

# Learned parameters
coefficients = lr.coef_
intercept = lr.intercept_

# --- Report ---
print('Linear Regression RMSE:', rmse)
print('Coefficients:\n', list(zip(X.columns, coefficients)))
print('Intercept:', intercept)

Linear Regression RMSE: 5.499938665302406
Coefficients:
 [('trip_distance', 3.34003914181955), ('pickup_day_of_week', 1.1565627113032044), ('pickup_hour', 0.046222274014956044), ('is_peak_hour', -0.5061220958869775)]
Intercept: 3.092750109411128


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


#### The Linear Regression model has been trained to predict the cost of a Lyft trip based on trip distance, pickup day of the week, pickup hour, and whether it is peak hour. The model's performance is summarized by an RMSE of 5.4999.
Here are the model's parameters.

The coefficients for each feature are:

- Trip Distance: 3.3400
- Pickup Day of the Week: 1.1566
- Pickup Hour: 0.0462
- Is Peak Hour: -0.5061

The intercept of the model is 3.0928.
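These parameters indicate how much the predicted trip cost changes for each one-unit increase in the respective feature, holding the other features constant. Note that the pickup day of the week (0 = Monday through 6 = Sunday) and the pickup hour enter the model as plain integers, so their coefficients assume the same linear change from one day or hour to the next. The negative coefficient for the peak-hour flag suggests that, all else being equal, trips during peak hours are associated with a slightly lower cost, which may seem counterintuitive and could be an area for further investigation.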

In [3]:
# Persist the fitted estimator to disk
import joblib

# The Streamlit app reads this file back to serve predictions
joblib.dump(value=lr, filename='linear_regression_model.joblib')

print('Model saved as linear_regression_model.joblib')

Model saved as linear_regression_model.joblib


In [4]:
# Write the Streamlit app script.
#
# The generated app loads the saved model and predicts from a one-row
# DataFrame whose column names match the columns the model was fitted on.
# The model was fitted on a DataFrame, so predicting from an unnamed array
# makes scikit-learn warn about missing feature names on every prediction and
# leaves the column order unchecked.
app_code = """
import streamlit as st
import joblib
import pandas as pd

# Feature columns, in the order the model was fitted on
FEATURE_COLUMNS = ['trip_distance', 'pickup_day_of_week', 'pickup_hour', 'is_peak_hour']

# Position in this list is the model's day encoding (Monday = 0 ... Sunday = 6)
DAYS_OF_WEEK = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Load the trained Linear Regression model
model = joblib.load('linear_regression_model.joblib')

# Define the Streamlit app
st.title('Lyft Trip Cost Predictor')

# Input fields
trip_distance = st.number_input('Enter the trip distance (in miles):', min_value=0.0, format='%f')
pickup_day_of_week = st.selectbox('Select the pickup day of the week:', options=DAYS_OF_WEEK, index=0)
pickup_hour = st.slider('Select the pickup hour:', 0, 23, 0)
# NOTE: during training this flag was derived from the pickup day and hour;
# here it is whatever the user ticks, so it can disagree with those inputs.
is_peak_hour = st.checkbox('Is it peak hour?')

# Convert inputs to model format
pickup_day_of_week = DAYS_OF_WEEK.index(pickup_day_of_week)
is_peak_hour = int(is_peak_hour)

# Predict button
if st.button('Predict Cost'):
    # Build a single-row frame with the training column names
    features = pd.DataFrame(
        [[trip_distance, pickup_day_of_week, pickup_hour, is_peak_hour]],
        columns=FEATURE_COLUMNS,
    )
    prediction = model.predict(features)

    # Display the prediction
    st.success(f'The estimated trip cost is: ${prediction[0]:.2f}')
"""

# Save the Streamlit app script to a file (explicit encoding so the result
# does not depend on the platform default)
with open('streamlit_app.py', 'w', encoding='utf-8') as file:
    file.write(app_code)

print('Streamlit app script saved as streamlit_app.py')

Streamlit app script saved as streamlit_app.py
