## 1. Load and Preprocess Data

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error


# Load the Citi Bike trip sample from the CSV in the working directory.
# The station-id columns are read explicitly as strings so that every value in
# them has one consistent type.
# NOTE(review): these id columns are presumably what triggers the mixed-dtype
# warning pandas emits on a plain read_csv of this file -- confirm.
df = pd.read_csv(
    'sample_citibike_2023.csv',
    dtype={'start_station_id': str, 'end_station_id': str},
)

# Drop the first column (the index saved alongside the data)
df = df.iloc[:, 1:]

# Check data types and non-null values.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df.info()

  df = pd.read_csv('sample_citibike_2023.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   ride_id             1000000 non-null  object 
 1   rideable_type       1000000 non-null  object 
 2   started_at          1000000 non-null  object 
 3   ended_at            1000000 non-null  object 
 4   start_station_name  999459 non-null   object 
 5   start_station_id    999459 non-null   object 
 6   end_station_name    997192 non-null   object 
 7   end_station_id      997192 non-null   object 
 8   start_lat           1000000 non-null  float64
 9   start_lng           1000000 non-null  float64
 10  end_lat             999287 non-null   float64
 11  end_lng             999287 non-null   float64
 12  member_casual       1000000 non-null  object 
dtypes: float64(4), object(9)
memory usage: 99.2+ MB
None


In [20]:
# Preprocess the trips: parse the timestamps, derive the trip duration
# (the regression target), drop invalid and outlier rows, then add the
# time-based and station-based features.

# Convert 'started_at' and 'ended_at' to datetime
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

# Calculate trip duration in minutes
df['trip_duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60

# Remove rows with negative or zero durations
df = df[df['trip_duration'] > 0]

# Fill missing categorical values.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form operates on a temporary Series,
# which pandas warns about and which leaves df unchanged under Copy-on-Write.
df['start_station_name'] = df['start_station_name'].fillna('Unknown')
df['end_station_name'] = df['end_station_name'].fillna('Unknown')

# Remove outliers using the IQR method (keep values within 1.5 * IQR of the quartiles)
Q1 = df['trip_duration'].quantile(0.25)
Q3 = df['trip_duration'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['trip_duration'] >= lower_bound) & (df['trip_duration'] <= upper_bound)]

# Feature engineering
df['hour'] = df['started_at'].dt.hour
df['day_of_week'] = df['started_at'].dt.dayofweek
# dayofweek is 0 (Monday) .. 6 (Sunday), so values >= 5 are Saturday/Sunday.
# The vectorized comparison replaces a per-row Python lambda.
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

# Encode categorical variables.
# Each column gets its own encoder so both fitted mappings stay available
# afterwards (e.g. for inverse_transform); refitting a single shared encoder
# would discard the start-station mapping.
start_station_encoder = LabelEncoder()
end_station_encoder = LabelEncoder()
df['start_station_name_encoded'] = start_station_encoder.fit_transform(df['start_station_name'])
df['end_station_name_encoded'] = end_station_encoder.fit_transform(df['end_station_name'])

# Keep the original name bound to the encoder fitted on the end stations,
# which is the state the shared encoder used to end up in.
label_encoder = end_station_encoder


## 2. Prepare Features and Target

In [21]:
# Column to predict (trip duration in minutes).
target = 'trip_duration'

# Model inputs, grouped by kind: trip coordinates, time-of-ride fields,
# and the label-encoded station names.
coordinate_columns = ['start_lat', 'start_lng', 'end_lat', 'end_lng']
time_columns = ['hour', 'day_of_week', 'is_weekend']
station_columns = ['start_station_name_encoded', 'end_station_name_encoded']
features = coordinate_columns + time_columns + station_columns

# Feature matrix and target vector
X, y = df[features], df[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 3. Hyperparameter Tuning with RandomizedSearchCV

In [23]:
# Candidate hyperparameter values the randomized search draws from
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
    bootstrap=[True],
)

# Base estimator whose hyperparameters are being tuned
rf = RandomForestRegressor(random_state=123)

# Randomized search: 10 sampled combinations, each scored with 2-fold
# cross-validation on negated MSE (higher is better), using every core.
random_search = RandomizedSearchCV(
    rf,
    param_grid,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=2,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Keep the winning estimator and the settings that produced it
best_model = random_search.best_estimator_
best_params = random_search.best_params_

print("Best Hyperparameters:", best_params)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}


## 4. Evaluate the Tuned Model

In [24]:
# Score the tuned forest on the held-out test split
y_pred_tuned = best_model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is in the
# same units as the target (minutes)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse_tuned = np.sqrt(mse_tuned)
print(f'Root Mean Squared Error (Tuned Model): {rmse_tuned}')


Root Mean Squared Error (Tuned Model): 4.66569918151125
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  56.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  59.3s
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 1.9min
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  56.9s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  59.7s
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 1.9min
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time= 

## 5. Baseline Comparison

In [27]:
# Naive baseline: predict the training-set mean duration for every test row
mean_train_duration = y_train.mean()
baseline_pred = [mean_train_duration] * len(y_test)

# RMSE of the constant prediction, for comparison with the tuned model
baseline_mse = mean_squared_error(y_test, baseline_pred)
baseline_rmse = np.sqrt(baseline_mse)
print(f'Baseline RMSE: {baseline_rmse}')

Baseline RMSE: 7.320116425746809


## 6. R² Score (Coefficient of Determination)

In [29]:
from sklearn.metrics import r2_score

# Coefficient of determination of the tuned model's predictions on the test split
r2 = r2_score(y_true=y_test, y_pred=y_pred_tuned)
print(f'R² Score: {r2}')

R² Score: 0.5937458253159373
