In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split


In [8]:
dataset = pd.read_csv('../Project_datasets/cleaned.csv')

In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535 entries, 0 to 534
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0.1               535 non-null    int64  
 1   Unnamed: 0                 535 non-null    int64  
 2   date                       535 non-null    object 
 3   day_of_week                535 non-null    int64  
 4   is_weekend                 535 non-null    int64  
 5   day_of_month               535 non-null    int64  
 6   month                      535 non-null    int64  
 7   year                       535 non-null    int64  
 8   is_holiday                 535 non-null    int64  
 9   season                     535 non-null    int64  
 10  lag_1                      535 non-null    float64
 11  lag_2                      535 non-null    float64
 12  lag_7                      535 non-null    float64
 13  lag_14                     535 non-null    float64

### Model to try:
    XGBoost Regressor
    RandomForestRegressor
    PoissonRegressor

In [10]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import PoissonRegressor

from sklearn.metrics import r2_score, mean_squared_error

In [11]:
# Prepare dataset 
dataset = dataset.fillna(0)
X = dataset.drop(columns=['trip_count', 'Unnamed: 0', 'Unnamed: 0.1','date' ])
y = dataset['trip_count']



#### XGBoost

In [12]:

# Define and train XGBoost model

X_train_boost, X_test_boost, y_train_boost, y_test_boost = train_test_split(
    X, y, test_size=0.2, random_state=42)

model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train_boost, y_train_boost)

# Predict and evaluate
y_pred = model.predict(X_test_boost)
r2 = r2_score(y_test_boost, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_boost, y_pred))

print(f"RÂ² Score: {r2:.3f}")
print(f"RMSE: {rmse:.2f}")

RÂ² Score: -0.144
RMSE: 2.54


In [13]:
model.feature_importances_

array([0.0409667 , 0.        , 0.03329565, 0.03748166, 0.03195314,
       0.00231785, 0.        , 0.03865356, 0.03924443, 0.02743142,
       0.04562882, 0.02707353, 0.07452744, 0.05085611, 0.06911112,
       0.03953969, 0.05592115, 0.03992089, 0.04585561, 0.14615186,
       0.04155722, 0.05532159, 0.05719051], dtype=float32)

In [14]:
y_mean = y_test_boost.mean()
r2_baseline = r2_score(y_test_boost, [y_mean] * len(y_test_boost))
print(f"Baseline RÂ²: {r2_baseline:.3f}")

Baseline RÂ²: 0.000


In [15]:
# RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_boost, y_train_boost)
y_pred_rf = rf_model.predict(X_test_boost)

print("RF RÂ²:", r2_score(y_test_boost, y_pred_rf))
print("RF RMSE:", np.sqrt(mean_squared_error(y_test_boost, y_pred_rf)))

RF RÂ²: -0.02646581977899154
RF RMSE: 2.4034731489180596


In [16]:
# PoissonRegressor

model = PoissonRegressor(alpha=1e-3, max_iter=1000)
model.fit(X_train_boost, y_train_boost)
y_pred = model.predict(X_test_boost)

print("RÂ²:", r2_score(y_test_boost, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test_boost, y_pred)))

RÂ²: -1.5520238390909213e-05
RMSE: 2.372304313928292


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


### !!!The problem that amount of the rides is too low. Data was bined and try to predict with classifiers

### Models to try:
    XGBoost classifier
    Random Forest classifier


In [17]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix



In [40]:
data_bins = pd.read_csv('../Project_datasets/cleaned_bins_big.csv')

In [41]:
data_bins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 4000 non-null   int64  
 1   day_of_week                4000 non-null   int64  
 2   is_weekend                 4000 non-null   int64  
 3   day_of_month               4000 non-null   int64  
 4   month                      4000 non-null   int64  
 5   year                       4000 non-null   int64  
 6   is_holiday                 4000 non-null   int64  
 7   season                     4000 non-null   int64  
 8   lag_1                      4000 non-null   float64
 9   lag_2                      4000 non-null   float64
 10  lag_7                      4000 non-null   float64
 11  lag_14                     4000 non-null   float64
 12  lag_month                  4000 non-null   float64
 13  lag_year                   4000 non-null   float

In [42]:
# XGBClassifier
# Fill only numeric columns with 0

numeric_cols = data_bins.select_dtypes(include=['number']).columns
data_bins[numeric_cols] = data_bins[numeric_cols].fillna(0)

# Convert 'trip_bin' to categorical 
data_bins['trip_bin'] = data_bins['trip_bin'].astype('category')

# Prepare target and features
y_xg_clas = data_bins['trip_bin'].cat.codes
X_xg_clas = data_bins.drop(columns=[ 'trip_bin', 'Unnamed: 0'])

# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_xg_clas, y_xg_clas, test_size=0.2, random_state=4)

# Train XGBoost classifier
model = XGBClassifier(
    random_state=42,
    n_estimators=1000,
    early_stopping_rounds=10,
    eval_metric="mlogloss"
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],  # ðŸ‘ˆ This is required for early stopping
    verbose=False                 # Optional: suppress training logs
)

y_pred = model.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred, target_names=data_bins['trip_bin'].cat.categories))

              precision    recall  f1-score   support

  high (5â€“6)       0.98      0.98      0.98       205
   low (1â€“2)       0.96      0.96      0.96       191
   mid (3â€“4)       0.97      0.97      0.97       197
   peak (7+)       1.00      1.00      1.00       207

    accuracy                           0.98       800
   macro avg       0.98      0.98      0.98       800
weighted avg       0.98      0.98      0.98       800



In [43]:
from sklearn.metrics import accuracy_score

# Predict on training data
y_train_pred = model.predict(X_train)

# Accuracy on train and test
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Test Accuracy :", accuracy_score(y_test, y_pred))

Train Accuracy: 1.0
Test Accuracy : 0.97625
