In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [4]:
import numpy as np
import pandas as pd
from scipy.fft import fft
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from sklearn.metrics import mean_squared_error

In [12]:
# Now you can import from src
from src.data_utils import load_and_process_taxi_data
rides = load_and_process_taxi_data(year=2023)

File already exists for 2023-01.
Loading data for 2023-01...
Total records: 3,066,766
Valid records: 2,993,140
Records dropped: 73,626 (2.40%)
Successfully processed data for 2023-01.
File already exists for 2023-02.
Loading data for 2023-02...
Total records: 2,913,955
Valid records: 2,845,058
Records dropped: 68,897 (2.36%)
Successfully processed data for 2023-02.
File already exists for 2023-03.
Loading data for 2023-03...
Total records: 3,403,766
Valid records: 3,331,705
Records dropped: 72,061 (2.12%)
Successfully processed data for 2023-03.
File already exists for 2023-04.
Loading data for 2023-04...
Total records: 3,288,250
Valid records: 3,214,922
Records dropped: 73,328 (2.23%)
Successfully processed data for 2023-04.
File already exists for 2023-05.
Loading data for 2023-05...
Total records: 3,513,649
Valid records: 3,435,875
Records dropped: 77,774 (2.21%)
Successfully processed data for 2023-05.
File already exists for 2023-06.
Loading data for 2023-06...
Total records: 3,30

In [13]:
from src.data_utils import transform_raw_data_into_ts_data

ts_data = transform_raw_data_into_ts_data(rides)
ts_data.head()

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2023-01-01 00:00:00,2,0
1,2023-01-01 01:00:00,2,0
2,2023-01-01 02:00:00,2,0
3,2023-01-01 03:00:00,2,0
4,2023-01-01 04:00:00,2,0


In [None]:
print(ts_data.dtypes)

Unnamed: 0,pickup_hour,pickup_location_id,rides
count,2277600,2277600.0,2277600.0
mean,2023-07-02 11:30:00.000000256,132.7231,16.44857
min,2023-01-01 00:00:00,2.0,0.0
25%,2023-04-02 05:45:00,66.75,0.0
50%,2023-07-02 11:30:00,133.5,0.0
75%,2023-10-01 17:15:00,198.25,2.0
max,2023-12-31 23:00:00,263.0,962.0
std,,75.87973,50.91858


In [7]:
from src.config import TRANSFORMED_DATA_DIR
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

In [8]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(55900, 674)
(55900,)
(31720, 674)
(31720,)


In [15]:
def extract_fft_features(time_series):
    # Extract numerical values from the time series
    numerical_values = time_series.astype(float).values
    fft_result = fft(numerical_values)
    fft_magnitude = np.abs(fft_result)
    return fft_magnitude[:len(fft_magnitude)//2]


In [16]:
fft_features = extract_fft_features(ts_data)

TypeError: Cannot cast DatetimeArray to dtype float64

In [None]:


# Model definitions
def fit_arma(data, order):
    model = ARIMA(data, order=(order[0], 0, order[1]))
    results = model.fit()
    return results

def fit_arima(data, order):
    model = ARIMA(data, order=order)
    results = model.fit()
    return results

def fit_prophet(data):
    df = pd.DataFrame({'ds': data.index, 'y': data.values})
    model = Prophet()
    model.fit(df)
    return model

# Main pipeline
def train_and_evaluate_models(data):
    experiment = mlflow_setup()
    fft_features = extract_fft_features(data)
    
    with mlflow.start_run(experiment_id=experiment.id):
        mlflow.log_param("data_shape", data.shape)
        mlflow.log_param("fft_features_shape", fft_features.shape)
        
        # ARMA
        logger.info("Fitting ARMA model...")
        arma_model = fit_arma(data, order=(2, 1))
        arma_predictions = arma_model.forecast(steps=len(data))
        arma_mse = mean_squared_error(data, arma_predictions)
        mlflow.log_metric("arma_mse", arma_mse)
        mlflow.log_metric("arma_aic", arma_model.aic)
        mlflow.sklearn.log_model(arma_model, "arma_model")
        
        # ARIMA
        logger.info("Fitting ARIMA model...")
        arima_model = fit_arima(data, order=(1, 1, 1))
        arima_predictions = arima_model.forecast(steps=len(data))
        arima_mse = mean_squared_error(data, arima_predictions)
        mlflow.log_metric("arima_mse", arima_mse)
        mlflow.log_metric("arima_aic", arima_model.aic)
        mlflow.sklearn.log_model(arima_model, "arima_model")
        
        # Prophet
        logger.info("Fitting Prophet model...")
        prophet_model = fit_prophet(data)
        future_dates = prophet_model.make_future_dataframe(periods=len(data))
        prophet_forecast = prophet_model.predict(future_dates)
        prophet_predictions = prophet_forecast['yhat'][-len(data):]
        prophet_mse = mean_squared_error(data, prophet_predictions)
        mlflow.log_metric("prophet_mse", prophet_mse)
        mlflow.pyfunc.log_model("prophet_model", python_model=prophet_model)
        
        # Log FFT features
        mlflow.log_param("fft_features", fft_features.tolist())
        
        logger.info("Model training and evaluation completed.")