# Random forest fitting
This script is inspired by the 02-experiment-tracking homework, where you fit a bunch of random-forest models to the taxi data and log both the fitted model and the dictionary vectorizer to MLflow.

## Starting up MLflow
To start up MLflow, run the following in bash from this folder:
```bash
mlflow server --backend-store-uri=sqlite:///mlflow.db --default-artifact-root=./artifacts_local/
```

Note that the default artifact root can instead point to an AWS S3 bucket, and that the backend store URI can instead point to a PostgreSQL database that has already been set up.

In [56]:
import pickle
import pandas as pd
import mlflow
import os
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import make_pipeline
import uuid # unique universal id

In [57]:
# Which slice of the taxi trip data to score.
taxi_type = 'green'
year = 2023
month = 1

# Trips to read, and the file the scored trips are written to.
input_file = f'../../data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet'
output_file = f'output/{taxi_type}/{year:04d}-{month:02d}.parquet'

# MLflow run whose model is applied; the RUN_ID environment variable
# overrides the default below.
RUN_ID = os.environ.get('RUN_ID', 'd4f23de4f6bb46d9a46893256a104d07')

## Setting up functions

In [58]:
def create_outfolder(output_file):
    """Create the directory that will hold ``output_file`` if it is missing.

    The directory part of ``output_file`` is joined onto the current working
    directory (``os.path.join`` leaves an absolute directory unchanged).

    Args:
        output_file: Path of the file that is about to be written.
    """
    target_dir = os.path.join(os.getcwd(), os.path.dirname(output_file))
    # exist_ok=True replaces the exists()-then-makedirs() sequence, which
    # could fail if the folder appeared between the check and the creation.
    os.makedirs(target_dir, exist_ok=True)

def read_dataframe(filename: str) -> pd.DataFrame:
    """Read a trips parquet file and keep rides lasting 1 to 60 minutes.

    Adds two columns to the data read from ``filename``:

    * ``duration``: drop-off minus pick-up time, in minutes.
    * ``ride_id``: a freshly generated random UUID string per row.

    Args:
        filename: Path of the parquet file; it must contain the columns
            ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime``.

    Returns:
        The filtered DataFrame (original index labels are kept).
    """
    df = pd.read_parquet(filename)

    # Vectorized conversion to minutes; also yields a float column when the
    # frame is empty, which a row-wise apply does not.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Copy the filtered rows so that adding ride_id below writes to an
    # independent frame instead of a slice of the original.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    df['ride_id'] = [str(uuid.uuid4()) for _ in range(len(df))]

    return df

def prepare_dictionaries(df: pd.DataFrame):
    """Turn each trip into a feature dictionary.

    Note that ``df`` is modified in place: ``PULocationID`` and
    ``DOLocationID`` are cast to strings and a combined ``PU_DO`` column
    (``"<pickup>_<dropoff>"``) is added.

    Args:
        df: Trips with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns.

    Returns:
        A list with one ``{'PU_DO': ..., 'trip_distance': ...}`` dict per row.
    """
    location_columns = ['PULocationID', 'DOLocationID']
    df[location_columns] = df[location_columns].astype(str)

    # Pickup/drop-off pair used as a single categorical feature.
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    feature_columns = ['PU_DO', 'trip_distance']
    return df[feature_columns].to_dict(orient='records')

def load_model(RUN_ID):
    """Load the MLflow pyfunc model stored under run ``RUN_ID``.

    The model is read from the local artifact folder of the sibling
    ``web-service-mlflow`` directory.

    Args:
        RUN_ID: Identifier of the MLflow run whose ``model`` artifact is loaded.

    Returns:
        The object returned by ``mlflow.pyfunc.load_model``.
    """
    # NOTE(review): the fixed "1" path segment is presumably the MLflow
    # experiment id - confirm before pointing this at another experiment.
    model_uri = f'../web-service-mlflow/artifacts_local/1/{RUN_ID}/artifacts/model'
    return mlflow.pyfunc.load_model(model_uri)

def apply_model(input_file, RUN_ID, output_file):
    """Score the trips in ``input_file`` and save the predictions.

    Args:
        input_file: Parquet file with the trips to score.
        RUN_ID: MLflow run whose model is used; also stored in the
            ``model_version`` column of the result.
        output_file: Parquet file the results are written to (its folder is
            created when missing).

    Returns:
        DataFrame with one row per scored trip: ride id, pickup time,
        pickup/drop-off locations, actual and predicted duration, their
        difference, and the model version.
    """
    trips = read_dataframe(input_file)
    features = prepare_dictionaries(trips)
    predictions = load_model(RUN_ID).predict(features)

    # Carry over the identifying columns; 'duration' is reported as
    # 'actual_duration'.
    df_result = trips[
        ['ride_id', 'lpep_pickup_datetime', 'PULocationID', 'DOLocationID', 'duration']
    ].rename(columns={'duration': 'actual_duration'})
    df_result['predicted_duration'] = predictions
    df_result['diff'] = df_result['actual_duration'] - df_result['predicted_duration']
    df_result['model_version'] = RUN_ID

    # Make sure the output folder exists, then persist the results.
    create_outfolder(output_file)
    df_result.to_parquet(output_file, index=False)

    return df_result

## Reading in the data and prepping dicts
This time we are neither training nor validating; we are just applying the model to generate predictions.

In [59]:
# Score the configured input file with the configured run's model and write
# the predictions to output_file.
df_result = apply_model(input_file, RUN_ID, output_file)


In [60]:
# Display the predictions table returned by apply_model.
df_result

Unnamed: 0,ride_id,lpep_pickup_datetime,PULocationID,DOLocationID,actual_duration,predicted_duration,diff,model_version
0,8e37d014-c0a8-4d67-b8b1-5afecaf3bfc0,2023-01-01 00:26:10,166,143,11.016667,14.916123,-3.899457,d4f23de4f6bb46d9a46893256a104d07
1,4d4d8383-7d97-4bf2-8037-092fe5ac60e0,2023-01-01 00:51:03,24,43,6.766667,11.094681,-4.328015,d4f23de4f6bb46d9a46893256a104d07
2,85a5b590-fcda-47a7-a613-ace344296935,2023-01-01 00:35:12,223,179,6.333333,21.668076,-15.334743,d4f23de4f6bb46d9a46893256a104d07
3,26647851-3414-4ff1-bbc5-7d455b6a1a5c,2023-01-01 00:13:14,41,238,5.816667,8.587919,-2.771252,d4f23de4f6bb46d9a46893256a104d07
4,b45ccb37-3465-42e2-a4c1-de9ab23a30b3,2023-01-01 00:33:04,41,74,5.966667,7.502883,-1.536216,d4f23de4f6bb46d9a46893256a104d07
...,...,...,...,...,...,...,...,...
68206,0e388d10-1925-443b-90f2-083eda0655c8,2023-01-31 22:29:00,49,62,13.000000,17.089738,-4.089738,d4f23de4f6bb46d9a46893256a104d07
68207,3954b034-ed19-452a-8ab8-6198c1552c98,2023-01-31 22:40:00,10,205,8.000000,12.828158,-4.828158,d4f23de4f6bb46d9a46893256a104d07
68208,15c87e23-e868-4cfe-b620-a4a9e55b93c9,2023-01-31 23:46:00,66,37,16.000000,18.984462,-2.984462,d4f23de4f6bb46d9a46893256a104d07
68209,b890b915-66ca-4073-a1de-5083fb7701c3,2023-01-31 23:01:00,225,189,18.000000,17.066313,0.933687,d4f23de4f6bb46d9a46893256a104d07
