In [None]:
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

from mlflow import (
    set_tracking_uri,
    set_experiment,
    start_run,
    set_tag,
    log_metric,
    log_param,
    log_artifact,
)
from mlflow.sklearn import log_model

: 

# 0 - Download Data

In [None]:
import urllib.request
from pathlib import Path

# Base URL the monthly FHV trip-data parquet files are fetched from.
TRIP_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data"

# Local copies of the January (train) and February (test) 2021 files.
train_path = "data/fhv_tripdata_2021-01.parquet"
test_path = "data/fhv_tripdata_2021-02.parquet"

for local_path in (train_path, test_path):
    destination = Path(local_path)
    # urlretrieve does not create missing directories, so make sure data/ exists.
    destination.parent.mkdir(parents=True, exist_ok=True)
    if destination.exists():
        # Already downloaded: skip so re-running the notebook stays cheap.
        continue
    # Download under a temporary name first so an interrupted transfer never
    # leaves a truncated file that the exists() check above would accept.
    partial = destination.with_name(destination.name + ".part")
    urllib.request.urlretrieve(f"{TRIP_DATA_URL}/{destination.name}", partial)
    partial.replace(destination)

# 1 - Load data

In [2]:
# Read the training month (the file at `train_path`) into a DataFrame.
train_df = pd.read_parquet(train_path)

In [3]:
# Preview the first rows to check the available columns and their values.
train_df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


# 2 - Prepare the data

## 2-1 Compute the target

We compute the duration of each trip in minutes (drop-off time minus pickup time).

In [4]:
def compute_target(df):
    """Return a copy of ``df`` with a ``duration`` column expressed in minutes.

    The duration is ``dropOff_datetime - pickup_datetime``, converted from a
    timedelta to fractional minutes.

    Args:
        df: DataFrame with datetime-like ``pickup_datetime`` and
            ``dropOff_datetime`` columns.

    Returns:
        A new DataFrame with the extra (or overwritten) ``duration`` column.
        The frame passed in is not modified, so the function can be re-run on
        the same input without side effects.
    """
    elapsed = df["dropOff_datetime"] - df["pickup_datetime"]
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(duration=elapsed.dt.total_seconds() / 60)


# Add the `duration` target column (minutes) to the training frame.
train_df = compute_target(train_df)

In [5]:
# Summary statistics of the target; compare min/max with the quartiles to spot outliers.
train_df["duration"].describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

Let's remove outliers and restrict the scope to trips lasting between 1 minute and 1 hour (both bounds inclusive).

In [6]:
# Default inclusive bounds on trip duration, in minutes.
MIN_DURATION = 1
MAX_DURATION = 60


def filter_outliers(df, min_duration=MIN_DURATION, max_duration=MAX_DURATION):
    """Keep only the rows whose ``duration`` lies within the given bounds.

    Args:
        df: DataFrame with a numeric ``duration`` column.
        min_duration: Lowest duration to keep (inclusive).
        max_duration: Highest duration to keep (inclusive).

    Returns:
        The rows of ``df`` that satisfy
        ``min_duration <= duration <= max_duration``, with their original index.
    """
    within_bounds = df["duration"].between(min_duration, max_duration)
    return df[within_bounds]


# Keep only training trips whose duration is within [MIN_DURATION, MAX_DURATION].
train_df = filter_outliers(train_df)

## 2-2 Prepare features

### 2-2-1 Categorical features

In [7]:
# Columns treated as categorical features by the model.
CATEGORICAL_COLS = ["PUlocationID", "DOlocationID"]


def encode_categorical_cols(df):
    """Return a copy of ``df`` whose categorical columns are string-encoded.

    Missing values in ``CATEGORICAL_COLS`` become ``-1``, the values are cast
    to integers (dropping the ``.0`` of float IDs) and then to strings, e.g.
    ``72.0 -> "72"`` and ``NaN -> "-1"``.

    Args:
        df: DataFrame containing every column listed in ``CATEGORICAL_COLS``.

    Returns:
        A new DataFrame with the encoded columns. The input is left untouched:
        it may be a filtered selection of another frame, and assigning columns
        on such a selection raises ``SettingWithCopyWarning`` in pandas.
    """
    encoded = df[CATEGORICAL_COLS].fillna(-1).astype("int").astype("str")
    # assign() creates a new frame rather than writing into the caller's object.
    return df.assign(**{col: encoded[col] for col in CATEGORICAL_COLS})


# Convert the location IDs of the training frame to strings (missing values become "-1").
train_df = encode_categorical_cols(train_df)

In [8]:
def extract_x_y(df, dv=None):
    """Build the feature matrix and target vector from a prepared frame.

    Args:
        df: DataFrame holding the ``CATEGORICAL_COLS`` columns and a
            ``duration`` column.
        dv: Already-fitted ``DictVectorizer`` to reuse. When ``None``, a new
            vectorizer is created and fitted on ``df``.

    Returns:
        A tuple ``(X, y, dv)``: the output of ``dv.transform`` on the row
        dictionaries, the ``duration`` values, and the vectorizer that was used.
    """
    records = df[CATEGORICAL_COLS].to_dict(orient="records")
    # No vectorizer supplied: create one and fit it on this frame's records.
    vectorizer = DictVectorizer().fit(records) if dv is None else dv
    features = vectorizer.transform(records)
    target = df["duration"].values
    return features, target, vectorizer


# No vectorizer is passed, so one is fitted on the training frame; keep `dv` to encode the test set later.
X_train, y_train, dv = extract_x_y(train_df)

# 3 - Train model

In [9]:
# Fit a linear regression (default settings) on the encoded training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# 4 - Evaluate model

## 4-1 On train data

In [10]:
# Root mean squared error on the training set.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases.
y_pred = lr.predict(X_train)
train_me = mean_squared_error(y_train, y_pred) ** 0.5
train_me

10.528519429062042

## 4-2 On test data

In [11]:
# Read the held-out month (the file at `test_path`) into a DataFrame.
test_df = pd.read_parquet(test_path)

In [13]:
# Run the test frame through the same preparation steps as the training frame,
# in the same order: target, outlier filter, categorical encoding.
test_df = (
    test_df
    .pipe(compute_target)
    .pipe(filter_outliers)
    .pipe(encode_categorical_cols)
)
# Reuse the vectorizer fitted on the training data so both matrices share columns.
X_test, y_test, _ = extract_x_y(test_df, dv=dv)

In [15]:
# Root mean squared error on the test set.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases.
y_pred_test = lr.predict(X_test)
test_me = mean_squared_error(y_test, y_pred_test) ** 0.5
test_me

11.014285430787638

# 5 - Log Model Parameters to MLflow

In [None]:
# Record this training run in MLflow: a tag, the preprocessing parameters,
# both error metrics and the fitted regressor.
# NOTE(review): set_tracking_uri is imported but never called in this notebook,
# so the run goes to whatever tracking location the environment configures.
mlflow_experiment_path = "/mlflow/linear_reg_test"
set_experiment(mlflow_experiment_path)
with start_run():
    set_tag("Level", "Development")

    # Parameters: how the data was prepared. The bounds make the boolean
    # `filtered_outliers` flag reproducible.
    log_param("filtered_outliers", True)
    log_param("min_duration", MIN_DURATION)
    log_param("max_duration", MAX_DURATION)

    # Metrics: root mean squared error on the train and test months.
    log_metric("train_me", train_me)
    log_metric("test_me", test_me)

    # NOTE(review): only the regressor is stored. The fitted DictVectorizer
    # `dv` is required to encode new data and is not logged - consider
    # saving it as an artifact alongside the model.
    log_model(lr, "models")