In [44]:
import s3fs
import pandas as pd
import mlflow
from geopy.distance import great_circle
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

df = pd.read_parquet('s3://chicago-bike-trips/dataset.parquet')

In [39]:
def distance_calc (row):
    start = (row['start_lat'], row['start_lng'])
    stop = (row['end_lat'], row['end_lng'])
    return great_circle(start, stop).km

def pre_processing(df):
    df.dropna(inplace=True)
    df['start_end_id'] = df['start_station_id'] + '-' +df['end_station_id']
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    df['distance'] = df.apply(lambda row: distance_calc (row),axis=1)
    columns = ['ride_id','started_at','ended_at','start_station_name','end_station_name','start_station_id','end_station_id',
               'start_lat','start_lng','end_lat','end_lng']
    df.drop(columns=columns, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

df = pre_processing(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['distance'] = df.apply(lambda row: distance_calc (row),axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=columns, inplace=True)


In [40]:
def training_preparation(df):
    categorical = ['rideable_type','member_casual','start_end_id']
    numerical = ['distance']

    X = df.drop(columns=['duration'])
    y = df.duration

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

    dv = DictVectorizer()
    train_dicts = X_train[categorical + numerical].to_dict(orient='records')
    X_train = dv.fit_transform(train_dicts)
    test_dicts = X_test[categorical + numerical].to_dict(orient='records')
    X_test = dv.transform(test_dicts)

    return X_train, X_test, y_train, y_test  

X_train, X_test, y_train, y_test  = training_preparation(df)

In [45]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("chicago-bike-share")

2023/07/11 03:08:07 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/07/11 03:08:08 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/home/ubuntu/mlops-zoomcamp-project/mlruns/1', creation_time=1689044888626, experiment_id='1', last_update_time=1689044888626, lifecycle_stage='active', name='chicago-bike-share', tags={}>

In [46]:
def train(X_train,y_train):
    lr = Ridge(alpha=.5)
    lr.fit(X_train, y_train)
    return lr

model = train(X_train,y_train)

In [57]:
import joblib

def calculate_mse(X_test,y_test):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred, squared=False)
    return mse

rmse = calculate_mse(X_test,y_test)
mlflow.log_metric("rmse", rmse)
filename = 'models/model.pkl'
joblib.dump(model, filename)
mlflow.register_model('model',"models/model.pkl")

Registered model 'models/model.pkl' already exists. Creating a new version of this model...
2023/07/11 03:21:57 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: models/model.pkl, version 2
Created version '2' of model 'models/model.pkl'.


<ModelVersion: aliases=[], creation_timestamp=1689045717758, current_stage='None', description=None, last_updated_timestamp=1689045717758, name='models/model.pkl', run_id=None, run_link=None, source='model', status='READY', status_message=None, tags={}, user_id=None, version=2>