In [1]:
import s3fs
import pandas as pd
import mlflow
import joblib
from geopy.distance import great_circle
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

df = pd.read_parquet('s3://chicago-bike-trips/dataset.parquet')

In [2]:
TRACKING_SERVER_HOST = "ec2-18-117-176-70.us-east-2.compute.amazonaws.com" #change if ec2 instance is reinitiated
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
mlflow.set_experiment("chicago-bike-share")

2023/07/13 00:17:23 INFO mlflow.tracking.fluent: Experiment with name 'chicago-bike-share' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-models-bruno/1', creation_time=1689207443833, experiment_id='1', last_update_time=1689207443833, lifecycle_stage='active', name='chicago-bike-share', tags={}>

In [3]:
def distance_calc (row):
    start = (row['start_lat'], row['start_lng'])
    stop = (row['end_lat'], row['end_lng'])
    return great_circle(start, stop).km

def pre_processing(df):
    df.dropna(inplace=True)
    df['start_end_id'] = df['start_station_id'] + '-' +df['end_station_id']
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    df['distance'] = df.apply(lambda row: distance_calc (row),axis=1)
    columns = ['ride_id','started_at','ended_at','start_station_name','end_station_name','start_station_id','end_station_id',
               'start_lat','start_lng','end_lat','end_lng']
    df.drop(columns=columns, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

df = pre_processing(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['distance'] = df.apply(lambda row: distance_calc (row),axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=columns, inplace=True)


In [4]:
def training_preparation(df):
    categorical = ['rideable_type','member_casual','start_end_id']
    numerical = ['distance']

    X = df.drop(columns=['duration'])
    y = df.duration

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

    dv = DictVectorizer()
    train_dicts = X_train[categorical + numerical].to_dict(orient='records')
    X_train = dv.fit_transform(train_dicts)
    test_dicts = X_test[categorical + numerical].to_dict(orient='records')
    X_test = dv.transform(test_dicts)

    return X_train, X_test, y_train, y_test  

X_train, X_test, y_train, y_test  = training_preparation(df)

In [5]:
def train(X_train,y_train):
    lr = Ridge(alpha=.5)
    lr.fit(X_train, y_train)
    return lr

with mlflow.start_run():
    model = train(X_train,y_train)
    mlflow.sklearn.log_model(model, artifact_path='models')



In [6]:
def calculate_mse(X_test,y_test):
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

with mlflow.start_run():
    rmse = calculate_mse(X_test,y_test)
    mlflow.log_metric("rmse", rmse)

agora é botar esse código em py rodar da linha de comando e verificar esse erro de versões acima do mlflow env