In [2]:
import s3fs
import pandas as pd
import mlflow
import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

df = pd.read_parquet('s3://chicago-bike-trips/dataset.parquet')

In [11]:
df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
0,3564070EEFD12711,electric_bike,2022-04-06 17:42:48,2022-04-06 17:54:36,Paulina St & Howard St,515,University Library (NU),605,42.019135,-87.673532,42.052939,-87.673447,member,11.800000
1,0B820C7FCF22F489,classic_bike,2022-04-24 19:23:07,2022-04-24 19:43:17,Wentworth Ave & Cermak Rd,13075,Green St & Madison St,TA1307000120,41.853085,-87.631931,41.881892,-87.648789,member,20.166667
2,89EEEE32293F07FF,classic_bike,2022-04-20 19:29:08,2022-04-20 19:35:16,Halsted St & Polk St,TA1307000121,Green St & Madison St,TA1307000120,41.871840,-87.646640,41.881892,-87.648789,member,6.133333
3,84D4751AEB31888D,classic_bike,2022-04-22 21:14:06,2022-04-22 21:23:29,Wentworth Ave & Cermak Rd,13075,Delano Ct & Roosevelt Rd,KA1706005007,41.853085,-87.631931,41.867491,-87.632190,casual,9.383333
4,5664BCF0D1DE7A8B,electric_bike,2022-04-16 15:56:30,2022-04-16 16:02:11,Halsted St & Polk St,TA1307000121,Clinton St & Madison St,TA1305000032,41.871808,-87.646574,41.882242,-87.641066,member,5.683333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634853,8891BA0053ECEC4F,electric_bike,2022-05-27 22:00:02,2022-05-27 22:07:01,Clark St & Newport St,632,,,41.944557,-87.654830,41.920000,-87.650000,member,6.983333
634854,47D8B5FBCADECFC1,electric_bike,2022-05-15 16:05:39,2022-05-15 16:44:12,Clark St & Newport St,632,,,41.944479,-87.654758,41.920000,-87.760000,member,38.550000
634855,AA8D16CF38B40703,electric_bike,2022-05-21 10:10:13,2022-05-21 10:26:09,Francisco Ave & Bloomingdale Ave,429,,,41.910000,-87.700000,41.920000,-87.660000,casual,15.933333
634856,897EBFD44F329E0A,electric_bike,2022-05-12 07:53:58,2022-05-12 08:01:18,Francisco Ave & Bloomingdale Ave,429,,,41.910000,-87.700000,41.900000,-87.690000,member,7.333333


In [2]:
TRACKING_SERVER_HOST = "ec2-18-117-176-70.us-east-2.compute.amazonaws.com" #change if ec2 instance is reinitiated
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
mlflow.set_experiment("chicago-bike-share")

2023/07/13 00:17:23 INFO mlflow.tracking.fluent: Experiment with name 'chicago-bike-share' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-models-bruno/1', creation_time=1689207443833, experiment_id='1', last_update_time=1689207443833, lifecycle_stage='active', name='chicago-bike-share', tags={}>

In [3]:
def distance_calc (row):
    start = (row['start_lat'], row['start_lng'])
    stop = (row['end_lat'], row['end_lng'])
    return great_circle(start, stop).km

def pre_processing(df):
    df.dropna(inplace=True)
    df['start_end_id'] = df['start_station_id'] + '-' +df['end_station_id']
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    df['distance'] = df.apply(lambda row: distance_calc (row),axis=1)
    columns = ['ride_id','started_at','ended_at','start_station_name','end_station_name','start_station_id','end_station_id',
               'start_lat','start_lng','end_lat','end_lng']
    df.drop(columns=columns, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

df = pre_processing(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['distance'] = df.apply(lambda row: distance_calc (row),axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=columns, inplace=True)


In [4]:
def training_preparation(df):
    categorical = ['rideable_type','member_casual','start_end_id']
    numerical = ['distance']

    X = df.drop(columns=['duration'])
    y = df.duration

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

    dv = DictVectorizer()
    train_dicts = X_train[categorical + numerical].to_dict(orient='records')
    X_train = dv.fit_transform(train_dicts)
    test_dicts = X_test[categorical + numerical].to_dict(orient='records')
    X_test = dv.transform(test_dicts)

    return X_train, X_test, y_train, y_test  

X_train, X_test, y_train, y_test  = training_preparation(df)

In [5]:
def train(X_train,y_train):
    lr = Ridge(alpha=.5)
    lr.fit(X_train, y_train)
    return lr

with mlflow.start_run():
    model = train(X_train,y_train)
    mlflow.sklearn.log_model(model, artifact_path='models')



In [6]:
def calculate_mse(X_test,y_test):
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

with mlflow.start_run():
    rmse = calculate_mse(X_test,y_test)
    mlflow.log_metric("rmse", rmse)

agora é botar esse código em py rodar da linha de comando e verificar esse erro de versões acima do mlflow env

In [1]:
import mlflow
import pickle
logged_model = 's3://mlflow-models-bruno/1/eff4f74e811840319846bd6f73cc3d73/artifacts/model'
model = mlflow.pyfunc.load_model(logged_model)

In [2]:
with open ('models/preprocessor.b', 'rb') as file:
    dv = pickle.load(file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
dv

In [37]:
ride = {'start_station_id' : 'KA1706005007', 'end_station_id' : 'TA1305000032', 'rideable_type':'eletric_bike','member_casual': 'member' }

In [13]:
def prepare_features(bike_ride):
    features = {}
    features['start_end_id'] = bike_ride['start_station_id'] + '-' +bike_ride['end_station_id']
    features['rideable_type'] = bike_ride['rideable_type']
    features['member_casual'] = bike_ride['member_casual']
    return features

In [38]:
features = prepare_features(ride)

In [19]:
def predict(features):
    
    X = dv.transform(features)
    preds = model.predict(X)
    return float(preds[0])


In [39]:
predict(features)

14.990719661071937

In [40]:
ride = {'start_station_id' : 'KA1706005007', 'end_station_id' : 'TA1305000032', 'rideable_type':'eletric_bike','member_casual': 'member' }

In [41]:
import requests
url = 'http://localhost:9696/predict'
print(requests.post(url,json=ride).json())

{'duration': 14.823610703362052}
