In [None]:
!pip install mlflow


Collecting mlflow
  Downloading mlflow-2.16.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.16.1 (from mlflow)
  Downloading mlflow_skinny-2.16.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.16.1->mlflow)
  Downloading databricks_sdk-0.32.1-py3-none-any.whl.metadata (37 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow-skinny==2.16.1->mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==2.16.1->mlflow)
  Downloading opentelemetry_api-1.2

In [3]:
import datetime
import pandas as pd
import numpy as np
import requests
import zipfile
import io
import json

from sklearn import datasets, ensemble, model_selection
from scipy.stats import anderson_ksamp

In [4]:
content = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip").content
with zipfile.ZipFile(io.BytesIO(content)) as arc:
    raw_data = pd.read_csv(arc.open("hour.csv"), header=0, sep=',', parse_dates=['dteday'])

In [5]:
raw_data.index = raw_data.apply(lambda row: datetime.datetime.combine(row.dteday.date(), datetime.time(row.hr)),
                                axis=1)


In [6]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17379 entries, 2011-01-01 00:00:00 to 2012-12-31 23:00:00
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     17379 non-null  int64         
 1   dteday      17379 non-null  datetime64[ns]
 2   season      17379 non-null  int64         
 3   yr          17379 non-null  int64         
 4   mnth        17379 non-null  int64         
 5   hr          17379 non-null  int64         
 6   holiday     17379 non-null  int64         
 7   weekday     17379 non-null  int64         
 8   workingday  17379 non-null  int64         
 9   weathersit  17379 non-null  int64         
 10  temp        17379 non-null  float64       
 11  atemp       17379 non-null  float64       
 12  hum         17379 non-null  float64       
 13  windspeed   17379 non-null  float64       
 14  casual      17379 non-null  int64         
 15  registered  17379 non-null  int64  

In [7]:
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [13]:
from scipy import stats

p_value = 0.05
rejected = 0

categorical_features = ["season", "holiday", "weekday", "workingday"]
numerical_features = ["cnt", "temp", "atemp", "windspeed", "yr", "mnth"]

In [12]:
# raw_data["workingday"].value_counts()

In [14]:
reference = raw_data.loc["2011-01-01 00:00:00":"2011-02-01 00:00:00"]
current = raw_data.loc["2011-02-02 00:00:00":"2011-03-01 00:00:00"]

In [18]:
for col in numerical_features:
  test = stats.ks_2samp(reference[col], current[col])
  if test[1] < p_value:
    print(f"Our data for {col} has changed")

Our data for cnt has changed
Our data for temp has changed
Our data for atemp has changed
Our data for windspeed has changed
Our data for mnth has changed


In [19]:
from scipy.stats import chi2_contingency

def drift_chisquare(sample1, sample2):
    return chi2_contingency([sample1, sample2])[1]

for col in categorical_features:
    print(col, set(reference[col].values))

season {1}
holiday {0, 1}
weekday {0, 1, 2, 3, 4, 5, 6}
workingday {0, 1}


In [20]:
for col in categorical_features:
    val = drift_chisquare(reference[col].value_counts(),current[col].value_counts() )

    print(col,val)
    rejected = 0
    if val < p_value:
        rejected += 1
        print("Column rejected", col)

print("We rejected ",rejected," columns in total out of {} columns".format(len(categorical_features)))

season 1.0
holiday 0.8526899182305283
weekday 0.3274988269804718
workingday 0.19027250353327513
We rejected  0  columns in total out of 4 columns


In [21]:
target = 'cnt'
prediction = 'prediction'
numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday', ]#'weathersit']

In [22]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    reference[numerical_features + categorical_features],
    reference[target],
    test_size=0.3
)


In [23]:
regressor = ensemble.RandomForestRegressor(random_state = 0)

regressor.fit(X_train, y_train)

preds_test = regressor.predict(X_test)

In [24]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

ref_mae=mean_absolute_error(y_test,preds_test)
ref_mse=mean_squared_error(y_test,preds_test)
ref_r2 = r2_score(y_test,preds_test)


print("MAE",ref_mae)
print("MSE",ref_mse)
print("R2",ref_r2)

MAE 10.90545893719807
MSE 291.3002169082126
R2 0.9000171787414865


In [25]:
current_x=current[numerical_features + categorical_features]
current_y=current[target]

current_pred = regressor.predict(current_x)

In [26]:
print("MAE",mean_absolute_error(current_y,current_pred))
print("MSE",mean_squared_error(current_y,current_pred))

r2 = r2_score(current_y,current_pred)
print(r2)


MAE 21.17904306220096
MSE 1144.1601984051038
0.7203827437459387


In [27]:
# MLFlow

In [28]:
import mlflow
#import mlflow.sklearn
from mlflow.tracking import MlflowClient
import os

In [29]:
mlflow.set_experiment("Bicycle–Sharing")


2024/09/16 17:29:16 INFO mlflow.tracking.fluent: Experiment with name 'Bicycle–Sharing' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/781516220102665981', creation_time=1726507756794, experiment_id='781516220102665981', last_update_time=1726507756794, lifecycle_stage='active', name='Bicycle–Sharing', tags={}>

In [30]:
with mlflow.start_run():

    mlflow.set_tag('mlflow.runName','Refrence_run')
    mlflow.log_metric("MAE",ref_mae)
    mlflow.log_metric("MSE",ref_mse)
    mlflow.log_metric("R2",ref_r2)

    mlflow.sklearn.log_model(regressor, "model")



In [31]:
experiment_batches = [
    ('2011-01-29 00:00:00','2011-02-07 23:00:00'),
    ('2011-02-07 00:00:00','2011-02-14 23:00:00'),
    ('2011-02-15 00:00:00','2011-02-21 23:00:00'),
]

In [32]:
#start new run
for date in experiment_batches:
    with mlflow.start_run() as run: #inside brackets run_name='test'

        mlflow.set_tag('mlflow.runName',"run_"+str(date[0])+" : "+str(date[1]))

        # Log parameters
        mlflow.log_param("begin", date[0])
        mlflow.log_param("end", date[1])

        # Get metrics
        current_data=current.loc[date[0]:date[1]]
        current_x=current_data[numerical_features + categorical_features]
        current_y=current_data[target]
        current_pred = regressor.predict(current_x)

        mae=mean_absolute_error(current_y,current_pred)
        mse=mean_squared_error(current_y,current_pred)
        r2 = r2_score(current_y,current_pred)

        # Log metrics
        mlflow.log_metric('MAE', round(mae, 3))
        mlflow.log_metric('MSE', round(mse, 3))
        mlflow.log_metric('R2', round(r2, 3))

        print(run.info)

<RunInfo: artifact_uri='file:///content/mlruns/781516220102665981/7b53f5a7d73b4006af76de7beccedda3/artifacts', end_time=None, experiment_id='781516220102665981', lifecycle_stage='active', run_id='7b53f5a7d73b4006af76de7beccedda3', run_name='brawny-gull-407', run_uuid='7b53f5a7d73b4006af76de7beccedda3', start_time=1726507787867, status='RUNNING', user_id='root'>
<RunInfo: artifact_uri='file:///content/mlruns/781516220102665981/0e05874bf879488292e17f6df6d62d74/artifacts', end_time=None, experiment_id='781516220102665981', lifecycle_stage='active', run_id='0e05874bf879488292e17f6df6d62d74', run_name='dapper-snail-427', run_uuid='0e05874bf879488292e17f6df6d62d74', start_time=1726507787934, status='RUNNING', user_id='root'>
<RunInfo: artifact_uri='file:///content/mlruns/781516220102665981/dd4a916e8cdf42bea810641216955d7e/artifacts', end_time=None, experiment_id='781516220102665981', lifecycle_stage='active', run_id='dd4a916e8cdf42bea810641216955d7e', run_name='rumbling-skink-139', run_uuid=