In [2]:
import mlflow
import mlflow
import pickle
import pandas as pd 

In [3]:

# Load the pickled vectorizer and model, then score the reference data.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifact files.
with open("/Users/anis/Desktop/mlopszoomcamp_2023_project/models/DictVectorizer.b", "rb") as f_in:
    dv = pickle.load(f_in)

with open("/Users/anis/Desktop/mlopszoomcamp_2023_project/models/xgboost.bin", "rb") as f_in:
    model = pickle.load(f_in)

reference_data = pd.read_parquet('data/reference_data.parquet')

# One dict per row, restricted to the columns passed to the vectorizer.
X = reference_data[
    ['location_duree', 'superficie', 'pieces', 'etages', 'category', 'commune']
].to_dict('records')

# Store the model output next to the inputs so prediction drift can be measured.
reference_data['price_pred'] = model.predict(dv.transform(X))

In [None]:
# Use the first 300 rows of the scored reference data as the "current" sample.
raw_data = reference_data.copy().head(300)

In [None]:
# Evidently 
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

In [None]:
reference_data.columns

In [None]:
# Feature groups handed to Evidently.
# NOTE(review): the monitoring cells further down use 'wilaya' instead of 'commune';
# confirm which column the vectorizer was fitted on.
num_features = ['location_duree', 'superficie', 'pieces', 'etages']
cat_features = ['category', 'commune']

# No target column is mapped; `price_pred` is registered as the prediction column.
column_mapping = ColumnMapping(
    prediction='price_pred',
    numerical_features=num_features,
    categorical_features=cat_features,
    target=None,
)

In [None]:
# The order of the metrics matters: results are read back by position
# from `report.as_dict()['metrics']`.
report = Report(
    metrics=[
        ColumnDriftMetric(column_name='price_pred'),  # [0] drift score of the predictions
        DatasetDriftMetric(),                         # [1] number of drifted columns
        DatasetMissingValuesMetric(),                 # [2] share of missing values
    ]
)

In [None]:
# Compare `raw_data` (the current sample) against the full reference data.
report.run(reference_data=reference_data, current_data=raw_data, column_mapping=column_mapping)


In [None]:
report.show(mode='inline')

In [None]:
result = report.as_dict()

In [None]:
result

In [None]:
#prediction drift
result['metrics'][0]['result']['drift_score']

In [None]:
#number of drifted columns
result['metrics'][1]['result']['number_of_drifted_columns']

In [None]:
#share of missing values in the current data
result['metrics'][2]['result']['current']['share_of_missing_values']


# Script for monitoring 

In [1]:
import datetime
import time
import random
import logging 
import pickle
import uuid
import pytz
import pandas as pd
import io
import psycopg
import joblib

from prefect import task, flow

from evidently.report import Report
from evidently import ColumnMapping
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

# Log at INFO level; each message is prefixed with a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]: %(message)s")

# Minimum number of seconds between two consecutive batches in batch_monitoring_backfill.
SEND_TIMEOUT = 10
# NOTE(review): `rand` is not referenced anywhere else in the visible notebook.
rand = random.Random()

create_table_statement = """
drop table if exists dummy_metrics;
create table dummy_metrics(
	timestamp timestamp,
	prediction_drift float,
	num_drifted_columns integer,
	share_missing_values float
)
"""

In [16]:

# Load the pickled vectorizer and model used to score the data.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifact files.
with open("/Users/anis/Desktop/mlopszoomcamp_2023_project/models/DictVectorizer.b", "rb") as f_in:
    dv = pickle.load(f_in)
with open("/Users/anis/Desktop/mlopszoomcamp_2023_project/models/xgboost.bin", "rb") as f_in:
    model = pickle.load(f_in)


# Prepare data
data = pd.read_parquet('data/reference_data.parquet')
# Strip the timezone so `createdAt` holds naive timestamps.
data['createdAt'] = data['createdAt'].dt.tz_localize(None)

num_features = ['location_duree', 'superficie', 'pieces', 'etages']
cat_features = ['category', 'wilaya']

# One dict per row, restricted to the model's feature columns.
X = data[num_features + cat_features].to_dict('records')

data['price_pred'] = model.predict(dv.transform(X))

# Newest rows first, so the positional split below takes the most recent rows as reference.
data = data.reset_index(drop=True).sort_values(by="createdAt", ascending=False)

# The first 500 rows (most recent) form the scored reference set; the remaining
# rows become the unscored data that is replayed in batches.
# Both frames are sorted out-of-place: sorting an `iloc` slice with inplace=True
# mutates a slice of `data` and triggers pandas chained-assignment warnings.
reference_data = data.iloc[:500].sort_values(by="createdAt", ascending=True)
raw_data = (
    data.drop('price_pred', axis=1)
    .iloc[500:]
    .sort_values(by="createdAt", ascending=True)
)

In [17]:
# Create evidently report
# No target column is mapped; `price_pred` is registered as the prediction column.
column_mapping = ColumnMapping(
    target=None,
    prediction='price_pred',
    numerical_features=num_features,
    categorical_features=cat_features,
)

# The order of the metrics matters: calculate_metrics_postgresql reads the
# results back by position from `report.as_dict()['metrics']`.
report = Report(
    metrics=[
        ColumnDriftMetric(column_name='price_pred'),
        DatasetDriftMetric(),
        DatasetMissingValuesMetric(),
    ]
)

In [18]:
def prep_db():
    """Ensure the `test` database exists and (re)create the metrics table.

    Connects to the PostgreSQL server on localhost:5432, creates the `test`
    database when `pg_database` has no row for it, and then executes
    `create_table_statement` against that database (which drops and recreates
    `dummy_metrics`).
    """
    # NOTE(review): credentials are hard-coded in the connection strings.
    with psycopg.connect("host=localhost port=5432 user=postgres password=example", autocommit=True) as conn:
        matching_databases = conn.execute("SELECT 1 FROM pg_database WHERE datname='test'").fetchall()
        if not matching_databases:
            conn.execute("create database test;")
        # Opened while the server-level connection is still active, as before.
        with psycopg.connect("host=localhost port=5432 dbname=test user=postgres password=example") as test_conn:
            test_conn.execute(create_table_statement)


In [72]:
def calculate_metrics_postgresql(curr, num_batches, start_index, end_index):
    """Score one batch of `raw_data`, run the Evidently report and store its metrics.

    Args:
        curr: open database cursor used to insert the metrics row.
        num_batches: total number of batches; not used here, kept so existing
            callers keep working.
        start_index: position of the first `raw_data` row of the batch.
        end_index: position one past the last `raw_data` row of the batch.

    Returns:
        None. One row is inserted into `dummy_metrics`; if the insert of the
        computed metrics fails with a database error, the error is logged and
        a row with NULL metrics is inserted for the same timestamp instead.
    """
    # Copy the slice so that adding the prediction column below does not write
    # into a slice of `raw_data`.
    current_data = raw_data.iloc[start_index:end_index].copy()

    X = current_data[num_features + cat_features].to_dict('records')
    current_data['price_pred'] = model.predict(dv.transform(X))

    report.run(
        reference_data=reference_data,
        current_data=current_data,
        column_mapping=column_mapping,
    )
    result = report.as_dict()

    # Metrics are addressed by their position in the `report` definition.
    prediction_drift = result['metrics'][0]['result']['drift_score']
    num_drifted_columns = result['metrics'][1]['result']['number_of_drifted_columns']
    share_missing_values = result['metrics'][2]['result']['current']['share_of_missing_values']

    batch_timestamp = current_data["createdAt"].max()
    print(batch_timestamp, prediction_drift, num_drifted_columns, share_missing_values)

    # Insert metrics to the database. The column names must match
    # `create_table_statement`; the previous statement used value1/value2/value3,
    # which do not exist in `dummy_metrics`, so only NULL rows were ever stored.
    insert_statement = (
        "insert into dummy_metrics(timestamp, prediction_drift, num_drifted_columns, share_missing_values) "
        "values (%s, %s, %s, %s)"
    )
    try:
        curr.execute(
            insert_statement,
            (batch_timestamp, prediction_drift, num_drifted_columns, share_missing_values),
        )
    except psycopg.Error:
        # Best effort: keep a row for this batch, but make the failure visible
        # instead of swallowing it.
        logging.exception("could not store metrics for batch ending at %s", batch_timestamp)
        curr.execute(insert_statement, (batch_timestamp, None, None, None))
    return None

In [73]:


def batch_monitoring_backfill():
    """Replay `raw_data` in batches of 100 rows and store the metrics of each batch.

    For every batch, `calculate_metrics_postgresql` is called with a fresh
    cursor on the `test` database. Batches are paced so that one is processed
    roughly every `SEND_TIMEOUT` seconds.
    """
    batch_size = 100
    # Ceiling division: number of batches needed to cover every row.
    num_batches = (len(raw_data) + batch_size - 1) // batch_size

    # prep_db() is not called here, so the `test` database and the
    # `dummy_metrics` table must already exist.
    last_send = datetime.datetime.now() - datetime.timedelta(seconds=10)
    with psycopg.connect("host=localhost port=5432 dbname=test user=postgres password=example", autocommit=True) as conn:
        for start_index in range(0, num_batches * batch_size, batch_size):
            with conn.cursor() as curr:
                calculate_metrics_postgresql(curr, num_batches, start_index, start_index + batch_size)

            now = datetime.datetime.now()
            elapsed = (now - last_send).total_seconds()
            if elapsed < SEND_TIMEOUT:
                time.sleep(SEND_TIMEOUT - elapsed)
            # Advance the schedule in 10-second steps until it has caught up with `now`.
            while last_send < now:
                last_send += datetime.timedelta(seconds=10)
            logging.info("data sent")

In [74]:
batch_monitoring_backfill()

2017-11-08 10:01:22 0.7332500171808467 0 0.0
2019-12-18 09:18:21 0.9877878006860294 1 0.0
2020-11-14 11:52:10 0.7629774146142639 0 0.0
2021-01-11 21:00:15 0.020791997193519672 3 0.0


  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp


2021-03-14 14:07:13 0.23504861641221936 1 0.0
2021-05-22 11:32:25 0.1340215826761443 1 0.0


  terms = (f_obs_float - f_exp)**2 / f_exp


2021-07-23 19:27:30 0.4914615381119013 2 0.0
2021-09-01 15:28:10 0.07143654944636102 2 0.0
2021-09-30 11:07:32 0.3598180966851865 1 0.0
2021-11-04 00:00:32 0.020791997193519672 3 0.0


  terms = (f_obs_float - f_exp)**2 / f_exp


2021-11-22 13:47:02 0.5532116445348475 1 0.0


  terms = (f_obs_float - f_exp)**2 / f_exp


In [15]:
num_features = ['location_duree', 'superficie', 'pieces', 'etages']
cat_features = ['category', 'wilaya']

def prep_data(model, dv, data_path='data/reference_data.parquet',
              reference_share=0.8) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Load the dataset, score it and split it chronologically.

    Args:
        model: object whose `predict` is called on the vectorized features.
        dv: object whose `transform` is called on the list of row dicts.
        data_path: parquet file to read; defaults to the path that was
            previously hard-coded.
        reference_share: fraction of the rows (oldest first) used as reference
            data; defaults to the previously hard-coded 0.8.

    Returns:
        `(raw_data, reference_data)`: `reference_data` holds the oldest rows
        including the `price_pred` column, `raw_data` holds the remaining,
        newer rows without `price_pred`.

    Raises:
        ValueError: if `reference_share` is not between 0 and 1.
    """
    if not 0 <= reference_share <= 1:
        raise ValueError(f"reference_share must be between 0 and 1, got {reference_share!r}")

    data = pd.read_parquet(data_path)
    # Strip the timezone so `createdAt` holds naive timestamps.
    data['createdAt'] = data['createdAt'].dt.tz_localize(None)
    # Reset the index before sorting (as before), so the returned frames keep
    # the pre-sort row positions as index labels.
    data = data.reset_index(drop=True).sort_values(by="createdAt", ascending=True)

    num_reference_rows = int(len(data) * reference_share)
    num_raw_rows = len(data) - num_reference_rows

    # One dict per row, restricted to the model's feature columns.
    X = data[num_features + cat_features].to_dict('records')
    data['price_pred'] = model.predict(dv.transform(X))

    reference_data = data.head(num_reference_rows)
    raw_data = data.drop('price_pred', axis=1).tail(num_raw_rows)
    return raw_data, reference_data

In [16]:
data = pd.read_parquet('data/reference_data.parquet')

In [17]:
raw_data, reference_data  = prep_data(model,dv) 

In [18]:
reference_data.head(1)

Unnamed: 0,createdAt,location_duree,superficie,pieces,etages,category,wilaya,commune,price,price_pred
1587,2013-09-09 06:29:54,6,120,4,1,Appartement,Alger,Hydra,130000.0,61524.242188


In [19]:
reference_data.tail(1)

Unnamed: 0,createdAt,location_duree,superficie,pieces,etages,category,wilaya,commune,price,price_pred
318,2022-01-20 13:34:13,6,65,2,0,Appartement,Alger,Hydra,140000.0,54338.316406


In [20]:
raw_data.head(1)

Unnamed: 0,createdAt,location_duree,superficie,pieces,etages,category,wilaya,commune,price
317,2022-01-20 13:47:35,6,92,3,5,Appartement,Alger,Alger centre,140000.0


In [22]:
raw_data.tail(1)

Unnamed: 0,createdAt,location_duree,superficie,pieces,etages,category,wilaya,commune,price
0,2022-05-24 22:53:17,6,80,3,3,Appartement,Alger,Bir mourad rais,65000.0


In [21]:
data.head(1)

Unnamed: 0,createdAt,location_duree,superficie,pieces,etages,category,wilaya,commune,price
33750,2022-05-24 22:53:17+00:00,6,80,3,3,Appartement,Alger,Bir mourad rais,65000.0


In [23]:
data.tail(1)

Unnamed: 0,createdAt,location_duree,superficie,pieces,etages,category,wilaya,commune,price
40406,2013-09-09 06:29:54+00:00,6,120,4,1,Appartement,Alger,Hydra,130000.0
