In [1]:
# !pip uninstall -y numpy scipy scikit-learn
# !pip install numpy --upgrade --force-reinstall
# !pip install scipy --upgrade --force-reinstall
# !pip install scikit-learn --upgrade --force-reinstall

In [2]:
!pip install numpy==1.26.4



In [3]:
#import packages
import pandas as pd
import numpy as np
import requests
import io
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib

In [4]:
!pip install evidently==0.6.7





In [5]:
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.metric_preset import TargetDriftPreset


In [6]:
# print("Downloading NYC taxi trip data...")
# data_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-01.csv.gz'
# response = requests.get(data_url)
# response.raise_for_status() # Check if the download was successful
# df = pd.read_csv(io.BytesIO(response.content), compression='gzip')
 

In [7]:
print("Downloading Auto MPG dataset...")
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = ["mpg", "cylinders", "displacement", "horsepower", "weight",
                "acceleration", "model_year", "origin", "car_name"]

df = pd.read_csv(data_url, delim_whitespace=True, names=column_names, na_values='?')


Downloading Auto MPG dataset...


In [8]:
feature_columns = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']
target_column = 'mpg'

df = df[feature_columns + [target_column]]


In [9]:
df

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,mpg
0,8,307.0,130.0,3504.0,12.0,18.0
1,8,350.0,165.0,3693.0,11.5,15.0
2,8,318.0,150.0,3436.0,11.0,18.0
3,8,304.0,150.0,3433.0,12.0,16.0
4,8,302.0,140.0,3449.0,10.5,17.0
...,...,...,...,...,...,...
393,4,140.0,86.0,2790.0,15.6,27.0
394,4,97.0,52.0,2130.0,24.6,44.0
395,4,135.0,84.0,2295.0,11.6,32.0
396,4,120.0,79.0,2625.0,18.6,28.0


In [10]:
# Drop rows with missing target or features
df = df.dropna(subset=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration'])

# Remove impossible or extreme MPG values
df = df[(df['mpg'] > 0) & (df['mpg'] < 100)]

# Remove impossible displacement, horsepower, weight
df = df[(df['displacement'] > 0) & (df['horsepower'] > 0) & (df['weight'] > 0)]

# Keep reasonable acceleration times
df = df[(df['acceleration'] > 0) & (df['acceleration'] < 30)]

print("Cleaned dataset shape:", df.shape)


Cleaned dataset shape: (392, 6)


In [11]:
# --------------------------
# Split into reference & current
# --------------------------
reference_data = df.iloc[:196].copy()
current_data = df.iloc[196:].copy()

In [12]:
# --------------------------
# Simulate data drift
# --------------------------
print("Simulating data drift in the 'current' dataset...")

numeric_cols = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']

if 'mpg' in numeric_cols:
    current_data['mpg'] *= 1.1  # Increase MPG
if 'horsepower' in numeric_cols:
    current_data['horsepower'] *= 0.9  # Decrease HP
if 'weight' in numeric_cols:
    current_data['weight'] *= 1.05  # Increase weight
if 'cylinders' in numeric_cols:
    current_data['cylinders'] = current_data['cylinders'].replace({4: 6, 6: 8, 8: 4})

print("Reference data shape:", reference_data.shape)
print("Current data shape:", current_data.shape)

Simulating data drift in the 'current' dataset...
Reference data shape: (196, 6)
Current data shape: (196, 6)


In [13]:
print("Reference data shape:", reference_data.shape)
print("Current data shape:", current_data.shape)


Reference data shape: (196, 6)
Current data shape: (196, 6)


In [14]:
# --------------------------
# Drop irrelevant columns & constants
# --------------------------
def drop_constant_columns(df):
    return df.loc[:, (df.nunique(dropna=False) > 1)]

reference_data_clean = reference_data.drop(columns=['car_name', 'origin'], errors='ignore')
current_data_clean = current_data.drop(columns=['car_name', 'origin'], errors='ignore')

reference_data_clean = drop_constant_columns(reference_data_clean)
current_data_clean = drop_constant_columns(current_data_clean)

In [15]:
# Ensure matching columns
common_columns = reference_data_clean.columns.intersection(current_data_clean.columns)
reference_data_clean = reference_data_clean[common_columns]
current_data_clean = current_data_clean[common_columns]

In [16]:
import datetime
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# Create and run the data drift report for Auto MPG
data_drift_report = Report(metrics=[
    DataDriftPreset()
])

data_drift_report.run(
    current_data=current_data_clean,
    reference_data=reference_data_clean,
    column_mapping=None
)

  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp


In [17]:
# Save the report as an HTML file with timestamp
report_filename = f'auto_mpg_drift_report_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.html'
data_drift_report.save_html(report_filename)

print(f"Auto MPG Data Drift Report saved to {report_filename}")

Auto MPG Data Drift Report saved to auto_mpg_drift_report_20250811_174029.html


# Model Performance and Drift Report

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data", delim_whitespace=True, names=["mpg","cylinders","displacement","horsepower","weight","acceleration","model_year","origin","car_name"], na_values='?')

# Basic cleaning
# Remove rows with missing MPG or key features
df.replace("?", pd.NA, inplace=True)
df.dropna(inplace=True)

# Convert numeric columns to floats
numeric_cols = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']
df[numeric_cols] = df[numeric_cols].astype(float)

print("Cleaned dataset shape:", df.shape)

# Split into reference and current datasets
reference_data = df.iloc[:200].copy()
current_data = df.iloc[200:392].copy()


Cleaned dataset shape: (392, 9)


In [19]:
print("Simulating data drift in the 'current' dataset...")

current_data['weight'] = current_data['weight'] * 1.2
current_data['horsepower'] = current_data['horsepower'] * 0.85
current_data['acceleration'] = current_data['acceleration'] + 2

print("Reference data shape:", reference_data.shape)
print("Current data shape:", current_data.shape)


Simulating data drift in the 'current' dataset...
Reference data shape: (200, 9)
Current data shape: (192, 9)


In [20]:
from sklearn.linear_model import LinearRegression
import joblib

print("\nTraining a simple linear regression model...")

features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']
target = 'mpg'

X_ref = reference_data[features]
y_ref = reference_data[target]

# Data cleaning for model input
X_ref.dropna(inplace=True)
y_ref = y_ref.loc[X_ref.index]

# Train model
model = LinearRegression()
model.fit(X_ref, y_ref)

# Save model
model_filename = 'auto_mpg_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")



Training a simple linear regression model...
Model saved to auto_mpg_model.joblib


In [21]:
reference_data['prediction'] = model.predict(X_ref)

X_curr = current_data[features]
X_curr.dropna(inplace=True)
current_data['prediction'] = model.predict(X_curr)


In [22]:
from evidently.report import Report
from evidently.metric_preset import TargetDriftPreset
import datetime

print("\nGenerating Evidently Model Performance report...")

model_report = Report(metrics=[
    TargetDriftPreset()
])
model_report.run(current_data=current_data, reference_data=reference_data, column_mapping=None)

# Save the report
model_report_filename = f'auto_mpg_model_performance_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.html'
model_report.save_html(model_report_filename)
print(f"Model performance report saved to {model_report_filename}")



Generating Evidently Model Performance report...
Model performance report saved to auto_mpg_model_performance_20250811_174030.html


In [23]:
from evidently.metric_preset import DataDriftPreset

data_drift_report = Report(metrics=[
    DataDriftPreset()
])
data_drift_report.run(current_data=current_data, reference_data=reference_data, column_mapping=None)

# Save drift report
drift_report_filename = f'auto_mpg_data_drift_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.html'
data_drift_report.save_html(drift_report_filename)
print(f"Data Drift Report saved to {drift_report_filename}")



divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide



Data Drift Report saved to auto_mpg_data_drift_20250811_174030.html


In [26]:
def check_for_drift(report_json, threshold=0.1):
    data_drift_metrics = report_json.get('metrics', [])
    for metric in data_drift_metrics:
        if metric.get('metric') == 'DatasetDriftMetric':
            drift_score = metric.get('result', {}).get('drift_score')  # this is the global drift score
            if drift_score is not None and drift_score > threshold:
                return True
    return False


In [27]:
def local_alert(drift_detected):
    if drift_detected:
        print("\n!!! ALERT: Data Drift Detected !!!")
        print("Check HTML reports for detailed drift analysis.")
    else:
        print("\nNo significant data drift detected.")

# Run drift check
report_json_data = data_drift_report.as_dict()
drift_detected = check_for_drift(report_json_data)
local_alert(drift_detected)



No significant data drift detected.
