# 03 Drift Simulation Analysis

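Auto-generated notebook for the MLOps project. It visualizes four kinds of simulated data drift (covariate shift, label shift, noise injection, and scaling) by comparing drifted batches from `DriftSimulator` against an undrifted baseline batch.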

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add the parent directory to sys.path so the `src` package imported below
# resolves. This assumes the kernel's working directory sits one level under
# the project root — TODO confirm.
sys.path.append(os.path.abspath('..'))

# NOTE(review): neither `config` nor `setup_logger` is referenced in the cells
# visible in this notebook.
from src.utils.config import config
from src.utils.logger import setup_logger

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "whitegrid" style for the figures drawn below.
sns.set_style("whitegrid")


In [None]:

from src.data.drift_simulation import DriftSimulator
from src.data.load_data import load_data

# Seed NumPy's global RNG so a Restart-and-Run-All pass can reproduce the same
# batches and figures.
# NOTE(review): this only has an effect if load_data / DriftSimulator draw from
# NumPy's global random state — confirm against src/data/drift_simulation.py.
SEED = 42
np.random.seed(SEED)

df = load_data()
sim = DriftSimulator(df)

# Pick the first numeric column other than the 'churn' target as the feature
# that the plotting cells below visualize.
numeric_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col != 'churn'
]
# NOTE(review): when no numeric column is found this falls back to the literal
# name 'feature_0'; the cells below index each batch with this name, so verify
# that such a column exists in that case.
target_feature = numeric_cols[0] if numeric_cols else 'feature_0'
print(f"Using feature '{target_feature}' for visualization.")


## 1. Covariate Shift

Shifting the mean of a feature's distribution.

In [None]:

# Draw an undrifted baseline batch and a covariate-shifted batch of the same
# size. The second value returned by simulate_batch is not needed here.
batch_normal, _ = sim.simulate_batch(1000)
batch_drifted, _ = sim.simulate_batch(
    1000,
    drift_type="covariate_shift",
    drift_intensity=0.8,
)

# Overlay the density estimate of the chosen feature for both batches on a
# single axes.
fig, ax = plt.subplots(figsize=(10, 5))
for batch, label, color in (
    (batch_normal, 'Normal', 'blue'),
    (batch_drifted, 'Drifted', 'orange'),
):
    sns.kdeplot(batch[target_feature], label=label, fill=True, color=color, ax=ax)
ax.set_title(f"Covariate Shift: {target_feature}")
ax.legend()
plt.show()


## 2. Label Shift

Changing the balance of the target class (Prior Probability Shift).

In [None]:

# Draw a batch with label shift applied. `batch_normal` is the undrifted
# baseline batch created in the covariate-shift cell.
batch_drifted_label, _ = sim.simulate_batch(1000, drift_type="label_shift", drift_intensity=0.9)

# sharey=True puts both panels on one count scale, so bar heights can be
# compared directly; with independent y-axes equal-looking bars could stand
# for different counts.
fig, ax = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

# Left panel: class counts of the baseline batch.
sns.countplot(x='churn', data=batch_normal, ax=ax[0], palette='Blues')
ax[0].set_title("Normal Class Balance")

# Right panel: class counts of the label-shifted batch.
sns.countplot(x='churn', data=batch_drifted_label, ax=ax[1], palette='Oranges')
ax[1].set_title("Drifted Class Balance (Label Shift)")

plt.show()


## 3. Noise Injection

Adding random Gaussian noise to features, which degrades data quality.

In [None]:

# Draw a batch with noise injection applied. `batch_normal` is the undrifted
# baseline batch created in the covariate-shift cell.
batch_drifted_noise, _ = sim.simulate_batch(1000, drift_type="noise_injection", drift_intensity=1.5)

# First 50 rows of each batch, re-indexed 0..49. matplotlib uses a Series'
# index as the x coordinate, so if the batches keep their source-row labels
# (not visible from this notebook) the traces would otherwise be drawn at
# arbitrary, non-comparable x positions.
normal_head = batch_normal[target_feature].iloc[:50].reset_index(drop=True)
noisy_head = batch_drifted_noise[target_feature].iloc[:50].reset_index(drop=True)

# Sample-by-sample view of the first 50 points to see the jitter.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(normal_head, 'o-', label='Normal', alpha=0.7)
ax.plot(noisy_head, 'x--', label='Noisy', alpha=0.7)
ax.set_title(f"Noise Injection: {target_feature} (First 50 samples)")
ax.set_xlabel("Sample position in batch")
ax.set_ylabel(target_feature)
ax.legend()
plt.show()

# Distribution view: density estimates over the full batches.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(batch_normal[target_feature], label='Normal', fill=True, ax=ax)
sns.kdeplot(batch_drifted_noise[target_feature], label='Noisy', fill=True, ax=ax)
ax.set_title(f"Noise Injection Distribution: {target_feature}")
ax.legend()
plt.show()


## 4. Scaling Drift

Multiplying features by a constant factor (e.g., a change of units).

In [None]:

# Draw a batch with scaling drift applied. `batch_normal` is the undrifted
# baseline batch created in the covariate-shift cell.
batch_drifted_scale, _ = sim.simulate_batch(1000, drift_type="scaling", drift_intensity=0.5)

# Build the comparison frame positionally. A DataFrame built from a dict of
# Series aligns them on index labels, which pads with NaN when the labels
# differ and raises when labels are duplicated; dropping the index first
# simply places the two columns side by side.
data_to_plot = pd.DataFrame({
    'Normal': batch_normal[target_feature].reset_index(drop=True),
    'Scaled': batch_drifted_scale[target_feature].reset_index(drop=True),
})

# One box per column: baseline vs. scaled values of the chosen feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data_to_plot, ax=ax)
ax.set_title(f"Scaling Drift: {target_feature} Range Comparison")
ax.set_ylabel(target_feature)
plt.show()
