# In [2]:
# Question: Detecting Data Drift
# Description: Identify potential data drift between two time periods for a numeric attribute.
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp

def detect_data_drift(data, column_name, time_col, time_period_1, time_period_2,
                      *, alpha=0.05):
    """Compare a numeric column across two time-based subsets and report drift.

    The function prints descriptive statistics for both subsets, runs a
    two-sample Kolmogorov-Smirnov test on them, and prints whether the
    p-value falls below ``alpha``. The input DataFrame is not modified.

    Args:
        data: DataFrame holding both ``column_name`` and ``time_col``.
        column_name: Name of the column to compare; values that cannot be
            converted to numbers are dropped before testing.
        time_col: Name of the column parsed with ``pd.to_datetime`` and used
            to select rows.
        time_period_1: Lower bound; subset 1 is every row whose timestamp is
            at or after this value.
        time_period_2: Upper bound; subset 2 is every row whose timestamp is
            at or before this value.
        alpha: Significance threshold for the KS p-value (default 0.05).

    Raises:
        ValueError: If either column is missing, if ``alpha`` is not strictly
            between 0 and 1, or if either subset has no valid numeric values.
    """
    # Ensure both columns exist in the dataset
    if column_name not in data.columns or time_col not in data.columns:
        raise ValueError(f"Column '{column_name}' or '{time_col}' not found in the dataset.")

    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha!r}.")

    # Parse timestamps into a local Series instead of writing a helper column
    # into ``data``: the caller's frame must not gain (or have overwritten) a
    # 'Time' column as a side effect of running a check.
    timestamps = pd.to_datetime(data[time_col])

    # NOTE(review): subset 1 is "at or after time_period_1" and subset 2 is
    # "at or before time_period_2", so the two samples share rows whenever
    # time_period_1 <= time_period_2 -- confirm this selection is intended.
    in_period_1 = timestamps >= time_period_1
    in_period_2 = timestamps <= time_period_2

    # Coerce to numeric and drop anything that is not a valid number
    values = data[column_name]
    col_data_1 = pd.to_numeric(values[in_period_1], errors='coerce').dropna()
    col_data_2 = pd.to_numeric(values[in_period_2], errors='coerce').dropna()

    # The KS test needs at least one observation on each side
    if col_data_1.empty or col_data_2.empty:
        raise ValueError("One or both time periods have no valid data.")

    # Descriptive statistics
    desc_1 = col_data_1.describe()
    desc_2 = col_data_2.describe()

    print(f"Descriptive statistics for {time_period_1} to {time_period_2}:\n")
    print(f"Period 1 ({time_period_1}):\n", desc_1)
    print(f"Period 2 ({time_period_2}):\n", desc_2)

    # Two-sample Kolmogorov-Smirnov test (distribution comparison)
    ks_stat, ks_p_value = ks_2samp(col_data_1, col_data_2)

    print(f"\nKS Statistic: {ks_stat}")
    print(f"KS P-value: {ks_p_value}")

    if ks_p_value < alpha:
        print(f"Potential data drift detected: p-value = {ks_p_value}")
    else:
        print(f"No significant data drift detected: p-value = {ks_p_value}")
