In [1]:
# Data Drift Impact on Model
# Question: Use a simple linear regression model to demonstrate how data drift affects model predictions.

# 1. Train a model on the original data:
# 2. Evaluate on the drifted data:
# 3. Compare errors:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 1. Original data (training)
np.random.seed(42)
X_train = np.random.rand(100, 1) * 10
y_train = 2 * X_train.squeeze() + 1 + np.random.randn(100) * 2

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# 2. Drifted data (shifted feature distribution)
X_drifted = np.random.rand(100, 1) * 10 + 5  # Shifted distribution
y_drifted = 2 * X_drifted.squeeze() + 1 + np.random.randn(100) * 2

# Evaluate on original and drifted data
y_pred_train = model.predict(X_train)
y_pred_drifted = model.predict(X_drifted)

# 3. Compare errors
train_error = mean_squared_error(y_train, y_pred_train)
drifted_error = mean_squared_error(y_drifted, y_pred_drifted)

print(f"Training MSE: {train_error:.2f}")
print(f"Drifted data MSE: {drifted_error:.2f}")


Training MSE: 3.23
Drifted data MSE: 3.65


In [2]:
# Monitoring Data Distribution Changes
# Question: Use Python to monitor distribution changes in features to detect potential data drift.

# 1. Calculate feature statistics (mean and standard deviation) for both original and drifted data:
# 2. Compare statistics:
# 3. Set thresholds to detect significant drift:

import numpy as np
from scipy.stats import ks_2samp

# 1. Original and drifted data
np.random.seed(42)
original_data = np.random.normal(0, 1, 1000)
drifted_data = np.random.normal(1, 1.2, 1000)  # Mean and std deviation changed

# 2. Calculate statistics
original_mean, original_std = np.mean(original_data), np.std(original_data)
drifted_mean, drifted_std = np.mean(drifted_data), np.std(drifted_data)

# 3. Compare distributions and detect drift
ks_stat, p_value = ks_2samp(original_data, drifted_data)

# Set thresholds
MEAN_THRESHOLD = 0.5
STD_THRESHOLD = 0.3
KS_P_THRESHOLD = 0.05

print(f"Original mean: {original_mean:.2f}, Drifted mean: {drifted_mean:.2f}")
print(f"Original std: {original_std:.2f}, Drifted std: {drifted_std:.2f}")
print(f"KS-test p-value: {p_value:.4f}")

# Drift detection
mean_drift = abs(original_mean - drifted_mean) > MEAN_THRESHOLD
std_drift = abs(original_std - drifted_std) > STD_THRESHOLD
ks_drift = p_value < KS_P_THRESHOLD

if mean_drift or std_drift or ks_drift:
    print("Warning: Significant data drift detected!")
    if mean_drift:
        print(f"- Mean changed by {abs(original_mean - drifted_mean):.2f}")
    if std_drift:
        print(f"- Std deviation changed by {abs(original_std - drifted_std):.2f}")
    if ks_drift:
        print("- Distribution shape changed (KS-test)")
else:
    print("No significant drift detected")


Original mean: 0.02, Drifted mean: 1.09
Original std: 0.98, Drifted std: 1.20
KS-test p-value: 0.0000
- Mean changed by 1.07
- Distribution shape changed (KS-test)


In [3]:
# Automating Data Quality Checks with Python
# Question: Automate a basic data validation process using Python to ensure the dataset's
# structural integrity.

# 1. Define validation checks:
# 2. Apply validation:

import pandas as pd

def validate_data(df):
    # 1. Define validation checks
    checks = {
        'missing_values': df.isnull().sum().sum(),
        'duplicate_rows': df.duplicated().sum(),
        'negative_values': (df.select_dtypes(include=['number']) < 0).sum().sum(),
        'zero_values': (df.select_dtypes(include=['number']) == 0).sum().sum(),
        'data_types': df.dtypes
    }
    
    # 2. Apply validation and report results
    print("Data Quality Report:")
    print(f"Total rows: {len(df)}")
    print(f"Missing values: {checks['missing_values']}")
    print(f"Duplicate rows: {checks['duplicate_rows']}")
    print(f"Negative values in numeric columns: {checks['negative_values']}")
    print(f"Zero values in numeric columns: {checks['zero_values']}")
    print("\nData Types:")
    print(checks['data_types'])
    
    # Return validation status
    if checks['missing_values'] > 0 or checks['duplicate_rows'] > 0:
        return False
    return True

# Example usage
data = {
    'id': [1, 2, 3, 4, 4],
    'value': [10, -5, 0, 15, None],
    'category': ['A', 'B', 'A', 'C', 'C']
}

df = pd.DataFrame(data)
is_valid = validate_data(df)

print(f"\nData validation status: {'Passed' if is_valid else 'Failed'}")


Data Quality Report:
Total rows: 5
Missing values: 1
Duplicate rows: 0
Negative values in numeric columns: 1
Zero values in numeric columns: 1

Data Types:
id            int64
value       float64
category     object
dtype: object

Data validation status: Failed


In [4]:
# Introducing Great Expectations for Data Validation
# Question: Use Great Expectations to set up data validation checks for a dataset.

# 1. Install Great Expectations:
# 2. Create a new expectations suite:
# 3. Load data and generate expectations:

import great_expectations as ge
from great_expectations.core.batch import BatchRequest
from great_expectations.data_context.types.base import DataContextConfig
import pandas as pd
df=pd.read_csv("/workspaces/AI_DATA_ANALYSIS_/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css")
context=ge.get_context()
suite_name="my_suite"
context.add_or_update_expectation_suite(expectation_suite_name=suite_name)
validator=context.sources.pandas_default.read_dataframe(df).validate(expectation_suite_name=suite_name)
validator.expect_column_to_exist("column_name")
validator.expect_column_values_to_not_be_null("column_name")
validator.expect_column_values_to_be_unique("column_name")


FileNotFoundError: [Errno 2] No such file or directory: '/workspaces/AI_DATA_ANALYSIS_/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css'

In [None]:
# Automating Constraint Checks with Python
# Question: Automate primary key and foreign key constraint checks using Python to ensure dataset compliance.


# 1. Assuming datasets exist with primary and foreign key relationships in pandas dataframes employees_df and departments_df :

import pandas as pd
employees_df=pd.DataFrame({'employee_id':[1,2,3,4],'department_id':[10,20,10,30]})
departments_df=pd.DataFrame({'department_id':[10,20]})
def check_primary_key(df,key_column):return df[key_column].is_unique
def check_foreign_key(foreign_df,foreign_key,primary_df,primary_key):return foreign_df[foreign_key].isin(primary_df[primary_key]).all()
pk_valid=check_primary_key(employees_df,'employee_id')
fk_valid=check_foreign_key(employees_df,'department_id',departments_df,'department_id')
print(f"Primary Key Valid: {pk_valid}")
print(f"Foreign Key Valid: {fk_valid}")


In [None]:
# Advanced Data Drift Detection using Statistical Tests
# Question: Implement Kolmogorov-Smirnov test using Python to detect data drift at a more sophisticated level.

# 1. Use SciPy to perform KS test:

import pandas as pd
import numpy as np
from scipy import stats

def detect_ks_drift(reference_data: pd.Series, current_data: pd.Series, alpha: float = 0.05) -> dict:
    """
    Detects data drift between two numerical datasets using the Kolmogorov-Smirnov test.

    Args:
        reference_data (pd.Series): The reference dataset (e.g., historical data).
        current_data (pd.Series): The current dataset to compare against the reference.
        alpha (float, optional): The significance level for the test. Defaults to 0.05.

    Returns:
        dict: A dictionary containing the KS statistic, p-value, and a boolean indicating drift.
    """
    ks_statistic, p_value = stats.ks_2samp(reference_data, current_data)
    drift_detected = p_value < alpha
    return {
        "ks_statistic": ks_statistic,
        "p_value": p_value,
        "drift_detected": drift_detected,
        "alpha": alpha
    }

# Example Usage:
# Generate some synthetic data
np.random.seed(42)
reference_sample = np.random.normal(loc=0, scale=1, size=1000)
current_sample_no_drift = np.random.normal(loc=0.1, scale=1.1, size=1000)
current_sample_drift = np.random.normal(loc=0.5, scale=1.5, size=1000)

# Convert to pandas Series for easier handling
reference_series = pd.Series(reference_sample)
no_drift_series = pd.Series(current_sample_no_drift)
drift_series = pd.Series(current_sample_drift)

# Detect drift when there is likely no significant drift (small changes)
drift_report_no_drift = detect_ks_drift(reference_series, no_drift_series)
print("KS Test - No Significant Drift:")
print(f"  KS Statistic: {drift_report_no_drift['ks_statistic']:.4f}")
print(f"  P-value: {drift_report_no_drift['p_value']:.4f}")
print(f"  Drift Detected (alpha={drift_report_no_drift['alpha']}): {drift_report_no_drift['drift_detected']}")

print("\n" + "="*30 + "\n")

# Detect drift when there is a more noticeable change
drift_report_drift = detect_ks_drift(reference_series, drift_series)
print("KS Test - Significant Drift:")
print(f"  KS Statistic: {drift_report_drift['ks_statistic']:.4f}")
print(f"  P-value: {drift_report_drift['p_value']:.4f}")
print(f"  Drift Detected (alpha={drift_report_drift['alpha']}): {drift_report_drift['drift_detected']}")
        
