### Implementing Adversarial Validation for Data Drift
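Description: Create and train a classifier that tries to distinguish rows of the train dataset from rows of the test dataset, then use its cross-validated ROC AUC to infer data drift: an AUC close to 0.5 means the two sets are indistinguishable, while a higher AUC means their feature distributions differ.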

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

# Step 1: Build synthetic train and test frames
def _simulate_features(mean1, mean2, n_rows=1000):
    """Draw one synthetic frame from NumPy's global random stream.

    feature1 ~ Normal(mean1, 1.0), feature2 ~ Normal(mean2, 1.5) and
    feature3 is a random integer in {0, 1}. The columns are drawn in that
    order, so the result depends on the current state of the global
    generator.
    """
    return pd.DataFrame({
        'feature1': np.random.normal(mean1, 1.0, n_rows),
        'feature2': np.random.normal(mean2, 1.5, n_rows),
        'feature3': np.random.randint(0, 2, n_rows),
    })


# Fix the seed once; both frames consume the same stream, train first.
np.random.seed(42)

# Reference (train) distribution
train_df = _simulate_features(0.0, 5.0)

# Test distribution: the means of feature1 and feature2 are shifted
# (+0.5 and +1.0) to introduce drift; feature3 is drawn the same way.
test_df = _simulate_features(0.5, 6.0)

# Step 2: Tag every row with its origin (0 = train, 1 = test).
# This column is the target of the adversarial classifier; note that it is
# written onto train_df and test_df themselves.
test_df['is_test'] = 1
train_df['is_test'] = 0

# Step 3: Stack the two frames row-wise, shuffle the rows with a fixed seed,
# and renumber the index so it no longer reflects the original row positions.
combined_df = shuffle(
    pd.concat((train_df, test_df)),
    random_state=42,
).reset_index(drop=True)

# Step 4: Split into the origin label and the feature matrix
y = combined_df['is_test']
X = combined_df.drop(columns='is_test')

# Step 5: Cross-validate a classifier that tries to tell train rows from test rows
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# One ROC AUC per fold, measured on that fold's held-out rows.
auc_scores = []

for train_idx, val_idx in cv.split(X, y):
    # Positional row selection for the fitting and held-out parts of this fold
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # fit() returns the estimator itself, so the held-out rows can be scored
    # in the same expression; column 1 is the probability of is_test == 1.
    preds = clf.fit(X_train, y_train).predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, preds)
    auc_scores.append(auc)

# Step 6: Report the per-fold scores and their average
print("Adversarial Validation AUC scores:", auc_scores)
mean_auc = np.mean(auc_scores)
print("Mean AUC:", mean_auc)

# Interpretation: the first cutoff that the mean AUC strictly exceeds decides
# the message; if it exceeds none of them, no significant drift is reported.
drift_levels = (
    (0.75, "⚠️ Significant data drift detected."),
    (0.6, "⚠️ Moderate data drift detected."),
)
verdict = next(
    (message for cutoff, message in drift_levels if mean_auc > cutoff),
    "✅ No significant data drift detected.",
)
print(verdict)


Adversarial Validation AUC scores: [0.62235, 0.6172625, 0.621225, 0.630175, 0.6422749999999999]
Mean AUC: 0.6266575
⚠️ Moderate data drift detected.
