### Implementing Adversarial Validation for Data Drift
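Description: Create and train a classifier that distinguishes between the train and test datasets, and use its cross-validated ROC AUC to infer data drift: an AUC near 0.5 means the two sets are hard to tell apart (little drift), while an AUC close to 1.0 indicates substantial drift.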

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

def generate_sample_data():
    """Build a reproducible synthetic train/test pair with shifted features.

    The global NumPy RNG is seeded with 42, then two 1000-row frames are
    drawn. Both share the columns ``feature1``, ``feature2`` (normal draws)
    and ``feature3`` (integers in {0, 1}); the test frame uses larger means
    for the two normal columns (0.5 vs 0.0 and 6.0 vs 5.0).

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.
    """
    n_rows = 1000
    np.random.seed(42)

    def _draw_frame(mean1, mean2):
        # The columns are drawn in a fixed order so the sequence of values
        # pulled from the seeded global RNG stays the same on every call.
        first = np.random.normal(loc=mean1, scale=1.0, size=n_rows)
        second = np.random.normal(loc=mean2, scale=1.5, size=n_rows)
        third = np.random.randint(0, 2, size=n_rows)
        return pd.DataFrame({
            'feature1': first,
            'feature2': second,
            'feature3': third,
        })

    # Train must be drawn before test to keep the RNG stream order.
    train_df = _draw_frame(0.0, 5.0)
    test_df = _draw_frame(0.5, 6.0)
    return train_df, test_df

def check_input_validity(train_df, test_df):
    """Validate that two frames can be combined for adversarial validation.

    Args:
        train_df: Training-set DataFrame.
        test_df: Test-set DataFrame.

    Raises:
        ValueError: If either frame is empty, or if the two frames do not
            contain exactly the same set of column names (column order is
            ignored).
    """
    if train_df.empty or test_df.empty:
        raise ValueError("Input DataFrames must not be empty.")

    train_cols = set(train_df.columns)
    test_cols = set(test_df.columns)
    if train_cols != test_cols:
        # Compare in both directions: a column present in only one frame
        # would be filled with NaN for every row of the other frame once the
        # two are concatenated.
        missing_from_test = sorted(train_cols - test_cols, key=str)
        missing_from_train = sorted(test_cols - train_cols, key=str)
        raise ValueError(
            "Train and test datasets must have the same columns. "
            f"Missing from test: {missing_from_test}; "
            f"missing from train: {missing_from_train}."
        )

def adversarial_validation(train_df, test_df):
    """Score how well a classifier can tell train rows from test rows.

    Each row is labelled with its origin (0 = train, 1 = test), the frames
    are stacked and shuffled, and a random forest is cross-validated on the
    task of predicting that label. The inputs are not modified.

    Args:
        train_df: Training-set DataFrame.
        test_df: Test-set DataFrame with the same columns as ``train_df``.

    Returns:
        A list with one ROC AUC score per fold (5 stratified folds).
        Scores near 0.5 mean the classifier cannot separate the two sets;
        scores near 1.0 mean it separates them easily.

    Raises:
        ValueError: Propagated from ``check_input_validity`` when the inputs
            are rejected.
    """
    check_input_validity(train_df, test_df)

    # assign() returns new frames, so the caller's DataFrames do not gain an
    # 'is_test' column as a side effect of calling this function.
    labelled_train = train_df.assign(is_test=0)
    labelled_test = test_df.assign(is_test=1)

    combined_df = pd.concat([labelled_train, labelled_test], axis=0)
    combined_df = shuffle(combined_df, random_state=42).reset_index(drop=True)

    X = combined_df.drop('is_test', axis=1)
    y = combined_df['is_test']

    # One-hot encode any object/categorical columns so the forest can use them.
    X = pd.get_dummies(X)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    auc_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        # fit() is called again on the same estimator for every fold.
        clf.fit(X_train, y_train)
        # Column 1 holds the predicted probability of the "test" class.
        preds = clf.predict_proba(X_val)[:, 1]
        auc_scores.append(roc_auc_score(y_val, preds))

    return auc_scores


In [5]:
import unittest

class TestAdversarialValidation(unittest.TestCase):
    """Unit tests for ``adversarial_validation`` and its input checks."""

    def test_valid_data(self):
        """Every fold's AUC on the synthetic data lies in [0.5, 1.0]."""
        sample_train, sample_test = generate_sample_data()
        fold_scores = adversarial_validation(
            sample_train.copy(), sample_test.copy()
        )
        for score in fold_scores:
            self.assertTrue(
                0.5 <= score <= 1.0, "AUCs should be between 0.5 and 1"
            )

    def test_empty_data(self):
        """Two empty frames are rejected with ValueError."""
        self.assertRaises(
            ValueError, adversarial_validation, pd.DataFrame(), pd.DataFrame()
        )

    def test_mismatched_columns(self):
        """A test frame lacking a train column is rejected with ValueError."""
        sample_train, sample_test = generate_sample_data()
        reduced_test = sample_test.drop(columns=["feature3"])
        self.assertRaises(
            ValueError, adversarial_validation, sample_train, reduced_test
        )

# Run the tests when this file is executed as a script or notebook cell.
# argv=[''] stops unittest from parsing the host process's command-line
# arguments, and exit=False stops it from calling sys.exit() after the run.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)


...
----------------------------------------------------------------------
Ran 3 tests in 1.246s

OK
