In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score
import sys
# XAdapt-Drift imports -- currently disabled and kept only for reference.
# NOTE(review): the relative imports below cannot resolve from a notebook cell
# (there is no parent package), and the sys.path entry is a machine-specific
# absolute path. Prefer installing the package, or a path relative to the
# repository root, before re-enabling any of these lines.
# `create_synthetic_data` is defined in a later cell of this notebook, so the
# `advanced_example` import is not needed for the cells that follow.
# from .xadapt_drift import XAdaptDrift
# from ..xadapt_drift.adapters.sklearn_adapter import SklearnAdapter
# sys.path.append('/home/alexandress/Documents/tcc/XDrift-Analyzer/examples')  # Adjust path to import XAdapt-Drift
# from advanced_example import create_synthetic_data

# Global plotting style for every figure in the notebook.
# The matplotlib style sheet is applied first; seaborn's palette is set afterwards.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require matplotlib >= 3.6 -- confirm
# against the pinned environment.
plt.style.use('seaborn-v0_8-pastel')
sns.set_palette('pastel')



In [16]:
def create_synthetic_data(n_samples=10000, n_cat_features=3, n_num_features=5, seed=42):
    """Create a reproducible synthetic binary-classification dataset with mixed feature types.

    Numerical features are standard-normal draws and categorical features are
    integer codes drawn from {0, 1, 2}. The binary target is 1 where a noisy
    linear score is positive. The score is built from:

    * the sum of (up to) the first two numerical features, weight 0.5;
    * whether ``cat_0`` equals 2, weight +0.8 (only if ``cat_0`` exists);
    * whether ``cat_1`` equals 0, weight -0.5 (only if ``cat_1`` exists);
    * Gaussian noise with scale 0.1.

    Args:
        n_samples: Number of rows to generate.
        n_cat_features: Number of categorical columns (``cat_0`` ...). Values
            below 2 are allowed; the missing categorical terms are simply left
            out of the target score.
        n_num_features: Number of numerical columns (``num_0`` ...).
        seed: Seed passed to ``np.random.seed``.

    Returns:
        Tuple ``(df, y, numerical_cols, categorical_cols)`` where ``df`` is the
        feature DataFrame, ``y`` is a 0/1 integer array of length
        ``n_samples``, and the two lists hold the column names of each kind.

    Notes:
        * This reseeds NumPy's *global* random state, so later ``np.random``
          calls in the same kernel are affected by it.
        * Numerical and categorical arrays are stacked into a single float
          array before building the DataFrame, so the category labels are the
          floats ``0.0``, ``1.0`` and ``2.0`` rather than integers.
    """
    np.random.seed(seed)

    # The draw order (numerical -> categorical -> noise) is part of the
    # reproducibility contract: changing it would change the data for a seed.
    X_numerical = np.random.randn(n_samples, n_num_features)
    X_categorical = np.random.randint(0, 3, size=(n_samples, n_cat_features))
    noise = np.random.randn(n_samples)

    # Build the target score term by term. The categorical terms are guarded so
    # that n_cat_features of 0 or 1 no longer fails with an IndexError.
    score = 0.5 * np.sum(X_numerical[:, :2], axis=1)
    if n_cat_features >= 1:
        score = score + 0.8 * (X_categorical[:, 0] == 2).astype(int)
    if n_cat_features >= 2:
        score = score - 0.5 * (X_categorical[:, 1] == 0).astype(int)
    y = (score + 0.1 * noise) > 0

    # Combine features (the integer codes are upcast to float here).
    X = np.hstack([X_numerical, X_categorical])

    # Create feature names
    numerical_cols = [f'num_{i}' for i in range(n_num_features)]
    categorical_cols = [f'cat_{i}' for i in range(n_cat_features)]
    feature_names = numerical_cols + categorical_cols

    # Convert to DataFrame
    df = pd.DataFrame(X, columns=feature_names)

    # Mark the categorical columns with pandas' category dtype.
    for col in categorical_cols:
        df[col] = df[col].astype('category')

    return df, y.astype(int), numerical_cols, categorical_cols

In [17]:
def induce_drift(df, num_cols, cat_cols, drift_type='mean_shift', seed=None):
    """Return a copy of ``df`` with one kind of synthetic drift applied.

    The input DataFrame is never modified; all changes are made on a copy.

    Args:
        df: Original DataFrame.
        num_cols: List of numerical feature names. ``num_cols[0]`` is used for
            mean shifts and ``num_cols[1]`` for variance changes.
        cat_cols: List of categorical feature names. ``cat_cols[0]`` is used
            for category-frequency drift.
        drift_type: Type of drift to induce:

            * ``'mean_shift'`` -- add 1.5 standard deviations to ``num_cols[0]``.
            * ``'variance_change'`` -- multiply ``num_cols[1]`` by 2.0.
            * ``'category_frequency'`` -- overwrite a random ~40% of the rows
              of ``cat_cols[0]`` with its least common category.
            * ``'multiple'`` -- milder versions of all three (1.2 std shift,
              x1.8 scaling, ~30% category overwrite).
        seed: Optional seed for the random row mask used by
            ``'category_frequency'`` and ``'multiple'``. When ``None`` (the
            default) NumPy's global random state is used, so results depend on
            whatever seeded it last.

    Returns:
        drifted_df: DataFrame with induced drift.
        drifted_features: List of features that were changed.

    Raises:
        ValueError: If ``drift_type`` is not one of the supported values.
    """
    valid_types = ('mean_shift', 'variance_change', 'category_frequency', 'multiple')
    if drift_type not in valid_types:
        # Without this check an unknown type fell through every branch and the
        # function failed at `return` with an unhelpful UnboundLocalError.
        raise ValueError(
            f"Unknown drift_type {drift_type!r}; expected one of {valid_types}"
        )

    # `np.random` and `RandomState` both expose `.choice`, so the branches
    # below do not need to know which one they are using.
    rng = np.random if seed is None else np.random.RandomState(seed)

    drifted_df = df.copy()

    if drift_type == 'mean_shift':
        # Shift the mean of the first numerical feature
        feature = num_cols[0]
        shift = 1.5 * drifted_df[feature].std()
        drifted_df[feature] += shift
        drifted_features = [feature]

    elif drift_type == 'variance_change':
        # Scaling by 2.0 doubles the standard deviation of the second numerical feature
        feature = num_cols[1]
        drifted_df[feature] = drifted_df[feature] * 2.0
        drifted_features = [feature]

    elif drift_type == 'category_frequency':
        # Change the distribution of a categorical feature
        feature = cat_cols[0]
        # Find the least common category
        least_common = drifted_df[feature].value_counts().idxmin()
        # Make it more common by overwriting a random subset of rows
        mask = rng.choice([True, False], size=len(drifted_df), p=[0.4, 0.6])
        drifted_df.loc[mask, feature] = least_common
        drifted_features = [feature]

    else:  # drift_type == 'multiple'
        # Shift mean of first numerical feature
        drifted_df[num_cols[0]] += 1.2 * drifted_df[num_cols[0]].std()
        # Increase variance of second numerical feature
        drifted_df[num_cols[1]] = drifted_df[num_cols[1]] * 1.8
        # Change categorical distribution
        feature = cat_cols[0]
        mask = rng.choice([True, False], size=len(drifted_df), p=[0.3, 0.7])
        drifted_df.loc[mask, feature] = drifted_df[feature].value_counts().idxmin()
        drifted_features = [num_cols[0], num_cols[1], cat_cols[0]]

    return drifted_df, drifted_features

In [18]:
# Build the reference (pre-drift) dataset. create_synthetic_data seeds NumPy's
# global RNG (default seed=42), so this cell yields the same frame on every run.
reference_df, y_ref, numerical_cols, categorical_cols = create_synthetic_data(n_samples=10000)
print(f"Created dataset with {len(numerical_cols)} numerical features and {len(categorical_cols)} categorical features")

Created dataset with 5 numerical features and 3 categorical features


In [19]:
# Preview the reference frame. The cat_* columns display as 0.0/1.0/2.0 because
# create_synthetic_data stacks the integer codes with the float features
# before building the DataFrame.
reference_df

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,cat_0,cat_1,cat_2
0,0.496714,-0.138264,0.647689,1.523030,-0.234153,2.0,0.0,1.0
1,-0.234137,1.579213,0.767435,-0.469474,0.542560,2.0,1.0,1.0
2,-0.463418,-0.465730,0.241962,-1.913280,-1.724918,0.0,1.0,1.0
3,-0.562288,-1.012831,0.314247,-0.908024,-1.412304,1.0,0.0,1.0
4,1.465649,-0.225776,0.067528,-1.424748,-0.544383,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...
9995,-1.153965,0.265739,-0.192730,1.527117,-2.505187,0.0,2.0,0.0
9996,0.040727,0.433389,0.897684,0.143323,0.364795,0.0,2.0,2.0
9997,0.176032,0.439752,-0.019282,2.116179,0.577845,1.0,0.0,1.0
9998,0.197389,-0.738124,-0.342293,1.510120,-2.484976,1.0,1.0,0.0


In [20]:
# include='all' reports numeric statistics for the num_* columns and
# unique/top/freq for the categorical cat_* columns in a single table.
reference_df.describe(include='all')

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,cat_0,cat_1,cat_2
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
unique,,,,,,3.0,3.0,3.0
top,,,,,,0.0,1.0,1.0
freq,,,,,,3404.0,3351.0,3368.0
mean,2e-06,0.014416,-0.010528,-0.004993,-0.001002,,,
std,1.014868,0.997083,1.005219,0.994265,0.98924,,,
min,-4.295391,-4.465604,-3.6352,-3.631539,-3.856375,,,
25%,-0.696539,-0.654058,-0.704792,-0.675572,-0.664076,,,
50%,0.012872,0.006485,-0.019121,0.006501,-0.00021,,,
75%,0.683884,0.683468,0.676844,0.662879,0.679485,,,


In [21]:
# Binary target for the reference data, as a 0/1 integer array with one entry per row.
y_ref

array([1, 1, 0, ..., 0, 0, 0], shape=(10000,))

In [23]:
# Names of the numerical feature columns returned by create_synthetic_data.
numerical_cols

['num_0', 'num_1', 'num_2', 'num_3', 'num_4']

In [24]:
# Names of the categorical feature columns returned by create_synthetic_data.
categorical_cols

['cat_0', 'cat_1', 'cat_2']