In [1]:
from sklearn.datasets import make_classification

# Generate a 2-class classification dataset with 5 features and 200 instances
X, y = make_classification(n_samples=200, n_features=6, n_informative=2, n_redundant=0, n_clusters_per_class=1, n_classes=2, random_state=0)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(random_state=0, max_iter=1000)
model.fit(X_train, y_train)

In [4]:
model.coef_

array([[ 2.99866316,  0.5183853 ,  0.23212027,  0.2242117 , -0.06942065,
        -0.19618184]])

In [5]:
from AITIA.syboid import SyBoid
from AITIA.complexity_measures import F1, N1
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from scipy.stats import multivariate_normal
from tqdm import tqdm
from sklearn.neighbors import KNeighborsRegressor

In [6]:
from sklearn.model_selection import StratifiedKFold
import itertools
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [18]:
class ConceptDriftAnalysis:
    
    def __init__(self, X, y, f1_target=1, n1_target=1, pop_size=40, n_gen=10):
        self.pop_size = pop_size
        self.n_gen = n_gen
        self.f1_target = f1_target
        self.n1_target = n1_target
        self.X = X
        self.y = y

        self.prepare_analysis()


    def prepare_analysis(self):

        # drifted dataset
        syboid = SyBoid(F1_Score=self.f1_target, 
                             N1_Score=self.n1_target, 
                             X=self.X, 
                             y=self.y, 
                             Mimic_Classes=True, 
                             Mimic_DataTypes=True,
                             Mimic_Dataset=True)
        
        syboid.Generate_Data(pop_size=self.pop_size, n_gen=self.n_gen)

        self.X_, self.y_ = syboid.return_best_dataset()


    def evaluate_model_with_synthetic_data(self, model, n_splits, scoring='accuracy'):
        """
        Evaluate a model with synthetic data using k-fold cross-validation.

        Parameters:
        - model: Machine learning model
        - n_splits: Number of stratified folds for cross-validation
        - evaluation_metric: String representing the evaluation metric ('accuracy', 'roc_auc', 'precision', 'recall', 'f1'), default is 'accuracy'

        Returns:
        - performance_scores: List of performance scores for each n
        """

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        splits = []
        for _, test_index in skf.split(self.X_, self.y_):
            splits.append(test_index)

        results = {}

        for n_folds in range(n_splits + 1):
            
            f1_scores = []
            n1_scores = []
            performance = []

            if n_folds > 0:
                folds_included = itertools.combinations(range(n_splits), r=n_folds)

                for folds in folds_included:
                    idx = np.hstack([splits[f] for f in folds])

                    X = np.concatenate([self.X, self.X_[idx]])
                    y = np.concatenate([self.y, self.y_[idx]])    

                    f1_scores.append(F1(X, y))
                    n1_scores.append(N1(X, y))

                    score = cross_val_score(model, X, y, cv = skf, scoring = scoring)
                    performance.append(np.mean(score))

            else:
                f1_scores.append(F1(self.X, self.y))
                n1_scores.append(N1(self.X, self.y))

                score = cross_val_score(model, self.X, self.y, cv = skf, scoring = scoring)
                performance.append(np.mean(score))

            try:
                synthetic_percentage = self.X.shape[0]/(self.X.shape[0]+idx.shape[0])
            except NameError:
                synthetic_percentage = 0

            results[synthetic_percentage] = {
                                            'mean_f1': np.mean(f1_scores),
                                            'std_f1': np.std(f1_scores),
                                            'mean_n1': np.mean(n1_scores),
                                            'std_n1': np.std(n1_scores),
                                            'mean_score': np.mean(performance),
                                            'std_score': np.std(performance)
                                            }
            
        return results

In [19]:
ca = ConceptDriftAnalysis(X, y, f1_target=1, n1_target=1)

Gen: 1, Best: 0.715, Avg: 1.162, Worst: 1.414
Gen: 2, Best: 0.715, Avg: 0.994, Worst: 1.220
Gen: 3, Best: 0.715, Avg: 0.927, Worst: 1.108
Gen: 4, Best: 0.625, Avg: 0.877, Worst: 1.077
Gen: 5, Best: 0.625, Avg: 0.844, Worst: 1.077
Gen: 6, Best: 0.530, Avg: 0.792, Worst: 0.955
Gen: 7, Best: 0.530, Avg: 0.810, Worst: 0.955
Gen: 8, Best: 0.530, Avg: 0.756, Worst: 0.925
Gen: 9, Best: 0.530, Avg: 0.740, Worst: 0.925


In [20]:
ca.evaluate_model_with_synthetic_data(model, n_splits=5)

[0.9697080604193874, 0.9951749877576729, 0.9442896156586883, 0.953290128902258, 0.9960198404808019]
[0.9953017485830834, 0.9233307106446491, 0.931037787214122, 0.9957981142313005, 0.9938849974169468, 0.9959336851562859, 0.9975977363565386, 0.907467457200119, 0.9914843719119454, 0.995643761932847]
[0.994052897788852, 0.9959517490515284, 0.9975177248627866, 0.8928987727106508, 0.9894194372048252, 0.9936212662166234, 0.9947297254407528, 0.9965441362015024, 0.997940584002812, 0.9889235562889729]
[0.9947880441984476, 0.9964991032042743, 0.9978340819559998, 0.9870532057912416, 0.9969807950776864]
[0.996899747743998]


{0: {'mean_f1': 0.3461590859435389,
  'std_f1': 0.0,
  'mean_n1': 0.2,
  'std_n1': 0.0,
  'mean_score': 0.9200000000000002,
  'std_score': 0.0},
 0.8333333333333334: {'mean_f1': 0.9716965266437617,
  'std_f1': 0.021150650003157075,
  'mean_n1': 0.30416666666666664,
  'std_n1': 0.09545214042184236,
  'mean_score': 0.8833333333333332,
  'std_score': 0.016873713942763845},
 0.7142857142857143: {'mean_f1': 0.9727480370647837,
  'std_f1': 0.03458390456151566,
  'mean_n1': 0.3264285714285714,
  'std_n1': 0.0999515188601403,
  'mean_score': 0.8828571428571428,
  'std_score': 0.02647678998978665},
 0.625: {'mean_f1': 0.9841599849769306,
  'std_f1': 0.03055947368604563,
  'mean_n1': 0.34374999999999994,
  'std_n1': 0.06976523131187913,
  'mean_score': 0.8859375,
  'std_score': 0.020881454840360143},
 0.5555555555555556: {'mean_f1': 0.9946310460455299,
  'std_f1': 0.0039171088431583945,
  'mean_n1': 0.3438888888888889,
  'std_n1': 0.014098419489388364,
  'mean_score': 0.8922222222222222,
  'std_