In [1]:
# ------ River imports ---------

from river import stream

# Metrics
from river.metrics import Accuracy, Precision, Recall, F1, CohenKappa, BalancedAccuracy

# Drift Detectors
from river.drift import ADWIN, KSWIN, PageHinkley, DriftRetrainingClassifier
from river.drift.binary import *

# Classifiers
from river.linear_model import LogisticRegression
from river.forest import ARFClassifier
from river.tree import HoeffdingTreeClassifier, ExtremelyFastDecisionTreeClassifier

In [2]:
# ------ Sk-learn imports -------------

# sklearn classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Sklearn Metric
from sklearn.metrics import cohen_kappa_score, accuracy_score, f1_score

# Others
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [3]:
# ------ Basic python lib imports ----------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mpld3
import mplcursors
from collections import deque
import warnings
from scipy.stats import ks_2samp

# Blanket filter: no category or module is specified, so every warning raised
# after this line is suppressed (deprecation notices included).
warnings.filterwarnings("ignore")

In [4]:
# ------ Get "cfpdss" data --------------------------
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on a
# machine with this exact directory layout — consider a configurable data directory.
df_cfpdss = pd.read_csv("/Users/shreeyacy/GitHub/stream-viz/data/cfpdss.csv")

# Last expression of the cell: rendered as a preview of the first rows.
df_cfpdss.head()

Unnamed: 0,c5,c6,c7,c8,c9,n0,n1,n2,n3,n4,class
0,a,a,b,a,a,0.448573,1.212644,2.555515,0.242702,0.697604,A
1,a,a,a,b,b,1.902515,-0.467349,-0.744523,-0.017156,-0.380288,A
2,a,a,a,b,b,0.825045,1.002267,1.972207,-0.790095,0.233456,B
3,a,a,a,b,b,-0.124974,0.902601,1.696092,-0.320182,0.243261,A
4,b,a,a,b,a,-1.259765,0.793134,1.547527,-2.132172,-0.788367,B


In [12]:
# ------ Data pre-processing --------------
# Builds the encoded feature frame `X_df_encoded` and the encoded target `y_encoded`
# from `df_cfpdss`.

# Feature frame: every column except the target. The CSV also carries a column that
# pandas names "Unnamed: 0" (values 0, 1, 2, ...), i.e. the row index that was saved
# into the file. It is not a feature, so it is dropped here; otherwise it would be
# MinMax-scaled and fed to the models as a monotonically increasing input.
# `errors="ignore"` keeps the cell working if the column is absent.
X_df = df_cfpdss.drop(columns="class").drop(columns="Unnamed: 0", errors="ignore")

# Separating categorical and non-categorical columns
categorical_columns = X_df.select_dtypes(include=["object"]).columns.tolist()
X_df_categorical = X_df[categorical_columns]
# List comprehension (rather than a set difference) preserves the original column order.
non_cat_columns = [col for col in X_df.columns if col not in categorical_columns]
X_df_non_categorical = X_df[non_cat_columns]

# One hot encoding - Categorical data
# drop="if_binary" keeps a single 0/1 column for features with exactly two categories.
encoder = OneHotEncoder(sparse_output=False, drop="if_binary", dtype=np.int32)
one_hot_encoded = encoder.fit_transform(X_df_categorical)
columns = encoder.get_feature_names_out()
X_df_cat_one_hot = pd.DataFrame(one_hot_encoded, columns=columns)

# Feature scaling numerical data/non-categorical data
# NOTE(review): the scaler is fitted on the full dataset at once; for a streaming
# evaluation this uses min/max values from the future — confirm this is intended.
scaler = MinMaxScaler()
X_non_cat_df = pd.DataFrame(
    scaler.fit_transform(X_df_non_categorical), columns=scaler.get_feature_names_out()
)

# Concatenate categorical and non-categorical data
# Both frames were built without an explicit index, so they align row by row.
X_df_encoded = pd.concat(
    [
        X_df_cat_one_hot,
        X_non_cat_df,
    ],
    axis=1,
)

# Encoding the target variable
y_df = df_cfpdss[["class"]]
y_encoder = OneHotEncoder(sparse_output=False, drop="if_binary", dtype=np.int32)
y_one_hot = y_encoder.fit_transform(y_df)
# With a two-class target, drop="if_binary" yields a single column, so ravel() gives
# one 0/1 label per row. (A target with more than two classes would produce several
# columns and ravel() would interleave them.)
y_encoded = pd.Series(y_one_hot.ravel())


In [20]:
# Sizes (in number of samples) of each compared population and of the gap that
# separates them inside the sliding buffer.
window_size = 400
gap_size = 100

# The buffer must hold: reference window + gap + recent window. Deriving the length
# from `gap_size` (instead of a literal 100) keeps it consistent with the slicing
# done in get_population() when either size is changed.
buffer_length = 2 * window_size + gap_size

# Prime the deque with the first `buffer_length` labels; `maxlen` makes every later
# append() discard the oldest element, so the buffer slides over the stream.
window_x = deque(y_encoded.head(buffer_length), maxlen=buffer_length)

# Function to get populations P1 and P2 from the deque
def get_population(window, window_size, gap_size):
    """Split a buffer into two equally sized populations separated by a gap.

    Parameters
    ----------
    window : iterable
        Buffered samples (a deque in this notebook); it is copied into a list
        and not modified.
    window_size : int
        Number of samples in each population.
    gap_size : int
        Number of samples skipped between the two populations.

    Returns
    -------
    tuple[list, list]
        ``(P1, P2)`` where ``P1`` is the first ``window_size`` samples and ``P2``
        is the ``window_size`` samples that start ``gap_size`` positions after
        ``P1`` ends. Either list is shorter (possibly empty) when the buffer does
        not contain enough samples.
    """
    snapshot = list(window)
    second_start = window_size + gap_size
    first_population = snapshot[:window_size]
    second_population = snapshot[second_start:second_start + window_size]
    return first_population, second_population

# One significance level for the two-sample Kolmogorov-Smirnov test, shared by the
# initial-window check and the streaming loop (previously 0.001 vs 0.0001).
p_value_threshold = 0.0001

# Distance from the newest buffered sample back to the last sample of P1. Derived
# from `gap_size` instead of a literal 100 so the reported position stays correct
# when the gap is changed.
detection_offset = window_size + gap_size

# Position in `y_encoded` of the first sample that is not part of the initial buffer.
stream_start = window_size * 2 + gap_size

# Initialize P1 and P2 from the pre-filled buffer and test them once.
P1, P2 = get_population(window_x, window_size, gap_size)
pvalue = ks_2samp(P1, P2).pvalue
if pvalue < p_value_threshold:
    print(f"Gradual Drift Detected at {window_size} with p-value: {pvalue}")

# Slide the buffer over the rest of the stream one label at a time. `.iloc` makes
# the positional slicing explicit; `idx` is the position of the newly added label.
for idx, xi in enumerate(y_encoded.iloc[stream_start:], start=stream_start):
    window_x.append(xi)  # the deque's maxlen drops the oldest label automatically
    P1, P2 = get_population(window_x, window_size, gap_size)
    pvalue = ks_2samp(P1, P2).pvalue
    if pvalue < p_value_threshold:
        # Reported position = newest position minus (window_size + gap_size).
        print(f"Gradual Drift Detected at {idx - detection_offset}")


Gradual Drift Detected at 3056
Gradual Drift Detected at 3057
Gradual Drift Detected at 3058
Gradual Drift Detected at 3059
Gradual Drift Detected at 3060
Gradual Drift Detected at 3061
Gradual Drift Detected at 3062
Gradual Drift Detected at 3063
Gradual Drift Detected at 3066
Gradual Drift Detected at 3078
Gradual Drift Detected at 3079
Gradual Drift Detected at 3080
Gradual Drift Detected at 3081
Gradual Drift Detected at 3082
Gradual Drift Detected at 3083
Gradual Drift Detected at 3084
Gradual Drift Detected at 3085
Gradual Drift Detected at 3086
Gradual Drift Detected at 3087
Gradual Drift Detected at 3088
Gradual Drift Detected at 3089
Gradual Drift Detected at 3090
Gradual Drift Detected at 3091
Gradual Drift Detected at 3095
Gradual Drift Detected at 3096
Gradual Drift Detected at 3097
Gradual Drift Detected at 3098
Gradual Drift Detected at 3099
Gradual Drift Detected at 3103
Gradual Drift Detected at 3112
Gradual Drift Detected at 3130
Gradual Drift Detected at 3131
Gradual 