In diesem Notebook werden synthetische Datenströme (Streams) generiert und anschließend jeweils als statischer Datensatz in einer CSV-Datei gespeichert.

In [1]:
import numpy as np
import pandas as pd

from skmultiflow.data import ConceptDriftStream
from skmultiflow.data.agrawal_generator import AGRAWALGenerator
from skmultiflow.data.sea_generator import SEAGenerator
from skmultiflow.data.led_generator_drift import LEDGeneratorDrift

LED Generator -> led_g1

1. 100.000 Instanzen
2. Graduelle Drifts -> 1 Drift Feature
3. Rauschen 10%

In [11]:
# Build the static LED data set "led_g1": 100,000 samples, one drift
# attribute, 10 % noise.
# NOTE(review): this samples a single LEDGeneratorDrift instance directly;
# confirm that this alone yields the intended gradual drift (the generator
# may need to be wrapped in a ConceptDriftStream for that).

# Column labels: att_1 ... att_24 followed by the class label.
led_g1_columns = ['att_{}'.format(i) for i in range(1, 25)] + ['class']

led_g1_stream1 = LEDGeneratorDrift(
    random_state=112,
    has_noise=True,
    noise_percentage=0.10,
    n_drift_features=1,
)

# next_sample returns a (features, labels) pair; column_stack glues the
# labels on as the last column.
led_g1_stream = led_g1_stream1.next_sample(100000)
led_g1 = pd.DataFrame(np.column_stack(led_g1_stream),
                      columns=led_g1_columns).astype(int)

led_g1.to_csv('led_g1.csv')
led_g1

Unnamed: 0,att_1,att_2,att_3,att_4,att_5,att_6,att_7,att_8,att_9,att_10,...,att_16,att_17,att_18,att_19,att_20,att_21,att_22,att_23,att_24,class
0,0,1,1,1,0,1,0,0,1,0,...,1,1,0,0,1,1,0,1,1,4
1,0,0,1,0,1,0,0,1,0,0,...,0,1,1,0,0,0,0,1,1,1
2,1,1,1,0,0,1,1,1,1,0,...,1,1,1,1,0,1,0,1,0,0
3,1,0,1,0,0,1,1,1,0,0,...,0,0,1,0,1,1,1,1,1,7
4,1,1,0,0,1,1,1,1,0,1,...,0,1,1,0,0,0,0,1,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,1,1,1,0,1,0,1,0,1,...,0,0,0,0,0,1,1,0,1,4
99996,1,1,1,1,0,1,1,0,1,1,...,1,1,1,0,1,0,1,1,1,9
99997,1,0,1,1,0,1,1,1,0,1,...,1,1,1,0,1,0,1,0,0,3
99998,1,0,1,1,0,0,1,1,0,1,...,1,0,1,1,1,1,0,1,1,2


LED Generator -> led_g2

1. 100.000 Instanzen
2. Gradueller Drift -> 2 Features
3. Rauschen 35%

In [13]:
# Build the static LED data set "led_g2": 100,000 samples, two drift
# attributes, 35 % noise.

# Column labels: att_1 ... att_24 followed by the class label.
led_g2_columns = ['att_{}'.format(i) for i in range(1, 25)] + ['class']

led_g2_stream1 = LEDGeneratorDrift(
    random_state=112,
    has_noise=True,
    noise_percentage=0.35,
    n_drift_features=2,
)

# next_sample returns a (features, labels) pair; column_stack glues the
# labels on as the last column.
led_g2_stream = led_g2_stream1.next_sample(100000)
led_g2 = pd.DataFrame(np.column_stack(led_g2_stream),
                      columns=led_g2_columns).astype(int)

led_g2.to_csv('led_g2.csv')

AGRAWAL Generator -> agr_a

1. 100.000 Instanzen
2. 19 abrupte Drifts (Konzepte wiederholen sich, da die Klassifizierungsfunktionen im Stream erneut auftauchen)

In [5]:
# Build the static AGRAWAL data set "agr_a": 10 segments of 10,000 samples,
# each containing one abrupt drift at its midpoint, written to agr_a.csv.
import warnings

# Silences every warning for the rest of the kernel session (presumably aimed
# at numerical warnings raised while sampling -- TODO confirm).
warnings.filterwarnings('ignore')


def _agr_a_drift_stream(first_function, second_function):
    """Return an AGRAWAL stream with one abrupt drift between two concepts.

    The stream starts with classification function ``first_function`` and
    switches to ``second_function`` at sample 5000 (``width=1``).
    """
    return ConceptDriftStream(
        stream=AGRAWALGenerator(classification_function=first_function,
                                balance_classes=False),
        drift_stream=AGRAWALGenerator(classification_function=second_function,
                                      balance_classes=False),
        position=5000, width=1)


def _agr_a_segment(drift_stream):
    """Restart ``drift_stream`` and return its next 10,000 samples as a frame.

    A ConceptDriftStream keeps counting samples across ``next_sample`` calls.
    Without the restart, a second 10,000-sample segment taken from the same
    stream lies entirely past ``position=5000`` and contains only the second
    concept, i.e. no drift at all.
    """
    drift_stream.restart()
    return pd.DataFrame(np.column_stack(drift_stream.next_sample(10000)))


agr_a_stream1 = _agr_a_drift_stream(0, 2)
agr_a_stream2 = _agr_a_drift_stream(1, 3)
agr_a_stream3 = _agr_a_drift_stream(4, 6)
agr_a_stream4 = _agr_a_drift_stream(5, 7)
agr_a_stream5 = _agr_a_drift_stream(9, 8)

# Every stream is used twice so that its concepts recur later in the data.
agr_a_order = [agr_a_stream1, agr_a_stream2, agr_a_stream3,
               agr_a_stream4, agr_a_stream5] * 2

# pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True yields the continuous 0..N-1 index directly.
agr_a = pd.concat([_agr_a_segment(s) for s in agr_a_order], ignore_index=True)

agr_a.columns = ['salary', 'comission', 'age', 'elevel',
                 'car', 'zipcode', 'hvalue', 'hyears', 'loan', 'class']

# column_stack produces floats throughout; restore the integer columns.
agr_a = agr_a.astype({'age': 'int', 'elevel': 'int',
                      'car': 'int', 'zipcode': 'int',
                      'hyears': 'int', 'class': 'int'})

agr_a.to_csv('agr_a.csv')
agr_a

Unnamed: 0,salary,comission,age,elevel,car,zipcode,hvalue,hyears,loan,class
0,63804.430611,77760.431783,51,0,3,5,384665.623059,5,21673.914747,1
1,45835.178338,35011.133051,58,1,15,4,421058.017135,4,324558.762378,1
2,109897.588393,0.000000,65,1,10,5,564051.820023,26,311201.528479,0
3,84778.081784,0.000000,48,2,1,3,686562.666859,15,171795.820123,1
4,57648.867877,71700.499255,30,1,9,0,871006.331816,28,358888.515726,0
...,...,...,...,...,...,...,...,...,...,...
99995,82373.653194,0.000000,61,1,14,8,78660.974252,19,472346.671674,1
99996,20378.070253,79148.320023,63,3,2,7,202179.102031,16,499760.439841,1
99997,70430.193968,47048.407557,24,4,10,4,425030.358706,20,439757.288052,1
99998,67811.777469,16969.784065,68,2,5,7,260671.175972,29,183325.226788,1


AGRAWAL Generator -> agr_g1

1. 100.000 Instanzen
2. 4 graduelle Drifts (Breite: 500)

In [19]:
# Build the static AGRAWAL data set "agr_g1": 4 segments of 25,000 samples,
# each with one gradual drift (width 500) at sample 15,000, written to
# agr_g1.csv. Consecutive segments start with the concept the previous one
# ended on, so segment boundaries add no further drifts.
import warnings

# Silences every warning for the rest of the kernel session (presumably aimed
# at numerical warnings raised while sampling -- TODO confirm).
warnings.filterwarnings('ignore')


def _agr_g1_drift_stream(first_function, second_function):
    """Return an AGRAWAL stream drifting gradually between two concepts.

    The drift from classification function ``first_function`` to
    ``second_function`` is centred on sample 15000 with ``width=500``.
    """
    return ConceptDriftStream(
        stream=AGRAWALGenerator(classification_function=first_function,
                                balance_classes=False),
        drift_stream=AGRAWALGenerator(classification_function=second_function,
                                      balance_classes=False),
        position=15000, width=500)


agr_g1_stream1 = _agr_g1_drift_stream(2, 4)
agr_g1_stream2 = _agr_g1_drift_stream(4, 6)
agr_g1_stream3 = _agr_g1_drift_stream(6, 3)
agr_g1_stream4 = _agr_g1_drift_stream(3, 1)

# pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True yields the continuous 0..N-1 index directly.
agr_g1 = pd.concat(
    [pd.DataFrame(np.column_stack(s.next_sample(25000)))
     for s in (agr_g1_stream1, agr_g1_stream2, agr_g1_stream3, agr_g1_stream4)],
    ignore_index=True)

agr_g1.columns = ['salary', 'comission', 'age', 'elevel',
                  'car', 'zipcode', 'hvalue', 'hyears', 'loan', 'class']

# column_stack produces floats throughout; restore the integer columns.
agr_g1 = agr_g1.astype({'age': 'int', 'elevel': 'int',
                        'car': 'int', 'zipcode': 'int',
                        'hyears': 'int', 'class': 'int'})

agr_g1.to_csv('agr_g1.csv')
agr_g1

Unnamed: 0,salary,comission,age,elevel,car,zipcode,hvalue,hyears,loan,class
0,145598.879633,0.000000,78,0,3,6,190803.600696,13,70717.834189,1
1,24459.972652,10315.978123,44,1,19,4,403439.225263,6,108558.807674,0
2,141510.637100,0.000000,55,2,14,3,674514.281419,15,80022.808168,0
3,73197.599799,83303.494908,21,3,2,3,692398.714346,22,10101.511648,1
4,37609.060139,32152.461607,58,2,4,1,508581.123681,24,183619.723418,0
...,...,...,...,...,...,...,...,...,...,...
99995,109034.840646,0.000000,71,4,9,4,527637.640055,21,75622.610248,1
99996,118744.506759,0.000000,44,3,10,7,171256.612603,24,367793.634484,0
99997,70830.355136,18501.383745,24,1,17,1,723566.563629,22,255319.890793,0
99998,23933.051872,73542.376349,34,3,1,2,415430.917078,14,343867.081536,1


AGRAWAL Generator -> agr_g2

1. 100.000 Instanzen
2. 2 graduelle Drifts (Breite: 1000)
3. Rauschen 30%

In [4]:
# Build the static AGRAWAL data set "agr_g2": 2 segments of 50,000 samples,
# each with one gradual drift (width 1000) at sample 25,000 and
# perturbation=0.3 on both generators, written to agr_g2.csv.
import warnings

# Silences every warning for the rest of the kernel session (presumably aimed
# at numerical warnings raised while sampling -- TODO confirm).
warnings.filterwarnings('ignore')


def _agr_g2_drift_stream(first_function, second_function):
    """Return a perturbed AGRAWAL stream drifting gradually between two concepts.

    The drift from classification function ``first_function`` to
    ``second_function`` is centred on sample 25000 with ``width=1000``.
    """
    return ConceptDriftStream(
        stream=AGRAWALGenerator(classification_function=first_function,
                                balance_classes=False, perturbation=0.3),
        drift_stream=AGRAWALGenerator(classification_function=second_function,
                                      balance_classes=False, perturbation=0.3),
        position=25000, width=1000)


agr_g2_stream1 = _agr_g2_drift_stream(2, 4)
agr_g2_stream2 = _agr_g2_drift_stream(4, 6)

# pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True yields the continuous 0..N-1 index directly.
agr_g2 = pd.concat(
    [pd.DataFrame(np.column_stack(s.next_sample(50000)))
     for s in (agr_g2_stream1, agr_g2_stream2)],
    ignore_index=True)

agr_g2.columns = ['salary', 'comission', 'age', 'elevel',
                  'car', 'zipcode', 'hvalue', 'hyears', 'loan', 'class']

# column_stack produces floats throughout; restore the integer columns.
agr_g2 = agr_g2.astype({'age': 'int', 'elevel': 'int',
                        'car': 'int', 'zipcode': 'int',
                        'hyears': 'int', 'class': 'int'})

agr_g2.to_csv('agr_g2.csv')
agr_g2

Unnamed: 0,salary,comission,age,elevel,car,zipcode,hvalue,hyears,loan,class
0,86060.986557,70349.701959,25,4,1,5,0.0,1,203736.547642,1
1,150000.000000,0.000000,20,0,2,8,0.0,20,80527.357668,0
2,104278.989321,0.000000,71,1,1,2,700000.0,5,470816.110624,1
3,68198.057555,47294.809882,57,0,3,0,900000.0,3,313741.390082,1
4,47310.473873,49298.596174,20,3,3,8,100000.0,7,219538.795433,1
...,...,...,...,...,...,...,...,...,...,...
99995,78413.104866,0.000000,24,2,12,5,400000.0,30,204041.607165,1
99996,29955.476332,75000.000000,59,3,7,2,0.0,1,486346.812637,1
99997,115081.708684,0.000000,54,0,15,8,100000.0,6,427222.632409,1
99998,89902.187355,0.000000,67,3,4,2,0.0,25,193497.242189,1


SEA Generator -> sea_a

1. 100.000 Instanzen
2. 19 abrupte Drifts (Konzepte wiederholen sich, da die Klassifizierungsfunktionen im Stream erneut auftauchen)

In [18]:
# Build the static SEA data set "sea_a": 10 segments of 10,000 samples, each
# containing one abrupt drift at its midpoint, written to sea_a.csv.
import warnings

# Silences every warning for the rest of the kernel session (presumably aimed
# at numerical warnings raised while sampling -- TODO confirm).
warnings.filterwarnings('ignore')


def _sea_a_drift_stream(first_function, second_function):
    """Return a SEA stream with one abrupt drift between two concepts.

    The stream starts with classification function ``first_function`` and
    switches to ``second_function`` at sample 5000 (``width=1``).
    """
    return ConceptDriftStream(
        stream=SEAGenerator(classification_function=first_function,
                            balance_classes=False),
        drift_stream=SEAGenerator(classification_function=second_function,
                                  balance_classes=False),
        position=5000, width=1)


def _sea_a_segment(drift_stream):
    """Restart ``drift_stream`` and return its next 10,000 samples as a frame.

    A ConceptDriftStream keeps counting samples across ``next_sample`` calls.
    Without the restart, every segment after the first one taken from the
    same stream lies entirely past ``position=5000`` and contains only the
    second concept, i.e. no drift at all.
    """
    drift_stream.restart()
    return pd.DataFrame(np.column_stack(drift_stream.next_sample(10000)))


sea_a_stream1 = _sea_a_drift_stream(1, 3)
sea_a_stream2 = _sea_a_drift_stream(2, 0)

# The two streams alternate five times so that their concepts recur.
sea_a_order = [sea_a_stream1, sea_a_stream2] * 5

# pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True yields the continuous 0..N-1 index directly.
sea_a = pd.concat([_sea_a_segment(s) for s in sea_a_order], ignore_index=True)

sea_a.columns = ['att_num_0', 'att_num_1', 'att_num_2', 'target_0']

# column_stack produces floats throughout; the label column is integral.
sea_a = sea_a.astype({'target_0': 'int'})

sea_a.to_csv('sea_a.csv')
sea_a

Unnamed: 0,att_num_0,att_num_1,att_num_2,target_0
0,0.046890,7.408816,4.946948,0
1,9.353653,9.776648,7.871109,1
2,0.870217,3.882321,7.647933,0
3,8.801614,3.754183,9.190277,1
4,7.639922,1.444004,6.673986,1
...,...,...,...,...
99995,1.592368,8.358116,3.306780,1
99996,1.106689,8.331479,1.292364,1
99997,5.052210,4.086616,5.253367,1
99998,3.795950,3.208476,8.143784,0


SEA Generator -> sea_g1

1. 100.000 Instanzen
2. 2 graduelle Drifts (Breite: 2000)

In [2]:
# Build the static SEA data set "sea_g1": 2 segments of 50,000 samples, each
# with one gradual drift (width 2000) at sample 25,000, written to
# sea_g1.csv. The second segment starts with the concept the first one ended
# on, so the segment boundary adds no further drift.
import warnings

# Silences every warning for the rest of the kernel session (presumably aimed
# at numerical warnings raised while sampling -- TODO confirm).
warnings.filterwarnings('ignore')


def _sea_g1_drift_stream(first_function, second_function):
    """Return a SEA stream drifting gradually between two concepts.

    The drift from classification function ``first_function`` to
    ``second_function`` is centred on sample 25000 with ``width=2000``.
    """
    return ConceptDriftStream(
        stream=SEAGenerator(classification_function=first_function,
                            balance_classes=False),
        drift_stream=SEAGenerator(classification_function=second_function,
                                  balance_classes=False),
        position=25000, width=2000)


sea_g1_stream1 = _sea_g1_drift_stream(1, 3)
sea_g1_stream2 = _sea_g1_drift_stream(3, 0)

# pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True yields the continuous 0..N-1 index directly.
sea_g1 = pd.concat(
    [pd.DataFrame(np.column_stack(s.next_sample(50000)))
     for s in (sea_g1_stream1, sea_g1_stream2)],
    ignore_index=True)

sea_g1.columns = ['att_num_0', 'att_num_1', 'att_num_2', 'target_0']

# column_stack produces floats throughout; the label column is integral.
sea_g1 = sea_g1.astype({'target_0': 'int'})

sea_g1.to_csv('sea_g1.csv')
sea_g1

Unnamed: 0,att_num_0,att_num_1,att_num_2,target_0
0,7.678673,9.687995,3.117225,1
1,4.316736,6.388305,5.844619,1
2,3.561456,4.686825,7.026918,0
3,6.733057,8.902173,6.811822,1
4,6.055232,0.245388,5.654791,0
...,...,...,...,...
99995,7.633954,1.386940,3.206358,1
99996,5.614831,5.881474,1.122303,1
99997,4.457994,7.752017,7.605029,1
99998,6.234482,3.926466,4.962671,1


SEA Generator -> sea_g2

1. 100.000 Instanzen
2. 2 graduelle Drifts (Breite: 1000)
3. Rauschen 25%

In [3]:
# Build the static SEA data set "sea_g2": 2 segments of 50,000 samples, each
# with one gradual drift (width 1000) at sample 25,000 and
# noise_percentage=0.25 on both generators, written to sea_g2.csv.
import warnings

# Silences every warning for the rest of the kernel session (presumably aimed
# at numerical warnings raised while sampling -- TODO confirm).
warnings.filterwarnings('ignore')


def _sea_g2_drift_stream(first_function, second_function):
    """Return a noisy SEA stream drifting gradually between two concepts.

    The drift from classification function ``first_function`` to
    ``second_function`` is centred on sample 25000 with ``width=1000``.
    """
    return ConceptDriftStream(
        stream=SEAGenerator(classification_function=first_function,
                            balance_classes=False, noise_percentage=0.25),
        drift_stream=SEAGenerator(classification_function=second_function,
                                  balance_classes=False, noise_percentage=0.25),
        position=25000, width=1000)


sea_g2_stream1 = _sea_g2_drift_stream(1, 3)
sea_g2_stream2 = _sea_g2_drift_stream(3, 0)

# pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True yields the continuous 0..N-1 index directly.
sea_g2 = pd.concat(
    [pd.DataFrame(np.column_stack(s.next_sample(50000)))
     for s in (sea_g2_stream1, sea_g2_stream2)],
    ignore_index=True)

sea_g2.columns = ['att_num_0', 'att_num_1', 'att_num_2', 'target_0']

# column_stack produces floats throughout; the label column is integral.
sea_g2 = sea_g2.astype({'target_0': 'int'})

sea_g2.to_csv('sea_g2.csv')
sea_g2

Unnamed: 0,att_num_0,att_num_1,att_num_2,target_0
0,5.547272,1.186118,1.294934,0
1,0.445940,6.795033,2.108909,0
2,5.235536,2.802237,1.180871,1
3,1.321094,6.990774,9.458238,0
4,5.172757,1.487323,1.739648,0
...,...,...,...,...
99995,3.426874,6.332102,5.872836,0
99996,1.645236,2.429045,4.632333,1
99997,3.679096,0.975563,1.198596,1
99998,8.952168,0.435542,9.841004,1
