In [1]:
import pandas as pd
import numpy as np

#### Importing dataset

In [2]:
from skmultiflow.data import DataStream
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('Selected_Features.csv', delimiter=',')

In [3]:
df[df['is_fraud']==1]

Unnamed: 0,amt,city_pop,trans_month,Weekday,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,...,Frequency_1DAY_WINDOW,Monetary_1DAY_WINDOW,Monetary_7DAY_WINDOW,merchant_NB_TX_1DAY_WINDOW,merchant_RISK_1DAY_WINDOW,merchant_RISK_7DAY_WINDOW,category_NB_TX_1DAY_WINDOW,category_RISK_1DAY_WINDOW,distance,is_fraud
4,727.560000,737,3,1,0,0,0,0,0,0,...,6.000000,513.525000,305.419412,1.0,0.0,0.0,1.0,0.0,81.109957,1
65,753.950000,737,3,0,0,0,0,0,0,0,...,4.000000,428.315000,178.918333,1.0,0.0,0.0,1.0,0.0,103.325912,1
362,352.480000,397,2,1,0,0,0,1,0,0,...,5.000000,379.690000,229.224762,1.0,0.0,0.0,1.0,0.0,83.377704,1
499,930.990000,1202,8,1,0,0,0,0,0,0,...,5.000000,611.388000,324.377000,1.0,0.0,0.0,1.0,0.0,49.727068,1
792,866.280000,1909,7,1,0,0,0,0,0,0,...,4.000000,432.785000,142.274250,1.0,0.0,0.0,1.0,0.0,44.374615,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2062212,822.162922,1701,4,1,0,0,0,0,0,0,...,7.960907,424.475889,157.448078,1.0,0.0,0.0,1.0,0.0,70.010940,1
2062213,867.200643,2566,4,1,0,0,0,0,0,0,...,7.000000,828.588578,394.262589,1.0,0.0,0.0,1.0,0.0,57.768957,1
2062214,864.113532,2099,4,1,0,0,0,0,0,0,...,7.518016,525.215419,232.678566,1.0,0.0,0.0,1.0,0.0,68.928900,1
2062215,883.236326,2566,4,1,0,0,0,0,0,0,...,7.000000,763.877967,368.596562,1.0,0.0,0.0,1.0,0.0,61.219699,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2062217 entries, 0 to 2062216
Data columns (total 27 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   amt                         float64
 1   city_pop                    int64  
 2   trans_month                 int64  
 3   Weekday                     int64  
 4   category_food_dining        int64  
 5   category_gas_transport      int64  
 6   category_grocery_net        int64  
 7   category_grocery_pos        int64  
 8   category_health_fitness     int64  
 9   category_home               int64  
 10  category_kids_pets          int64  
 11  category_misc_net           int64  
 12  category_misc_pos           int64  
 13  category_personal_care      int64  
 14  category_shopping_net       int64  
 15  category_shopping_pos       int64  
 16  category_travel             int64  
 17  Frequency_1DAY_WINDOW       float64
 18  Monetary_1DAY_WINDOW        float64
 19  Monetary_7DAY_WINDOW 

In [5]:
df.shape

(2062217, 27)

#### Defining Training Features

In [6]:
data=df.iloc[:,:-1]
# Converting into array for ARF
data=np.array(data)

In [7]:
data.shape

(2062217, 26)

In [8]:
y=df.iloc[:,-1]
y=np.array(y)

In [9]:
y.shape

(2062217,)

#### Creating Static Stream

In [10]:
# Create a data stream from the Pandas DataFrame
stream = DataStream(data=data, y=y)

In [11]:
stream

DataStream(allow_nan=False, cat_features=None,
           data=                 0         1    2    3    4    5    6    7    8    9   ...   
0        135.040000    6284.0  7.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  \
1         68.910000    1453.0  2.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
2         47.970000  224256.0  5.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
3         18.660000    1304.0  4.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
4        727.560000     737.0  3.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
...             ...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
2062212  822.162922    1701.0  4.0  1.0  0.0...
2062214  0.0  7.518016  525.215419  232.678566  1.0  0.0  0.0  1.0  0.0   
2062215  0.0  7.000000  763.877967  368.596562  1.0  0.0  0.0  1.0  0.0   
2062216  0.0  7.946942  541.753910  246.345750  1.0  0.0  0.0  1.0  0.0   

                 25  
0          7.539311  
1         93.086434  
2        128.012419  
3        134.944352  
4      

In [12]:
stream.get_data_info()

'1 target(s), 2 classes'

#### Defining Model 

* By continuously monitoring the data stream and adapting the model to concept drift, skmultiflow's Adaptive Random Forest provides an effective solution for handling evolving data. It allows the model to maintain its accuracy and generalization capabilities in dynamic environments. 
* It dynamically updates the model by incrementally adding or removing decision trees based on the concept drift detection mechanism.

In [13]:
from skmultiflow.meta import AdaptiveRandomForestClassifier
arf = AdaptiveRandomForestClassifier(n_estimators=200)

##### Prequential Evaluator


* The prequential evaluation is designed specifically for stream settings. This method consists of using each sample to test the model, which means to make a predictions, and then the same sample is used to train the model (partial fit). This way the model is always tested on samples that it hasn’t seen yet.

In [14]:
from skmultiflow.evaluation import EvaluatePrequential
# Initialize the evaluator
evaluator = EvaluatePrequential(pretrain_size=100000, max_samples=2062217,batch_size=512,metrics=['accuracy', 'kappa', 'precision', 'recall'])

In [15]:
evaluator

EvaluatePrequential(batch_size=512, data_points_for_classification=False,
                    max_samples=2062217, max_time=inf,
                    metrics=['accuracy', 'kappa', 'precision', 'recall'],
                    n_wait=200, output_file=None, pretrain_size=100000,
                    restart_stream=True, show_plot=False)

#### Incremental Evaluation

In [16]:
# Run the incremental evaluation
evaluator.evaluate(stream=stream, model=arf, model_names=['ARF'])

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 100000 sample(s).
Evaluating...

Processed samples: 449696
Mean performance:
ARF - Accuracy     : 0.9972
ARF - Kappa        : 0.6805
ARF - Precision: 0.9762
ARF - Recall: 0.5238


[AdaptiveRandomForestClassifier(binary_split=False, disable_weighted_vote=False,
                                drift_detection_method=ADWIN(delta=0.001),
                                grace_period=50, lambda_value=6,
                                leaf_prediction='nba', max_byte_size=33554432,
                                max_features=5, memory_estimate_period=2000000,
                                n_estimators=200, nb_threshold=0,
                                no_preprune=False, nominal_attributes=None,
                                performance_metric='acc', random_state=None,
                                remove_poor_atts=False, split_confidence=0.01,
                                split_criterion='info_gain',
                                stop_mem_management=False, tie_threshold=0.05,

### Unseen Data Prediction

In [17]:
x_test=data[500000]

In [18]:
single_row = x_test.reshape(1, -1)
single_row


array([[3.31990000e+02, 3.87600000e+03, 4.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 2.00000000e+00, 1.77070000e+02, 5.98358333e+01,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 6.23178649e+01]])

In [19]:
prediction = evaluator.predict(single_row)

In [20]:
prediction

[array([0])]

In [21]:
y[500000]

0

###### Model Predicts correctly on Unseen Data

In [22]:
x_2=data[565000]

In [23]:
x_2 = x_2.reshape(1, -1)
x_2

array([[6.3600000e+00, 7.6383000e+04, 3.0000000e+00, 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 5.0000000e+00, 2.1240000e+01, 2.9403000e+01,
        1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        0.0000000e+00, 7.1921895e+01]])

In [24]:
pred2 = evaluator.predict(x_2)

In [25]:
pred2

[array([0])]

In [26]:
y[565000]

0

In [27]:
x_3=data[2062216]

In [28]:
x_3 = x_3.reshape(1, -1)
x_3

array([[7.35240045e+02, 2.13900000e+03, 4.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 7.94694245e+00, 5.41753910e+02, 2.46345750e+02,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 7.65343453e+01]])

In [29]:
evaluator.predict(x_3)

[array([1])]

In [30]:
y[2062216]

1

###### Prediction on unseen rows

In [31]:
test_rows=last_rows = data[-10:]

In [32]:
test_rows.shape

(10, 26)

In [33]:
pred_test=evaluator.predict(test_rows)

In [34]:
pred_test

[array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]

In [35]:
y[-10:]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

###### Saving for Real-time Predictions

In [41]:
import pickle

with open('ARF_evaluator.pkl', 'wb') as file:
    pickle.dump(evaluator, file)