In [1]:
# Use this to check if java_home is correctly set
import subprocess
subprocess.call("ECHO $JAVA_HOME", shell=True)
# If JAVA_HOME is not set, then jpype will fail. 

/Users/gomeshe/Library/Java/JavaVirtualMachines/openjdk-20.0.1/Contents/Home


0

In [2]:
import jpype

import jpype.imports
from jpype.types import *

# Starts the JVM
jpype.startJVM()

In [3]:
# Add the moa jar to the class path
jpype.addClassPath('/Users/gomeshe/Dropbox/ciencia_computacao/dev/Using-MOA-API/moa.jar')

## Basic classification using ARF

In [4]:
%%time
import pandas as pd

from moa.classifiers.meta import AdaptiveRandomForest
from moa.core import Example
from moa.evaluation import BasicClassificationPerformanceEvaluator
from moa.streams.generators import RandomTreeGenerator

maxInstancesToProcess = 1000
instancesProcessed = 1
sampleFrequency = 100

learner = AdaptiveRandomForest()
# Setting parameters using setViaCLIString
learner.getOptions().setViaCLIString("-s 10")
# Setting parameters using the option attribute directly
learner.setRandomSeed(5)
learner.prepareForUse()

rtg = RandomTreeGenerator()
# Setting parameters using setViaCLIString
rtg.getOptions().setViaCLIString("-c 3 -u 10 -o 0")
rtg.prepareForUse()

# Setting parameters using the option attribute directly
evaluator = BasicClassificationPerformanceEvaluator()
evaluator.recallPerClassOption.set()
evaluator.prepareForUse()

learner.setModelContext(rtg.getHeader())

# Create empty lists to store the data
data = []
performance_names = []
performance_values = []

while rtg.hasMoreInstances() and instancesProcessed <= maxInstancesToProcess:
    trainInst = rtg.nextInstance()
    testInst = trainInst

    prediction = learner.getVotesForInstance(testInst)

    evaluator.addResult(testInst, prediction)
    learner.trainOnInstance(trainInst)

    if instancesProcessed == 1:
        performance_measurements = evaluator.getPerformanceMeasurements()
        performance_names = ["".join(measurement.getName()) for measurement in performance_measurements]

    if instancesProcessed % sampleFrequency == 0:
        performance_values = [measurement.getValue() for measurement in evaluator.getPerformanceMeasurements()]
        data.append(performance_values)
    
    instancesProcessed += 1

# Create a DataFrame using collected data
results_df = pd.DataFrame(data, columns=performance_names)

# Print the DataFrame
results_df

CPU times: user 3.33 s, sys: 1.92 s, total: 5.25 s
Wall time: 1.13 s


Unnamed: 0,classified instances,classifications correct (percent),Kappa Statistic (percent),Kappa Temporal Statistic (percent),Kappa M Statistic (percent),Recall for class 0 (percent),Recall for class 1 (percent),Recall for class 2 (percent)
0,100.0,75.0,51.399689,56.896552,34.210526,87.5,70.27027,0.0
1,200.0,76.5,55.126981,58.40708,45.348837,88.888889,73.684211,6.25
2,300.0,77.666667,58.234634,62.146893,49.242424,87.654321,79.279279,11.111111
3,400.0,79.25,61.918745,64.529915,55.135135,88.516746,83.006536,13.157895
4,500.0,80.2,64.225057,66.889632,57.51073,87.739464,84.736842,22.44898
5,600.0,80.833333,65.742713,68.144044,59.363958,88.102894,86.283186,25.396825
6,700.0,81.285714,66.78908,68.357488,61.127596,88.235294,87.640449,26.315789
7,800.0,81.5,67.593518,68.240343,61.558442,88.508557,88.135593,31.25
8,900.0,81.888889,68.292324,68.471954,62.268519,88.095238,89.425982,31.775701
9,1000.0,82.5,69.382685,69.298246,63.389121,88.75969,89.010989,35.833333


## Function to encapsulate test_train_loop

In [5]:
import pandas as pd

## Function to abstract the test and train loop
def test_train_loop(stream, learner, evaluator, maxInstances=1000, sampleFrequency=100):
    instancesProcessed = 1
    
    learner.setModelContext(stream.getHeader())
    
    data = []
    performance_names = []
    performance_values = []
    
    while stream.hasMoreInstances() and instancesProcessed <= maxInstances:
        trainInst = stream.nextInstance()
        testInst = trainInst
    
        prediction = learner.getVotesForInstance(testInst)
    
        evaluator.addResult(testInst, prediction)
        learner.trainOnInstance(trainInst)
    
        if instancesProcessed == 1:
            performance_measurements = evaluator.getPerformanceMeasurements()
            performance_names = ["".join(measurement.getName()) for measurement in performance_measurements]
    
        if instancesProcessed % sampleFrequency == 0:
            performance_values = [measurement.getValue() for measurement in evaluator.getPerformanceMeasurements()]
            data.append(performance_values)
        
        instancesProcessed += 1
    
    return pd.DataFrame(data, columns=performance_names)

# Experiments using MOA

## MOA ARF10

In [6]:
%%time

from moa.classifiers.meta import AdaptiveRandomForest
from moa.core import Example
from moa.evaluation import BasicClassificationPerformanceEvaluator
from moa.streams import ArffFileStream

arf10 = AdaptiveRandomForest()
arf10.getOptions().setViaCLIString("-s 10")
arf10.setRandomSeed(1)
arf10.prepareForUse()

## Example reading from an arff file
rtg_2abrupt = ArffFileStream("/Users/gomeshe/Desktop/data/RTG_2abrupt.arff", -1)
rtg_2abrupt.prepareForUse()

evaluator = BasicClassificationPerformanceEvaluator()
evaluator.recallPerClassOption.set()
evaluator.prepareForUse()

test_train_loop(rtg_2abrupt, arf10, evaluator, maxInstances=100000, sampleFrequency=1000)

CPU times: user 46.8 s, sys: 357 ms, total: 47.1 s
Wall time: 45 s


Unnamed: 0,classified instances,classifications correct (percent),Kappa Statistic (percent),Kappa Temporal Statistic (percent),Kappa M Statistic (percent),Recall for class 0 (percent),Recall for class 1 (percent),Recall for class 2 (percent),Recall for class 3 (percent),Recall for class 4 (percent)
0,1000.0,89.800000,53.123707,61.797753,37.423313,98.805257,43.558282,,,
1,2000.0,90.400000,58.706036,66.017699,43.026706,98.556825,50.148368,,,
2,3000.0,90.500000,60.598715,66.470588,45.297505,98.507463,52.399232,,,
3,4000.0,90.750000,61.863907,67.081851,46.298984,98.338870,54.281567,,,
4,5000.0,91.000000,62.924343,67.811159,47.429907,98.334942,55.490654,,,
...,...,...,...,...,...,...,...,...,...,...
95,96000.0,83.923958,74.948497,69.319921,68.486717,96.429711,71.908980,59.043928,14.564831,89.870517
96,97000.0,83.809278,74.746905,69.173242,68.236793,96.439987,71.804000,58.749069,14.620730,89.712358
97,98000.0,83.718367,74.578505,69.067928,68.023407,96.465770,71.732947,58.511980,14.578005,89.524733
98,99000.0,83.627273,74.403858,68.943516,67.800954,96.489930,71.634394,58.273531,14.432990,89.375391


CPU times: user 48.2 s, sys: 193 ms, total: 48.4 s
Wall time: 47.4 s

83.527000

## ARF100 -j 4

In [7]:
%%time

from moa.classifiers.meta import AdaptiveRandomForest
from moa.core import Example
from moa.evaluation import BasicClassificationPerformanceEvaluator
from moa.streams import ArffFileStream

arf100 = AdaptiveRandomForest()
arf100.getOptions().setViaCLIString("-s 100 -j 4")
arf100.setRandomSeed(1)
arf100.prepareForUse()

rtg_2abrupt = ArffFileStream("/Users/gomeshe/Desktop/data/RTG_2abrupt.arff", -1)
rtg_2abrupt.prepareForUse()

evaluator = BasicClassificationPerformanceEvaluator()
evaluator.recallPerClassOption.set()
evaluator.prepareForUse()

test_train_loop(rtg_2abrupt, arf100, evaluator, maxInstances=100000, sampleFrequency=1000)

CPU times: user 7min 45s, sys: 10.3 s, total: 7min 56s
Wall time: 3min 3s


Unnamed: 0,classified instances,classifications correct (percent),Kappa Statistic (percent),Kappa Temporal Statistic (percent),Kappa M Statistic (percent),Recall for class 0 (percent),Recall for class 1 (percent),Recall for class 2 (percent),Recall for class 3 (percent),Recall for class 4 (percent)
0,1000.0,88.900000,48.185077,58.426966,31.901840,98.566308,39.263804,,,
1,2000.0,89.600000,54.086419,63.185841,38.278932,98.616957,45.103858,,,
2,3000.0,89.466667,55.139083,62.823529,39.347409,98.467124,46.641075,,,
3,4000.0,89.875000,56.827170,63.967972,41.219158,98.550287,48.185776,,,
4,5000.0,90.120000,57.742296,64.663805,42.289720,98.624517,48.948598,,,
...,...,...,...,...,...,...,...,...,...,...
95,96000.0,83.966667,74.936690,69.401427,68.570437,96.937929,70.651886,58.936262,16.074600,89.859058
96,97000.0,83.862887,74.750844,69.275311,68.341963,96.961477,70.542352,58.642698,16.184134,89.718076
97,98000.0,83.781633,74.598578,69.188120,68.147658,96.983431,70.496975,58.448928,16.197783,89.530439
98,99000.0,83.696970,74.434459,69.075720,67.938021,97.003699,70.428652,58.169527,16.160490,89.392473
