In [19]:
import joblib
from iBCM_Python import *
from sklearn.ensemble import RandomForestClassifier as RF
import pandas as pd
import joblib
import numpy as np
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.naive_bayes import MultinomialNB as NB
from sklearn.tree import DecisionTreeClassifier as DT

from run_iBCM import iBCM, iBCM_verify

In [21]:
def run_iBCM(dataset, support, no_folds=1, no_win=1): 
    reduce_feature_space = True
    name_result_file = 'results_iBCM_Python.csv'
    classifiers = [('Random forest', RF(n_estimators=100))]
    np.random.seed(42)
    write_results = True

    # Read files
    trace_file = open('./datasets/' + dataset + '.dat', 'r')
    label_file = open('./datasets/' + dataset + '.lab', 'r')

    # Store information and create folds
    traces = []
    label_list = []
    for trace, label in zip(trace_file, label_file):
        traces.append(trace)
        label_list.append(label.replace('\n', ''))

    label_set = set(label_list)
    no_labels = len(label_set)
    print('#labels:', no_labels)

    class_counts = {}
    for label in label_set:
        class_counts[label] = label_list.count(label)

    min_class_count = min(class_counts.values())
    if no_folds > min_class_count:
        no_folds = min_class_count
        print('Adjusted n_folds to:', no_folds)

    skf = StratifiedKFold(n_splits=no_folds)
    
    
    ##########################
    ## Apply iBCM on all folds
    fold_train_results = []
    fold_test_results = []
    
    acc_sum = 0
    feat_sum = 0
    auc_sum = 0
    print(f'{traces}, {label_list}')
    for fold, (train_index, test_index) in enumerate(skf.split(traces, label_list)):
        print('\nFold ', (fold+1), '/', no_folds)
        training_points = []
        test_points = []
        training_labels = []
        test_labels = []
    
        for i in train_index:
            training_points.append(traces[i])
            training_labels.append(label_list[i])
    
        filename_train = dataset + '_training_fold_' + str(fold) + '_support_' 
        filename_train += f'{support}.csv'
        print(filename_train)
        final_constraints = iBCM(filename_train, training_points, training_labels, reduce_feature_space, support, no_win)       
        
        # Label training data
        iBCM_verify(filename_train, training_points, training_labels, final_constraints, no_win)
        fold_train_results.append(pd.read_csv(filename_train))
            
        filename_test = dataset + '_test_fold_' + str(fold) + '_support_' 
        filename_test += f'{support}.csv'
        
        for i in test_index:
            test_points.append(traces[i])
            test_labels.append(label_list[i])
        
        # Label test data
        iBCM_verify(filename_test, test_points, test_labels, final_constraints, no_win)
        fold_test_results.append(pd.read_csv(filename_test))
        
        os.remove(filename_train)
        os.remove(filename_test)
         

    #######################
    ## Apply classification       
        
    if write_results:
        write_header = False
        
        if not os.path.exists(name_result_file):
            write_header = True
        results = open(name_result_file,'a')
        
        if write_header:
            results.write('dataset,support,classifier,no_folds,reduce,no_features,accuracy,auc\n')
    
                
    for name, classifier in classifiers: 
        print(f'Classifier ', name)
        
        acc_sum = 0
        feat_sum = 0
        auc_sum = 0
        for i in range(0, no_folds):
    #                print('Fold:',i)
            training = fold_train_results[i]
            test = fold_test_results[i]
                    
            y_train = training['label']
            X_train = training.drop(['label'], axis=1)                 
                                
            y_test = test['label']
            X_test = test.drop(['label'], axis=1)
            
            # print('#Features:', len(X_train.columns))
            if len(X_train.columns) < 2:
                print('No features for fold', i)
                continue
                        
            feat_sum += len(X_train.columns)

            classifier.fit(X_train,y_train)
            predictions = classifier.predict(X_test)
            predictions_prob = classifier.predict_proba(X_test)
            
            
            acc = accuracy_score(y_test,predictions)
            if len(y_test.unique()) > 2:
                auc = roc_auc_score(y_test,predictions_prob,multi_class='ovo')
            else:
                auc = roc_auc_score(y_test,predictions_prob[:,1])
            acc_sum += acc
            auc_sum += auc
            filename = f'models/{name}_{dataset}_.joblib' 
            print(classifier)
            joblib.dump(classifier, filename)
        
        avg_feat = feat_sum/no_folds
        avg_acc = acc_sum/no_folds
        avg_auc = auc_sum/no_folds
        if write_results:
            results = open(name_result_file,'a')
            results.write(f'{dataset},{support},{name},{no_folds},{reduce_feature_space},{avg_feat},{avg_acc},{avg_auc}\n')
            results.close()
        print('Avg. acc.:', avg_acc)
        print('Avg. AUC.:', avg_auc)
        print('Avg. #features.:', avg_feat)                   

In [None]:
import pandas as pd

def convert_csv_to_spmf(input_file, output_file, format_type):
    # Load CSV file
    df = pd.read_csv(input_file, header=None)

    # Convert to .dat format
    if format_type == 'dat':
        with open(output_file, 'w') as f:
            for index, row in df.iterrows():
                items = row.dropna().astype(str).tolist()
                transaction = ' '.join(items)
                f.write(transaction + '\n')
        print("CSV file converted to .dat format successfully!")

    # Convert to .lab format
    elif format_type == 'lab':
        with open(output_file, 'w') as f:
            for index, row in df.iterrows():
                sequence = row.dropna().astype(str).tolist()
                labels = ' '.join(sequence)
                f.write(labels + '\n')
        print("CSV file converted to .lab format successfully!")

    else:
        print("Invalid format type. Please choose 'dat' or 'lab'.")

# Usage example:
data=convert_csv_to_spmf('sampledata/input.csv', 'output.dat', 'dat')




CSV file converted to .dat format successfully!


In [23]:
no_folds = 3
no_win = 1
reduce_feature_space = True
name_result_file = 'results_iBCM_Python.csv'
classifiers = [('Random forest',RF(n_estimators=100))]
np.random.seed(42)
write_results = True
support = .1
dataset = 'alice'
run_iBCM(dataset, support, no_folds, no_win)


#labels: 1638
Adjusted n_folds to: 1


ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=1.

In [24]:
from sklearn.ensemble import RandomForestClassifier as RF
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import os
from joblib import dump as joblib_dump

def iBCM(filename, training_points, training_labels, reduce_feature_space, support, no_win):
    # Add the implementation of the iBCM function here
    pass

def iBCM_verify(filename, test_points, test_labels, final_constraints, no_win):
    # Add the implementation of the iBCM_verify function here
    pass

def run_iBCM(dataset, support, no_folds=1, no_win=1):
    reduce_feature_space = True
    name_result_file = 'results_iBCM_Python.csv'
    classifiers = [('Random forest', RF(n_estimators=100))]
    np.random.seed(42)
    write_results = True

    # Read files
    trace_file = open('./datasets/' + dataset + '.dat', 'r')
    label_file = open('./datasets/' + dataset + '.lab', 'r')

    # Store information and create folds
    traces = []
    label_list = []
    for trace, label in zip(trace_file, label_file):
        traces.append(trace)
        label_list.append(label.replace('\n', ''))

    label_set = set(label_list)
    no_labels = len(label_set)
    print('#labels:', no_labels)

    class_counts = {}
    for label in label_set:
        class_counts[label] = label_list.count(label)

    min_class_count = min(class_counts.values())
    if no_folds > min_class_count:
        no_folds = min_class_count
        print('Adjusted n_folds to:', no_folds)

    skf = StratifiedKFold(n_splits=no_folds)

    # Rest of the code remains the same

# Define the variables
no_folds = 2
no_win = 1
reduce_feature_space = True
name_result_file = 'results_iBCM_Python.csv'
classifiers = [('Random forest', RF(n_estimators=100))]
np.random.seed(42)
write_results = True
support = 0.1
dataset = 'alice'

# Call the run_iBCM function
run_iBCM(dataset, support, no_folds, no_win)


#labels: 1638
Adjusted n_folds to: 1


ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=1.