# CRAFTML

The CRAFTML Rust implementation can be found here: https://github.com/tomtung/craftml-rs

And the paper can be found here: http://proceedings.mlr.press/v80/siblini18a/siblini18a.pdf

To run the code:

1. Install Rust from the website here: https://www.rust-lang.org/
2. Then, clone the GitHub repository into a local directory adjacent to this one (so that they both have the same parent folder).
3. Follow the instructions in that repository to build the Rust program.
4. Download the Eurlex 4k dataset from the Extreme Multilabel repository (or use the Eurlex data folder in our project's GitHub repository).
 1.  If you downloaded it from the Extreme Multilabel repository, make sure to run the cross validation code (the second code cell) to build the cv data sets. Otherwise, skip that code cell.
5. Then, you should be able to run the rest of the code just fine.



In [1]:
import os
import pandas as pd
import numpy as np
np.random.seed(2020)
import pickle
from itertools import cycle, product
from sklearn.metrics import label_ranking_average_precision_score
import time

# Dataset dimensions for the Eurlex data used throughout the notebook.
num_features = 5000
num_labels = 3993
num_samples_val = 3809
num_samples_train = 15539

# Draw a fold id in {0, ..., 4} for every training sample (5-fold cross validation).
cv_idx = np.random.randint(0, 5, size=num_samples_train)
# Number of training samples assigned to each fold, ordered by fold id.
cv_counts = np.unique(cv_idx, return_counts=True)[-1]
# Filled in by the grid search: hyper-parameter key -> mean LRAP over the folds.
results = {}

The code block below sets up the cross validation folds in the format that the craftml implementation needs.
Each input file should be of the form:
```
num_examples num_total_features num_total_labels
label1,label2,... feature1:score1 feature2:score2 ...
```

This is what the txt file looks like if you download it from the Extreme Classification Repository: http://manikvarma.org/downloads/XC/XMLRepository.html

All of the cross validation data sets are saved in the Eurlex folder (along with the train and dev sets).

In [None]:
# Builds the ten cross-validation files Eurlex/cv_<fold>_train and Eurlex/cv_<fold>_val
# (fold = 0..4) from Eurlex/eurlex_train.txt.
# The cell is kept commented out: un-comment and run it only when those files still have
# to be generated (the notes at the top of the notebook say when that is needed).
# It relies on cv_idx / cv_counts from the first code cell: data line i of the training
# file (1-based, after the header) goes to the val file of fold cv_idx[i-1] and to the
# train file of each of the other four folds.
# # write first line to each cv val and train file
# # first line is 'num_samples num_features num_labels'
# for i in range(5):
#     file_name = 'Eurlex/cv_' + str(i) + '_val'
#     with open(file_name, 'w') as file:
#         file.write(str(cv_counts[i]) + ' ' + str(num_features) + ' ' + str(num_labels) + '\n')
        
#     file_name = 'Eurlex/cv_' + str(i) + '_train'
#     with open(file_name, 'w') as file:
#         file.write(str(num_samples_train - cv_counts[i]) + ' ' + str(num_features) +
#                    ' ' + str(num_labels) + '\n')

# with open('Eurlex/eurlex_train.txt', 'r') as f:
#     for i, line in enumerate(f):
#         if(i == 0):
#             continue
#         for j in range(5):
#             if j == cv_idx[i-1]:
#                 file_name = 'Eurlex/cv_' + str(j) + '_val'
#             else:
#                 file_name = 'Eurlex/cv_' + str(j) + '_train'
#             with open(file_name, 'a') as file:
#                 file.write(line)

Training using 5 fold cross validation and grid-search for hyperparameter tuning

In [8]:
# Grid search over CRAFTML hyper-parameters. Every parameter combination is scored by the
# mean label ranking average precision (LRAP) over the 5 cross-validation folds: for each
# fold the craftml binary trains model.m on Eurlex/cv_<i>_train, writes its predictions for
# Eurlex/cv_<i>_val to predictions.txt, and those predictions are compared with the labels
# stored in the validation file.
train_cmd_call = '../craftml-rs/target/release/craftml train --model_path model.m'
# --k_top is set to num_labels, presumably so that every label receives a score.
# The trailing space is intentional: the validation file name is appended directly.
val_cmd_call = '../craftml-rs/target/release/craftml test --out_path predictions.txt --k_top ' + str(num_labels) + ' model.m '
k_clusters_list = [10]
n_trees_list = [100]
cluster_sample_list = [1000]
centroid_min_n_preserve_list = [500]
hyper_parameter_grid = product(k_clusters_list, n_trees_list, cluster_sample_list, centroid_min_n_preserve_list)


def _run_or_raise(cmd):
    """Run ``cmd`` through ``os.system`` and raise RuntimeError on a non-zero status.

    Without this check a failed train/test run would go unnoticed and a predictions.txt
    left over from an earlier run would be scored instead.
    """
    status = os.system(cmd)
    if status != 0:
        raise RuntimeError('command failed (status ' + str(status) + '): ' + cmd)


def _read_predictions(path, n_rows, n_cols):
    """Parse the prediction file at ``path`` into a dense (n_rows, n_cols) score matrix.

    Every line belongs to one sample and holds tab-separated ``label score`` pairs;
    labels that are not listed on a line keep a score of 0.
    """
    scores = np.zeros((n_rows, n_cols))
    with open(path) as f:
        for row, line in enumerate(f):
            for pair in line.split(sep='\t'):
                if not pair.strip():
                    # tolerate blank lines and trailing tabs instead of crashing on unpack
                    continue
                label, prob = pair.split(sep=' ')
                scores[row, int(label)] = float(prob)
    return scores


def _read_true_labels(path, n_cols, n_rows=None):
    """Build a binary (n_rows, n_cols) label-indicator matrix from the data file at ``path``.

    The first line of the file is the header and is skipped. On every other line the
    first space-separated field is the comma-separated label list; an empty field means
    the sample has no labels. When ``n_rows`` is None the number of data lines is used.
    """
    with open(path, 'r') as f:
        next(f, None)  # skip the header line
        label_fields = [line.split(sep=' ')[0] for line in f]
    if n_rows is None:
        n_rows = len(label_fields)
    indicators = np.zeros((n_rows, n_cols))
    for row, label_string in enumerate(label_fields):
        if label_string == '':
            continue
        for label in label_string.split(sep=','):
            indicators[row, int(label)] = 1
    return indicators


start = time.time()
for k_clusters, n_trees, cluster_size, centroid_min in hyper_parameter_grid:

    # build the command line call to train the model
    cv_cmd_train = (train_cmd_call + ' --k_clusters ' + str(k_clusters) + ' --n_trees ' + str(n_trees) +
                    ' --cluster_sample_size ' + str(cluster_size) + ' --centroid_min_n_preserve ' +
                    str(centroid_min) + ' --n_feature_buckets 1000 --n_label_buckets 1000')
    # results for each cv-fold
    param_results = np.zeros(5)
    for i in range(5):
        train_file_name = 'Eurlex/cv_' + str(i) + '_train'
        val_file_name = 'Eurlex/cv_' + str(i) + '_val'
        # train on this fold's training file; stop if the binary reports a failure
        cv_ith_cmd_train = cv_cmd_train + ' ' + train_file_name
        _run_or_raise(cv_ith_cmd_train)
        # predict on this fold's validation file (output goes to predictions.txt)
        cv_ith_cmd_val = val_cmd_call + val_file_name
        _run_or_raise(cv_ith_cmd_val)
        # cv_counts[i] is the number of samples assigned to fold i, i.e. the validation size
        predictions = _read_predictions('predictions.txt', cv_counts[i], num_labels)
        true_labels = _read_true_labels(val_file_name, num_labels, cv_counts[i])
        # take the LRAP score
        lrap_cv = label_ranking_average_precision_score(true_labels, predictions)
        print(k_clusters, n_trees, i, lrap_cv)
        param_results[i] = lrap_cv

    # key format kept unchanged (including the trailing ' 2') so stored results stay comparable
    dict_key = str(k_clusters) + ',' + str(n_trees) + ',' + str(cluster_size) + ',' + str(centroid_min) + ' 2'
    results[dict_key] = param_results.mean()
print('time:', time.time() -start)

10 100 0 0.5606432932442618
10 100 1 0.5668999217640313
10 100 2 0.5665334665663533
10 100 3 0.5735650490041243
10 100 4 0.5673084160012928
time: 405.85119247436523


In [9]:
# Display the mean LRAP stored for each hyper-parameter key by the grid search.
results

{'10,100,1000,500 2': 0.5669900293160126}

Testing dev dataset on the best hyperparameters

In [12]:
# best parameters: 200 trees, max_leaf 10, centroid preserve 500, cluster sample size 1000
# n_feature buckets 1000, n_label_buckets 1000 (these last two are cited in the paper to be the best for the
# Eurlex 4K dataset)
# NOTE(review): the grid-search cell in this notebook passes --k_clusters, whereas this
# command passes --leaf_max_size 10; confirm which flag the tuned value of 10 belongs to.
#
# Trains on the full training file, predicts on Eurlex/dev.txt and reports the label
# ranking average precision (LRAP) together with the elapsed wall-clock time.
train_cmd_call = ('../craftml-rs/target/release/craftml train --model_path model.m --cluster_sample_size 1000' +
                  ' --centroid_min_n_preserve 500 --leaf_max_size 10 --n_trees 200' +
                  ' --n_feature_buckets 1000 --n_label_buckets 1000 Eurlex/eurlex_train.txt')
val_cmd_call = ('../craftml-rs/target/release/craftml test --out_path predictions.txt --k_top ' +
                str(num_labels) + ' model.m Eurlex/dev.txt')


def _run_or_raise(cmd):
    """Run ``cmd`` through ``os.system`` and raise RuntimeError on a non-zero status.

    Without this check a failed train/test run would go unnoticed and a predictions.txt
    left over from an earlier run would be scored instead.
    """
    status = os.system(cmd)
    if status != 0:
        raise RuntimeError('command failed (status ' + str(status) + '): ' + cmd)


def _read_predictions(path, n_rows, n_cols):
    """Parse the prediction file at ``path`` into a dense (n_rows, n_cols) score matrix.

    Every line belongs to one sample and holds tab-separated ``label score`` pairs;
    labels that are not listed on a line keep a score of 0.
    """
    scores = np.zeros((n_rows, n_cols))
    with open(path) as f:
        for row, line in enumerate(f):
            for pair in line.split(sep='\t'):
                if not pair.strip():
                    # tolerate blank lines and trailing tabs instead of crashing on unpack
                    continue
                label, prob = pair.split(sep=' ')
                scores[row, int(label)] = float(prob)
    return scores


def _read_true_labels(path, n_cols, n_rows=None):
    """Build a binary (n_rows, n_cols) label-indicator matrix from the data file at ``path``.

    The first line of the file is the header and is skipped. On every other line the
    first space-separated field is the comma-separated label list; an empty field means
    the sample has no labels. When ``n_rows`` is None the number of data lines is used.
    """
    with open(path, 'r') as f:
        next(f, None)  # skip the header line
        label_fields = [line.split(sep=' ')[0] for line in f]
    if n_rows is None:
        n_rows = len(label_fields)
    indicators = np.zeros((n_rows, n_cols))
    for row, label_string in enumerate(label_fields):
        if label_string == '':
            continue
        for label in label_string.split(sep=','):
            indicators[row, int(label)] = 1
    return indicators


start = time.time()
_run_or_raise(train_cmd_call)
_run_or_raise(val_cmd_call)

# Size both matrices from the dev file itself rather than a hard-coded sample count, so
# the cell keeps working if the dev split changes.
true_labels = _read_true_labels('Eurlex/dev.txt', num_labels)
predictions = _read_predictions('predictions.txt', true_labels.shape[0], num_labels)

lrap= label_ranking_average_precision_score(true_labels, predictions)
print(lrap)
print('time:', time.time() - start)

0.5895317770806462
time: 183.3079273700714
