## **Classification experiments**
To reproduce the classification experiments:
 - Download the CBDs folder from datasets in https://github.com/cbobed/PartMining and the [pretrained transaction embeddings for CBDs](https://drive.google.com/uc?export=download&id=1AKmY40Ws0OV0L-cSCo60pPcjBxp1Haq5)

 - Modify the variables below (if necessary)

   ```
    PATH_DIR_MODELS = "embedingsClassic/"
    PATH_DIR_DATASETS = "CBDs/"
    ```
    with the folders containing the embeddings and the datasets, respectively.

- Run the notebook; it produces a file named `OUTPUT_FILE_NAME` with the accuracy results of `NVAL=10` runs for each dataset in folder `PATH_DIR_DATASETS`.

In [None]:
import numpy as np
import gensim
import os
import re
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from gensim.models import Word2Vec
from scipy.special import softmax
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.preprocessing import normalize
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:

class IteratorDataFile:
    """
    Re-iterable view over a text file that yields one token list per line.

    Every iteration re-opens the file, so the same object can be traversed
    several times. Each line is tokenized with ``word_tokenize``.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded text file to read.
    removecls : bool, optional
        When true, the trailing run of digits of each line (the
        classification, i.e. the last number in the line) is removed
        before the line is tokenized. Defaults to ``False``.
    """

    # Trailing digits at the end of a line; compiled once for all lines.
    _TRAILING_CLS = re.compile(r"[0-9]+$")

    def __init__(self, path, removecls=False):
        self.path = path
        self.removecls = removecls

    def __iter__(self):
        # The context manager guarantees the file handle is released when the
        # generator is exhausted, closed, or an error is raised while reading.
        with open(self.path, encoding="utf8") as data_file:
            for line in data_file:
                if self.removecls:
                    line = self._TRAILING_CLS.sub("", line)
                yield word_tokenize(line)

In [None]:
# Folders containing the pretrained transaction embeddings and the datasets.
PATH_DIR_MODELS = "embedingsClassic/"
PATH_DIR_DATASETS = "CBDs/"

# Embedding dimensions and embedding model names to evaluate, and the number
# of repeated runs performed for each dataset/dimension pair.
DIM = ['50', '100','200']
MODEL = ['w2v', 'Glove']
NVAL = 10

# The results file name carries a timestamp so that successive executions do
# not write into the same file.
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
OUTPUT_FILE_NAME = "resultsClassification" + now + ".txt"

# Write the CSV header; the context manager closes the file even if the
# write fails.
with open(OUTPUT_FILE_NAME, 'a') as out:
    out.write("dataset,model,dim,exp,acc\n")

# Candidate values of K for the K-nearest-neighbours classifier
# (MIN_KNG .. MAX_KNG - 1). They do not depend on the dataset, so they are
# defined once instead of on every loop iteration.
MIN_KNG = 1
MAX_KNG = 21
KNEIGH = range(MIN_KNG, MAX_KNG)

# Stratified 10-fold splitter used for the cross-validation of every run.
skf = StratifiedKFold(n_splits=10)

# Every entry of PATH_DIR_MODELS is treated as a dataset name: its embeddings
# are read from PATH_DIR_MODELS + <name> and its data file from
# PATH_DIR_DATASETS + <name>.
dirs = sorted(os.listdir(PATH_DIR_MODELS))

for dataset_dir in dirs:
    now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    print(now)
    print(dataset_dir)

    models_path = PATH_DIR_MODELS + dataset_dir
    dataset_path = PATH_DIR_DATASETS + dataset_dir
    # List the embeddings folder once; it is filtered for every model/dim pair.
    model_dir_files = os.listdir(models_path)

    # The dataset is the single file whose name has no underscore and ends in
    # ".data" (raw string: "\." is a regex escape, not a string escape).
    file_dataset = [file for file in os.listdir(dataset_path) if re.match(r"[^_]*\.data$", file)]
    assert len(file_dataset) == 1, \
        "expected exactly one dataset file in " + dataset_path + ", found " + str(file_dataset)

    # The class of each transaction is the last token of its line. It does not
    # depend on the embedding dimension, so the file is read once per dataset.
    itemIteratorCls = IteratorDataFile(dataset_path + "/" + file_dataset[0], False)
    cls_embs = np.array([tokens[-1] for tokens in itemIteratorCls])

    for dim in DIM:
        # model filenames: exactly one embeddings file per (model, dim) pair
        file_models = []
        for model in MODEL:
            file_model = [file for file in model_dir_files
                          if re.match(r'.*' + model + r'.*D0?' + dim + r'_.*', file)]
            assert len(file_model) == 1, \
                "expected exactly one " + model + " file of dimension " + dim + \
                " in " + models_path + ", found " + str(file_model)
            print('    ' + file_model[0])
            file_models.append(file_model[0])
        assert len(file_models) == len(MODEL)
        print('     ' + dim)

        # read model files (one embeddings matrix per model, rows = transactions)
        print('    Loading models.....')
        trans_embs_list = [np.load(models_path + "/" + file) for file in file_models]
        for t in trans_embs_list:
            assert t.shape[1] == int(dim), \
                "embedding width " + str(t.shape[1]) + " does not match dimension " + dim

        # The context manager closes the results file even if a run fails.
        with open(OUTPUT_FILE_NAME, 'a') as out:
            for ntest in range(NVAL):
                print('     TEST: ' + str(ntest))
                # indexes (stratify): split 80% for cv and 20% for validation.
                # Row indexes are split (not the matrices) so that the same
                # partition is applied to the embeddings of every model.
                index_cv, index_val, cls_cv, cls_val = train_test_split(
                    np.arange(trans_embs_list[0].shape[0]), cls_embs,
                    test_size=0.2, stratify=cls_embs)
                # embeddings for cv and validation
                trans_embs_cv_list = [t[index_cv] for t in trans_embs_list]
                trans_embs_val_list = [t[index_val] for t in trans_embs_list]

                # perform validation: find best K for K neighbours in validation set
                # indexes (stratify) for validation train/test (70% / 30%)
                index_val_train, index_val_test, cls_val_train, cls_val_test = train_test_split(
                    np.arange(trans_embs_val_list[0].shape[0]), cls_val,
                    test_size=0.3, stratify=cls_val)
                trans_embs_val_train_list = [t[index_val_train] for t in trans_embs_val_list]
                trans_embs_val_test_list = [t[index_val_test] for t in trans_embs_val_list]

                # One fitted classifier per candidate K and per model.
                fit_models_val_list = [[KNeighborsClassifier(n_neighbors=i, p=2).fit(t, cls_val_train)
                                        for i in KNEIGH]
                                       for t in trans_embs_val_train_list]
                scores_models_val_list = [[m.score(t, cls_val_test) for m in models]
                                          for models, t in zip(fit_models_val_list, trans_embs_val_test_list)]
                # list.index returns the first maximum, so ties resolve to the
                # smallest K; adding MIN_KNG converts the position into K.
                best_ks = [sc.index(max(sc)) + MIN_KNG for sc in scores_models_val_list]

                # crossvalidation on the 80% part, using the K selected for each model
                vals_acc = [[] for _ in MODEL]
                for train_index, test_index in skf.split(trans_embs_cv_list[0], cls_cv):
                    trans_embs_train = [embs[train_index] for embs in trans_embs_cv_list]
                    trans_embs_test = [embs[test_index] for embs in trans_embs_cv_list]
                    cls_train = cls_cv[train_index]
                    cls_test = cls_cv[test_index]

                    models = [KNeighborsClassifier(n_neighbors=i, p=2) for i in best_ks]
                    fit_models_list = [m.fit(t, cls_train) for m, t in zip(models, trans_embs_train)]
                    scores_list = [m.score(t, cls_test) for m, t in zip(fit_models_list, trans_embs_test)]
                    for l, s in zip(vals_acc, scores_list):
                        l.append(s)
                    # One CSV row per fold and model; "exp" is the run number.
                    for model, acc in zip(MODEL, scores_list):
                        out.write(",".join([file_dataset[0], model, dim, str(ntest), str(acc)]) + "\n")
                # Mean fold accuracy of this run, one value per model.
                means = [np.mean(l) for l in vals_acc]
                print('     ' + str(means))

