Semi-supervised Learning for Restricted Datasets
==

In [None]:
# Imports
import numpy as np
import pandas as pd

import pickle as pk
import os
import sys
import gc
import warnings

from sklearn.base import clone as skclone
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

In [None]:
from sslearn.wrapper import TriTraining, WiWTriTraining, DemocraticCoLearning, CoTrainingByCommittee, CoForest

In [None]:
from sklearn.metrics import (accuracy_score, 
                             f1_score, 
                             hamming_loss, 
                             jaccard_score,
                             precision_score, 
                             recall_score,
                             cohen_kappa_score,
                             roc_auc_score)

In [None]:
# Metrics computed for every experiment, keyed by the name under which the
# score is stored. Each value is a tuple (metric function, mode, kwargs):
# fit_score calls metric(y_true, y_score, **kwargs), where y_score is the
# predicted labels when mode == "pred" and the predict_proba output otherwise.
# Only accuracy is active; the remaining entries are kept disabled below.
global_scores = {"acc":         (accuracy_score,   "pred",  {}),
#                  "f1_micro":        (f1_score,          "pred", {"average": "micro"}),
#                  "f1_macro":        (f1_score,          "pred", {"average": "macro"}),
#                  "hamming":         (hamming_loss,      "pred", {}),
#                  "jacc_micro":      (jaccard_score,     "pred", {"average": "micro"}),
#                  "jacc_macro":      (jaccard_score,     "pred", {"average": "macro"}),
#                  "precision_micro": (precision_score,   "pred", {"average": "micro"}),
#                  "precision_macro": (precision_score,   "pred", {"average": "macro"}),
#                  "racall_micro":    (recall_score,      "pred", {"average": "micro"}),
#                  "racall_macro":    (recall_score,      "pred", {"average": "macro"}),
#                  "cohen_kappa":     (cohen_kappa_score, "pred", {}) 
                }

In [None]:
# Result files already present on disk; experimenter() skips any experiment
# whose filename appears in this list unless it is asked to overwrite.
# The output directory is created first so that a fresh checkout does not fail
# here (os.listdir) or later when results are written into "results/".
os.makedirs("results", exist_ok=True)
experiment_done = [entry for entry in os.listdir("results") if "Test.pkl" in entry]

### Load datasets

In [None]:
# "files" is a pickle holding the pairs of CSV paths to load.
# NOTE: unpickling executes arbitrary code; only load this file from a trusted source.
with open("files", "rb") as f:
    data_pairs = pk.load(f)

# Column layout assigned to the header-less MN2 files: 1280 features + label.
mn2_columns = [f"MN2_{i}" for i in range(1280)] + ["Labels"]

datasets = {}

for pair in data_pairs:
    # Identifier is the text between "/h2_" and the next "." in the first path.
    data_identifier = pair[0].split("/h2_")[1].split(".")[0]
    is_mn2 = "MN2" in data_identifier

    # The second path of the pair is read first and comes first in the result.
    halves = []
    for path in (pair[1], pair[0]):
        if is_mn2:
            half = pd.read_csv(path, header=None)
            half.columns = list(mn2_columns)
            # MN2 labels are shifted by one.
            half["Labels"] += 1
        else:
            half = pd.read_csv(path)
        halves.append(half)

    merged = pd.concat(halves, axis=0, ignore_index=True)
    merged["Labels"] = merged["Labels"].astype("int")
    datasets[data_identifier] = merged

# Add frames to MN2: copy the Frames column of every "<prefix>_RGB" dataset
# into the matching "<prefix>_MN2" dataset.
prefixes = {"_".join(key.split("_")[:-1]) for key in datasets}
for prefix in prefixes:
    datasets[prefix + "_MN2"]["Frames"] = datasets[prefix + "_RGB"].Frames

# Only MN2
datasets = {key: frame for key, frame in datasets.items() if "MN2" in key}

In [None]:
def separate_dataset(dataframe: pd.DataFrame, unlabel="second-half", ssl=True):
    """Split a dataset by frame number into a training part and a held-out part.

    The held-out part is selected with the ``Frames`` column: rows whose frame
    is at most ``Frames.max() // 2`` for ``"first-half"``, or greater than that
    value for ``"second-half"``.

    Args:
        dataframe: Data with a ``Labels`` column, a ``Frames`` column and any
            number of feature columns.
        unlabel: Which half is held out, ``"first-half"`` or ``"second-half"``.
        ssl: If True, every row stays in the training data and the labels of
            the held-out rows are replaced by -1. If False, the held-out rows
            are removed from the training data instead.

    Returns:
        Tuple ``(X, y_unlabel, U, Uy, G, Gy)``: training features, training
        labels, held-out features, held-out true labels, training frames and
        held-out frames.

    Raises:
        ValueError: If ``unlabel`` is not one of the two accepted values or
            ``dataframe`` has no ``Frames`` column.
    """
    if unlabel not in ("first-half", "second-half"):
        # The original returned the exception object instead of raising it.
        raise ValueError(f"unlabel must be 'first-half' or 'second-half', given '{unlabel}'")
    if "Frames" not in dataframe.columns:
        # Without frames there is nothing to split on; fail with a clear
        # message instead of an unrelated AttributeError further down.
        raise ValueError("dataframe must contain a 'Frames' column to be split")

    feature_mask = (dataframe.columns != "Labels") & (dataframe.columns != "Frames")
    X = dataframe.loc[:, feature_mask]
    y = dataframe["Labels"]
    G = dataframe["Frames"]

    half = G.max() // 2
    condition = G <= half if unlabel == "first-half" else G > half

    # Held-out part: always taken from the full data, before any filtering.
    U = X[condition]
    Uy = y[condition]
    Gy = G[condition]

    if ssl:
        # Work on a copy so the caller's Labels column is left untouched.
        y_unlabel = y.copy()
        y_unlabel[condition] = -1
    else:
        X = X[~condition]
        G = G[~condition]
        y_unlabel = y[~condition]

    return X, y_unlabel, U, Uy, G, Gy

### Train and score functions

In [None]:
def fit_score(identifier, classifier, ssl=False, wiw=True, test="second-half"):
    """Train a clone of ``classifier`` on one dataset and score the held-out half.

    Args:
        identifier: Key into the module-level ``datasets`` dict. The text after
            the last "_" is reported as the feature tag and the text before
            "_<tag>" as the dataset name.
        classifier: Estimator to clone and fit; the passed object itself is
            not fitted.
        ssl: Forwarded to ``separate_dataset``.
        wiw: If True, the frame groups are passed as an extra positional
            argument to both ``fit`` and ``predict``.
        test: Which half is held out; forwarded to ``separate_dataset``.

    Returns:
        Tuple ``(name, features, result)`` where ``result`` has the keys
        ``"clf"`` (value returned by ``fit``), ``"scores"`` (one entry per
        metric in ``global_scores``), ``"pred"`` and ``"proba"``.
    """
    features = identifier.split("_")[-1]
    name = identifier.split("_" + features)[0]

    X, y, U, Uy, G, Gy = separate_dataset(datasets[identifier], test, ssl)

    # Group-aware estimators receive the frame groups as a trailing argument.
    fit_args = (X, y, G) if wiw else (X, y)
    predict_args = (U, Gy) if wiw else (U,)

    fitted = skclone(classifier).fit(*fit_args)
    y_pred = fitted.predict(*predict_args)
    y_proba = fitted.predict_proba(U)

    scores = {}
    for score_name, (metric, mode, metric_kwargs) in global_scores.items():
        # "pred" metrics compare labels; any other mode gets the probabilities.
        y_score = y_pred if mode == "pred" else y_proba
        scores[score_name] = metric(Uy, y_score, **metric_kwargs)

    result = {"clf": fitted, "scores": scores, "pred": y_pred, "proba": y_proba}
    return name, features, result
    

## Experimenter

In [None]:
def experimenter(method, filename, ssl=False, test="second-half", wiw=True, overwrite=False, models=True):
    """Run ``method`` on every dataset and pickle the results to ``results/<filename>``.

    Args:
        method: Estimator to evaluate. When ``models`` is truthy its
            ``base_estimator`` parameter is set in place to each base model.
        filename: Name of the output pickle inside ``results/``.
        ssl, test, wiw: Forwarded to ``fit_score``.
        overwrite: If False, nothing is computed when ``filename`` is already
            listed in the module-level ``experiment_done``.
        models: If truthy, evaluate ``method`` once per built-in base model
            (C45, 5NN, NB, LR). Otherwise evaluate it once, unchanged, under
            the key ``"Default"``.

    Returns:
        None. The nested dict ``{name: {features: {model_name: result}}}`` is
        only written to disk.
    """
    if models:
        models = {"C45": DecisionTreeClassifier(random_state=0),
                  "5NN": KNeighborsClassifier(n_jobs=-1),
                  "NB": GaussianNB(),
                  "LR": LogisticRegression(random_state=0, n_jobs=-1)}
    else:
        models = {"Default": None}

    # Skip experiments whose result file already exists, unless forced.
    if not overwrite and filename in experiment_done:
        return

    print("Calculating:", filename)
    resultados = {}
    for dataset in datasets:
        for model_name, model in models.items():
            if model is not None:
                method.set_params(base_estimator=model)
            name, features, result = fit_score(dataset, method, ssl=ssl, wiw=wiw, test=test)
            resultados.setdefault(name, {}).setdefault(features, {})[model_name] = result

    with open("results/" + filename, "wb") as f:
        pk.dump(resultados, f)

### Generate experiments

In [None]:
# Methods that do not use the generic base models of experimenter().
# Keyed by the model name used in the experiment-generation cell:
#   - a non-None value is passed as the first positional argument of the
#     method's constructor (together with random_state=0);
#   - None means the method is built with its default constructor.
# The launch cell runs every experiment whose model name is listed here with
# models=False, i.e. once, without swapping base estimators.
special_base_classifier = {"DemoCo": [DecisionTreeClassifier(random_state=0), 
                                      GaussianNB(),
                                      KNeighborsClassifier(n_neighbors=3)],
                           "CoBag":BaggingClassifier(
                               base_estimator=DecisionTreeClassifier(random_state=0),
                               random_state=0, n_jobs=-1),
                           "Spreading": None,
                           "Propagation": None
                          }


In [None]:
# Build the list of experiment descriptions consumed by the launch cell.
# Each entry is a dict of keyword arguments for experimenter():
# method, filename, ssl, test and wiw.
assigment = ["Greedy", "Hungarian"]
conflict_over = ["Labeled", "Labeled_plus", "Unlabeled", "All", "None"]
test = {"1Test": "first-half", "2Test": "second-half"}
models = {"Tri": TriTraining, 
          "DemoCo": DemocraticCoLearning, "CoFor": CoForest, "CoBag": CoTrainingByCommittee,
          "Spreading": LabelSpreading, "Propagation": LabelPropagation}
experiments = []
# Name format: Model - Method - Conflict over - Conflict weighted - Test
# The five slots are overwritten in place and joined with "-" to form the
# filename; unused slots are reset to "" so they show up as empty fields.
name = ["", "", "", "", ""]
for model_name, model in models.items():
    name[0] = model_name
    base_experiment = {"method": None, "filename": "", "ssl": False, "test": "", "wiw": True}
    
    # Only models whose name contains "WiW" take the group-aware (wiw) path.
    # None of the keys in `models` above contains "WiW", so with the current
    # dict the `if exp["wiw"]` branch below is never taken.
    if "WiW" not in model_name:
        base_experiment["wiw"] = False
    # NOTE(review): this condition is true for every name except exactly "WiW",
    # so ssl ends up True for all models listed above - confirm this is intended.
    if "Tri" in model_name or model_name != "WiW":
        base_experiment["ssl"] = True
        
    for test_name, test_type in test.items():
        # Shallow copy: one description per (model, held-out half).
        exp = base_experiment.copy()
        exp["test"] = test_type
        name[4] = test_name
        if exp["wiw"]:
            for ass in assigment:
                name[1] = ass
                # Constructor keyword arguments, rebuilt for each assignment method.
                kwards = {"method": ass.lower()}
                if model_name == "WiWTri":
                    for co in conflict_over:
                        kwards["conflict_over"] = co.lower()
                        name[2] = co
                        if co != "None":
                            # One experiment per weighting option.
                            for weighted in [True, False]:
                                kwards["conflict_weighted"] = weighted
                                if weighted:
                                    name[3] = "Weighted"
                                else:
                                    name[3] = "NoWeighted"
                            
                                exp_end = exp.copy()
                                # DummyClassifier is only a placeholder base estimator here.
                                exp_end["method"] = model(base_estimator=DummyClassifier(), random_state=0, **kwards)
                                exp_end["filename"] = "-".join(name)+".pkl"
                                experiments.append(exp_end)
                        else:
                            # NOTE(review): kwards is shared across the conflict_over loop, so it
                            # still carries the "conflict_weighted" value set by the previous
                            # iteration when this branch runs - confirm that is harmless.
                            name[3] = ""
                            exp_end = exp.copy()
                            exp_end["method"] = model(base_estimator=DummyClassifier(), random_state=0, **kwards)
                            exp_end["filename"] = "-".join(name)+".pkl"
                            experiments.append(exp_end)
                else:
                    # Other group-aware models: only the assignment method varies.
                    name[2] = ""
                    name[3] = ""
                    exp_end = exp.copy()
                    exp_end["method"] = model(base_estimator=DummyClassifier(), **kwards)
                    exp_end["filename"] = "-".join(name)+".pkl"
                    experiments.append(exp_end)
        else:
            # Models without the group-aware path: a single experiment per held-out half.
            name[1] = ""
            name[2] = ""
            name[3] = ""
            exp_end = exp.copy()
            if model_name in special_base_classifier:
                if special_base_classifier[model_name] is not None:
                    # Fixed base classifier(s) passed positionally.
                    exp_end["method"] = model(special_base_classifier[model_name], random_state=0)
                else:
                    # No base estimator at all: default constructor.
                    exp_end["method"] = model()
            else:
                # Placeholder base estimator; experimenter() replaces it via
                # set_params when it is called with models=True.
                exp_end["method"] = model(base_estimator=DummyClassifier(), random_state=0)
            exp_end["filename"] = "-".join(name)+".pkl"
            experiments.append(exp_end)
    

### Launch experiments

In [None]:
# Run every generated experiment on the MN2 features.
# Result files are named "MN2<Model>-<Method>-<Conflict over>-<Weighted>-<Test>.pkl".
feature_prefix = "MN2"
for exp in experiments:
    # exp["filename"] is updated in place below, so a previous run of this
    # cell may already have added the prefix. Strip it first so that re-running
    # the cell neither stacks prefixes ("MN2MN2...") nor hides the model name.
    base_filename = exp["filename"]
    if base_filename.startswith(feature_prefix):
        base_filename = base_filename[len(feature_prefix):]

    # Methods listed in special_base_classifier bring their own base estimator
    # (or have none), so they must not be combined with the generic base models.
    # A dedicated name is used so the module-level `models` dict is not clobbered.
    use_base_models = base_filename.split("-")[0] not in special_base_classifier

    exp["filename"] = feature_prefix + base_filename
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        experimenter(overwrite=False, models=use_base_models, **exp)