In [3]:
from mulearn import FuzzyInductor
from mulearn.kernel import PrecomputedKernel
from mulearn.fuzzifier import *
from mulearn.optimization import GurobiSolver
import csv
import numpy as np
import statistics
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from mulearn.distributions import *

In [4]:
#Jaccard Similarity experiments
def get_kernel_and_solver(gram):
    eigvals = np.linalg.eigvals(gram)
    assert(sum([abs(e.imag) for e in eigvals]) < 1e-4)
    abs_neg_eigvals = [-l.real for l in eigvals if l < 0]
    adjustment = max(abs_neg_eigvals) if abs_neg_eigvals else 0

    kernel = PrecomputedKernel(gram)
    solver = GurobiSolver(adjustment=adjustment) if adjustment else GurobiSolver()

    return kernel, solver

def get_dataset(filename):
    with open(filename) as data_file:
        data = np.array(list(csv.reader(data_file)))

    n = len(data) - 1

    # ## Extract data names, membership values and Gram matrix

    names = np.array(data[0])[1:n+1]
    mu = np.array([float(row[0]) for row in data[1:n+1]])
    gram = np.load('data/jaccard_similarity.npy')

    assert(len(names.shape) == 1)
    assert(len(mu.shape) == 1)
    assert(len(gram.shape) == 2)
    assert(names.shape[0] == gram.shape[0] == gram.shape[1] == mu.shape[0])

    X = np.array([[x] for x in np.arange(n)])

    return X, gram, mu


data_file_name = 'data/data-tettamanzi-complete.csv'
X, gram, mu = get_dataset(data_file_name)

out_cv = KFold()

k, solver = get_kernel_and_solver(gram)

fuzzifiers = [CrispFuzzifier(), QuantileConstantPiecewiseFuzzifier(), QuantileLinearPiecewiseFuzzifier(), LinearFuzzifier(), ExponentialFuzzifier()]
mean_test_scores = []
stdev_test_scores = []
mean_train_scores = []
stdev_train_scores = []

for fuzzifier in fuzzifiers: 
    test_scores = []
    train_scores = []
    i = 1
    
    fi = FuzzyInductor(k=k, solver=solver, fuzzifier= fuzzifier)

    inner_folds = 5
    rmse = make_scorer(mean_squared_error)
    
    gs = GridSearchCV(fi, {'c': np.logspace(-1, 1, 7)},
                        verbose=0, cv=inner_folds,
                        error_score= np.nan, scoring = rmse, n_jobs=1,
                        pre_dispatch=10, refit = True)

    for train_idx, test_idx in out_cv.split(X):
        X_train = X[train_idx]
        X_test = X[test_idx]
        mu_train = mu[train_idx]
        mu_test = mu[test_idx]

        try:
            gs.fit(X_train, mu_train)
            print(f"fold {i}: best parameters: {gs.best_params_['c']}")
            train_score = gs.score(X_train, mu_train)
            test_score = gs.score(X_test, mu_test)
            print(f'fold {i}: train score {train_score:.2f}, test score {test_score:.2f}')
            test_scores.append(test_score)
            train_scores.append(train_score)
            i += 1
        except ValueError as e:
            print(e)
            test_scores.append(np.nan)
            train_scores.append(np.nan)
            i += 1
            continue
        
    mean_test_scores.append(np.nanmean(test_scores))
    mean_train_scores.append(np.nanmean(train_scores))
    stdev_test_scores.append(np.nanstd(test_scores))
    stdev_train_scores.append(np.nanstd(train_scores))

fold 1: best parameters: 4.6415888336127775
fold 1: train score 0.22, test score 0.11
fold 2: best parameters: 4.6415888336127775
fold 2: train score 0.24, test score 0.42
fold 3: best parameters: 1.0
fold 3: train score 0.21, test score 0.10
fold 4: best parameters: 1.0
fold 4: train score 0.21, test score 0.56
fold 5: best parameters: 4.6415888336127775
fold 5: train score 0.20, test score 0.16
fold 1: best parameters: 0.1
fold 1: train score 0.11, test score 0.06
fold 2: best parameters: 0.46415888336127786
fold 2: train score 0.10, test score 0.19
fold 3: best parameters: 4.6415888336127775
fold 3: train score 0.12, test score 0.06
fold 4: best parameters: 1.0
fold 4: train score 0.10, test score 0.17
fold 5: best parameters: 10.0
fold 5: train score 0.11, test score 0.12
fold 1: best parameters: 0.1
fold 1: train score 0.10, test score 0.06
fold 2: best parameters: 10.0
fold 2: train score 0.09, test score 0.15
fold 3: best parameters: 1.0
fold 3: train score 0.11, test score 0.06

In [5]:
import pandas as pd

rmse_test = []
for i in range(len(mean_test_scores)):
    rmse_test.append(str(round(mean_test_scores[i],5)) + " +/- " + str(2*round(stdev_test_scores[i],3)))

rmse_train = []
for i in range(len(mean_train_scores)):
    rmse_train.append(str(round(mean_train_scores[i],5)) + ' +/- ' + str(2*round(stdev_train_scores[i],3)))

d = {'RMSE test' : rmse_test, 'RMSE train' : rmse_train}
df = pd.DataFrame(d, index = ['CrispFuzzifier', 'QuantileConstantPiecewiseFuzzifier', 'QuantileLinearPiecewiseFuzzifier','LinearFuzzifier','ExponentialFuzzifier'])
df.head()

Unnamed: 0,RMSE test,RMSE train
CrispFuzzifier,0.26974 +/- 0.37,0.21694 +/- 0.024
QuantileConstantPiecewiseFuzzifier,0.11759 +/- 0.108,0.10881 +/- 0.02
QuantileLinearPiecewiseFuzzifier,0.1032 +/- 0.08,0.09935 +/- 0.014
LinearFuzzifier,0.10167 +/- 0.054,0.17326 +/- 0.018
ExponentialFuzzifier,0.14603 +/- 0.17,0.13391 +/- 0.02


In [6]:
def get_dataset(filename,file_gram):
    with open(filename) as data_file:
        data = np.array(list(csv.reader(data_file)))

    n = len(data) - 1

    # ## Extract data names, membership values and Gram matrix

    names = np.array(data[0])[1:n+1]
    mu = np.array([float(row[0]) for row in data[1:n+1]])
    gram = np.load(file_gram)

    assert(len(names.shape) == 1)
    assert(len(mu.shape) == 1)
    assert(len(gram.shape) == 2)
    assert(names.shape[0] == gram.shape[0] == gram.shape[1] == mu.shape[0])

    X = np.array([[x] for x in np.arange(n)])

    return X, gram, mu

def make_experiments(kernel,solver,fuzzifiers):
    mean_test_scores = []
    stdev_test_scores = []
    mean_train_scores = []
    stdev_train_scores = []
    
    for fuzzifier in fuzzifiers: 
        test_scores = []
        train_scores = []
        i = 1

        fi = FuzzyInductor(k=kernel, solver=solver, fuzzifier= fuzzifier)

        inner_folds = 5
        rmse = make_scorer(mean_squared_error)

        gs = GridSearchCV(fi, {'c': np.logspace(-1, 1, 7)},
                            verbose=0, cv=inner_folds,
                            error_score= np.nan, scoring = rmse, n_jobs=1,
                            pre_dispatch=10, refit = True)

        for train_idx, test_idx in out_cv.split(X):
            X_train = X[train_idx]
            X_test = X[test_idx]
            mu_train = mu[train_idx]
            mu_test = mu[test_idx]

            try:
                gs.fit(X_train, mu_train)
                print(f"fold {i}: best parameters: {gs.best_params_['c']}")
                train_score = gs.score(X_train, mu_train)
                test_score = gs.score(X_test, mu_test)
                print(f'fold {i}: train score {train_score:.2f}, test score {test_score:.2f}')
                test_scores.append(test_score)
                train_scores.append(train_score)
                i += 1
            except ValueError as e:
                print(e)
                test_scores.append(np.nan)
                train_scores.append(np.nan)
                i += 1
                continue

        mean_test_scores.append(np.nanmean(test_scores))
        mean_train_scores.append(np.nanmean(train_scores))
        stdev_test_scores.append(np.nanstd(test_scores))
        stdev_train_scores.append(np.nanstd(train_scores))
        
    return mean_test_scores, mean_train_scores, stdev_test_scores, stdev_train_scores

In [None]:
#Length-Based Similarity experiments
x, gram, mu = get_dataset('data/data-tettamanzi-complete.csv', 'data/length_distance.npy')
kernel, solver = get_kernel_and_solver(gram)
mtest, mtrain, stdevtest, stdevtrain = make_experiments(kernel,solver,fuzzifiers)

In [28]:
rmse_test = []
for i in range(len(mean_test_scores)):
    rmse_test.append(str(round(mtest[i],5)) + " +/- " + str(2*round(stdevtest[i],3)))

rmse_train = []
for i in range(len(mean_train_scores)):
    rmse_train.append(str(round(mtrain[i],3)) + ' +/- ' + str(2*round(stdevtrain[i],3)))

d = {'RMSE test' : rmse_test, 'RMSE train' : rmse_train}
df = pd.DataFrame(d, index = ['CrispFuzzifier', 'QuantileConstantPiecewiseFuzzifier', 'QuantileLinearPiecewiseFuzzifier','LinearFuzzifier','ExponentialFuzzifier'])
df.head()

Unnamed: 0,RMSE test,RMSE train
CrispFuzzifier,0.38573 +/- 0.466,0.391 +/- 0.108
QuantileConstantPiecewiseFuzzifier,0.31572 +/- 0.288,0.31 +/- 0.058
QuantileLinearPiecewiseFuzzifier,0.30646 +/- 0.252,0.3 +/- 0.052
LinearFuzzifier,nan +/- nan,nan +/- nan
ExponentialFuzzifier,nan +/- nan,nan +/- nan


In [13]:
#Hamming distance Similarity experiments
x, gram, mu = get_dataset('data/data-tettamanzi-complete.csv', 'data/hamming_distance.npy')
kernel, solver = get_kernel_and_solver(gram)
mtest, mtrain, stdevtest, stdevtrain = make_experiments(kernel,solver,fuzzifiers)

fold 1: best parameters: 0.1
fold 1: train score 0.73, test score 0.85
fold 2: best parameters: 0.46415888336127786
fold 2: train score 0.79, test score 0.59
fold 3: best parameters: 0.1
fold 3: train score 0.73, test score 0.19
fold 4: best parameters: 0.46415888336127786
fold 4: train score 0.61, test score 0.56
fold 5: best parameters: 0.1
fold 5: train score 0.70, test score 0.55
fold 1: best parameters: 1.0
fold 1: train score 0.39, test score 0.47
fold 2: best parameters: 10.0
fold 2: train score 0.48, test score 0.35
fold 3: best parameters: 10.0
fold 3: train score 0.46, test score 0.35
fold 4: best parameters: 0.1
fold 4: train score 0.45, test score 0.18
fold 5: best parameters: 0.21544346900318834
fold 5: train score 0.46, test score 0.44
fold 1: best parameters: 1.0
fold 1: train score 0.36, test score 0.46
fold 2: best parameters: 0.21544346900318834
fold 2: train score 0.45, test score 0.33
fold 3: best parameters: 10.0
fold 3: train score 0.43, test score 0.40
fold 4: be

  keepdims=keepdims)


`x0` is infeasible.
`x0` is infeasible.
`x0` is infeasible.
`x0` is infeasible.
`x0` is infeasible.


  keepdims=keepdims)


In [24]:
rmse_test = []
for i in range(len(mean_test_scores)):
    rmse_test.append(str(round(mtest[i],5)) + " +/- " + str(2*round(stdevtest[i],3)))

rmse_train = []
for i in range(len(mean_train_scores)):
    rmse_train.append(str(round(mtrain[i],3)) + ' +/- ' + str(2*round(stdevtrain[i],3)))
    
d = {'RMSE test' : rmse_test, 'RMSE train' : rmse_train}
df = pd.DataFrame(d, index = ['CrispFuzzifier', 'QuantileConstantPiecewiseFuzzifier', 'QuantileLinearPiecewiseFuzzifier','LinearFuzzifier','ExponentialFuzzifier'])
df.head()

Unnamed: 0,RMSE test,RMSE train
CrispFuzzifier,0.54868 +/- 0.418,0.713 +/- 0.118
QuantileConstantPiecewiseFuzzifier,0.35935 +/- 0.198,0.448 +/- 0.064
QuantileLinearPiecewiseFuzzifier,0.35008 +/- 0.162,0.42 +/- 0.06
LinearFuzzifier,nan +/- nan,nan +/- nan
ExponentialFuzzifier,nan +/- nan,nan +/- nan


In [25]:
#Levenshtein distance Similarity experiments
x, gram, mu = get_dataset('data/data-tettamanzi-complete.csv', 'data/levenshtein_distance.npy')
kernel, solver = get_kernel_and_solver(gram)
mtest, mtrain, stdevtest, stdevtrain = make_experiments(kernel,solver,fuzzifiers)

fold 1: best parameters: 0.1
fold 1: train score 0.75, test score 0.38
fold 2: best parameters: 0.21544346900318834
fold 2: train score 0.75, test score 0.52
fold 3: best parameters: 0.1
fold 3: train score 0.67, test score 0.22
fold 4: best parameters: 0.21544346900318834
fold 4: train score 0.78, test score 0.59
fold 5: best parameters: 0.1
fold 5: train score 0.70, test score 0.80
fold 1: best parameters: 1.0
fold 1: train score 0.40, test score 0.41
fold 2: best parameters: 10.0
fold 2: train score 0.46, test score 0.39
fold 3: best parameters: 10.0
fold 3: train score 0.45, test score 0.39
fold 4: best parameters: 10.0
fold 4: train score 0.41, test score 0.24
fold 5: best parameters: 0.1
fold 5: train score 0.47, test score 0.39
fold 1: best parameters: 10.0
fold 1: train score 0.37, test score 0.43
fold 2: best parameters: 10.0
fold 2: train score 0.44, test score 0.36
fold 3: best parameters: 10.0
fold 3: train score 0.42, test score 0.40
fold 4: best parameters: 10.0
fold 4: t

  keepdims=keepdims)


`x0` is infeasible.
`x0` is infeasible.
`x0` is infeasible.
`x0` is infeasible.
`x0` is infeasible.


  keepdims=keepdims)


In [26]:
rmse_test = []
for i in range(len(mean_test_scores)):
    rmse_test.append(str(round(mtest[i],5)) + " +/- " + str(2*round(stdevtest[i],3)))

rmse_train = []
for i in range(len(mean_train_scores)):
    rmse_train.append(str(round(mtrain[i],3)) + ' +/- ' + str(2*round(stdevtrain[i],3)))
    
d = {'RMSE test' : rmse_test, 'RMSE train' : rmse_train}
df = pd.DataFrame(d, index = ['CrispFuzzifier', 'QuantileConstantPiecewiseFuzzifier', 'QuantileLinearPiecewiseFuzzifier','LinearFuzzifier','ExponentialFuzzifier'])
df.head()

Unnamed: 0,RMSE test,RMSE train
CrispFuzzifier,0.50254 +/- 0.388,0.73 +/- 0.076
QuantileConstantPiecewiseFuzzifier,0.36324 +/- 0.122,0.439 +/- 0.058
QuantileLinearPiecewiseFuzzifier,0.35503 +/- 0.138,0.412 +/- 0.05
LinearFuzzifier,nan +/- nan,nan +/- nan
ExponentialFuzzifier,nan +/- nan,nan +/- nan
