In [1]:
from mulearn import FuzzyInductor
from mulearn.kernel import PrecomputedKernel
from mulearn.fuzzifier import *
from mulearn.optimization import GurobiSolver
import csv
import numpy as np
import statistics
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from mulearn.distributions import *
import os

In [2]:
#Length-based Similarity experiments (generating matrix)
def get_kernel_and_solver(gram):
    eigvals = np.linalg.eigvals(gram)
    assert(sum([abs(e.imag) for e in eigvals]) < 1e-4)
    abs_neg_eigvals = [-l.real for l in eigvals if l < 0]
    adjustment = max(abs_neg_eigvals) if abs_neg_eigvals else 0

    kernel = PrecomputedKernel(gram)
    solver = GurobiSolver(adjustment=adjustment) if adjustment else GurobiSolver()

    return kernel, solver

def _length_distance(ax1, ax2):
    return abs(len(ax1) - len(ax2)) / max(len(ax1), len(ax2))



def length_distance(ax1, ax2):
    sign_negated = 1

    ax1_clean = ax1[2:-1]
    if ax1_clean[0] == '-':
        sign_negated *= -1
        ax1_clean = ax1_clean[1:]

    ax2_clean = ax2[2:-1]
    if ax2_clean[0] == '-':
        sign_negated *= -1
        ax2_clean = ax2_clean[1:]
    
    e = _length_distance(ax1_clean, ax2_clean)
    
    assert(0 <= e <= 1)
    
    return e if sign_negated == 1 else 1-e


def get_data_matrix(file, name, function,names):
    if os.path.isfile(file):
        print('revrieving cached {} data matrix'.format(name))
        data_matrix = np.load(file)
    else:
        print('generating and caching {} data matrix'
              ' (could take considerable time)...'.format(name), end=' ')
        data_matrix = np.array([[function(ax1, ax2)
                                for ax1 in names] for ax2 in names])
        np.save(file, data_matrix)
        print('done!')
    return data_matrix

def get_dataset(filename):
    with open(filename) as data_file:
        data = np.array(list(csv.reader(data_file)))

    n = len(data) - 1

    # ## Extract data names, membership values and Gram matrix

    names = np.array(data[0])[1:n+1]
    mu = np.array([float(row[0]) for row in data[1:n+1]])
    gram = get_data_matrix('length_distance.npy','length_distance',length_distance,names)

    assert(len(names.shape) == 1)
    assert(len(mu.shape) == 1)
    assert(len(gram.shape) == 2)
    assert(names.shape[0] == gram.shape[0] == gram.shape[1] == mu.shape[0])

    X = np.array([[x] for x in np.arange(n)])

    return X, gram, mu,names


data_file_name = 'data/data-tettamanzi-complete.csv'
X, gram, mu, names = get_dataset(data_file_name)

out_cv = KFold()

k, solver = get_kernel_and_solver(gram)

fuzzifiers = [CrispFuzzifier(), QuantileConstantPiecewiseFuzzifier(), QuantileLinearPiecewiseFuzzifier(), LinearFuzzifier(), ExponentialFuzzifier(profile = 'alpha', alpha = 0.07)]
mean_test_scores = []
variance_test_scores = []
mean_train_scores = []
variance_train_scores = []

for fuzzifier in fuzzifiers: 
    test_scores = []
    train_scores = []
    i = 1
    
    fi = FuzzyInductor(k=k, solver=solver, fuzzifier= fuzzifier)

    inner_folds = 5
    rmse = make_scorer(mean_squared_error)
    
    gs = GridSearchCV(fi, {'c': np.logspace(-3, 3, 7)},
                        verbose=0, cv=inner_folds,
                        error_score= np.nan, scoring = rmse, n_jobs= 1,
                        pre_dispatch=10, refit = True)

    for train_idx, test_idx in out_cv.split(X):
        X_train = X[train_idx]
        X_test = X[test_idx]
        mu_train = mu[train_idx]
        mu_test = mu[test_idx]

        try:
            gs.fit(X_train, mu_train)
            print(f"fold {i}: best parameters: {gs.best_params_['c']}")
            train_score = gs.score(X_train, mu_train)
            test_score = gs.score(X_test, mu_test)
            print(f'fold {i}: train score {train_score:.2f}, test score {test_score:.2f}')
            test_scores.append(test_score)
            train_scores.append(train_score)
            i += 1
        except ValueError as e:
            print(e)
            test_scores.append(np.nan)
            train_scores.append(np.nan)
            i += 1
            continue
        
    mean_test_scores.append(np.nanmean(test_scores))
    mean_train_scores.append(np.nanmean(train_scores))
    variance_test_scores.append(np.nanvar(test_scores))
    variance_train_scores.append(np.nanvar(train_scores))

generating and caching length_distance data matrix (could take considerable time)... 

KeyboardInterrupt: 

In [14]:
import pandas as pd

d = {'RMSE test' : mean_test_scores, 'test variance': variance_test_scores, 'RMSE train' : mean_train_scores, 'train variance' : variance_train_scores}
df = pd.DataFrame(d, index = ['CrispFuzzifier', 'QuantileConstantPiecewiseFuzzifier', 'QuantileLinearPiecewiseFuzzifier','LinearFuzzifier', 'ExponentialFuzzifier'])
df.head()

Unnamed: 0,RMSE test,test variance,RMSE train,train variance
CrispFuzzifier,0.323416,0.030826,0.558929,0.010162
QuantileConstantPiecewiseFuzzifier,0.217993,0.000641,0.382215,0.003953
QuantileLinearPiecewiseFuzzifier,0.223077,0.000873,0.366874,0.003613
LinearFuzzifier,,,,
ExponentialFuzzifier,,,,


In [15]:
##Hamming distance based Similarity experiments(generating matrix)
import os
from nltk.metrics.distance import edit_distance as _edit_distance

def get_kernel_and_solver(gram):
    eigvals = np.linalg.eigvals(gram)
    assert(sum([abs(e.imag) for e in eigvals]) < 1e-4)
    abs_neg_eigvals = [-l.real for l in eigvals if l < 0]
    adjustment = max(abs_neg_eigvals) if abs_neg_eigvals else 0

    kernel = PrecomputedKernel(gram)
    solver = GurobiSolver(adjustment=adjustment) if adjustment else GurobiSolver()

    return kernel, solver

def hamming(ax1, ax2):
    sign_negated = 1

    ax1_clean = ax1[2:-1]
    if ax1_clean[0] == '-':
        sign_negated *= -1
        ax1_clean = ax1_clean[1:]

    ax2_clean = ax2[2:-1]
    if ax2_clean[0] == '-':
        sign_negated *= -1
        ax2_clean = ax2_clean[1:]

    pairs = list(zip(ax1_clean, ax2_clean))
    h = sum([ch1 != ch2
             for ch1, ch2 in pairs]) / (min(len(ax1), len(ax2)) - 3)
    # -3 here stands for "do not consider quotes and initial space"
    
    assert(0 <= h <= 1)
    return h if sign_negated == 1 else 1-h


def get_data_matrix(file, name, function,names):
    if os.path.isfile(file):
        print('revrieving cached {} data matrix'.format(name))
        data_matrix = np.load(file)
    else:
        print('generating and caching {} data matrix'
              ' (could take considerable time)...'.format(name), end=' ')
        data_matrix = np.array([[function(ax1, ax2)
                                for ax1 in names] for ax2 in names])
        np.save(file, data_matrix)
        print('done!')
    return data_matrix

def get_dataset(filename):
    with open(filename) as data_file:
        data = np.array(list(csv.reader(data_file)))

    n = len(data) - 1

    # ## Extract data names, membership values and Gram matrix

    names = np.array(data[0])[1:n+1]
    mu = np.array([float(row[0]) for row in data[1:n+1]])
    gram = get_data_matrix('hamming_distance.npy','hamming_distance',hamming,names)

    assert(len(names.shape) == 1)
    assert(len(mu.shape) == 1)
    assert(len(gram.shape) == 2)
    assert(names.shape[0] == gram.shape[0] == gram.shape[1] == mu.shape[0])

    X = np.array([[x] for x in np.arange(n)])

    return X, gram, mu,names


data_file_name = 'data/data-tettamanzi-complete.csv'
X, gram, mu, names = get_dataset(data_file_name)

out_cv = KFold()

k, solver = get_kernel_and_solver(gram)

fuzzifiers = [CrispFuzzifier(), QuantileConstantPiecewiseFuzzifier(), QuantileLinearPiecewiseFuzzifier(), LinearFuzzifier(), ExponentialFuzzifier(profile = 'alpha', alpha = 0.07)]
mean_test_scores = []
variance_test_scores = []
mean_train_scores = []
variance_train_scores = []

for fuzzifier in fuzzifiers: 
    test_scores = []
    train_scores = []
    i = 1
    
    fi = FuzzyInductor(k=k, solver=solver, fuzzifier= fuzzifier)

    inner_folds = 5
    rmse = make_scorer(mean_squared_error)
    
    gs = GridSearchCV(fi, {'c': np.logspace(-3, 3, 7)},
                        verbose=0, cv=inner_folds,
                        error_score= np.nan, scoring = rmse, n_jobs=-1,
                        pre_dispatch=10, refit = True)

    for train_idx, test_idx in out_cv.split(X):
        X_train = X[train_idx]
        X_test = X[test_idx]
        mu_train = mu[train_idx]
        mu_test = mu[test_idx]

        try:
            gs.fit(X_train, mu_train)
            print(f"fold {i}: best parameters: {gs.best_params_['c']}")
            train_score = gs.score(X_train, mu_train)
            test_score = gs.score(X_test, mu_test)
            print(f'fold {i}: train score {train_score:.2f}, test score {test_score:.2f}')
            test_scores.append(test_score)
            train_scores.append(train_score)
            i += 1
        except ValueError as e:
            print(e)
            test_scores.append(np.nan)
            train_scores.append(np.nan)
            i += 1
            continue
        
    mean_test_scores.append(np.nanmean(test_scores))
    mean_train_scores.append(np.nanmean(train_scores))
    variance_test_scores.append(np.nanvar(test_scores))
    variance_train_scores.append(np.nanvar(train_scores))

generating and caching hamming_distance data matrix (could take considerable time)... done!
fold 1: best parameters: 0.1
fold 1: train score 0.73, test score 0.85
fold 2: best parameters: 1.0
fold 2: train score 0.73, test score 0.58
fold 3: best parameters: 0.01
fold 3: train score 0.65, test score 0.19
fold 4: best parameters: 1.0
fold 4: train score 0.59, test score 0.57
fold 5: best parameters: 0.01
fold 5: train score 0.73, test score 0.23
fold 1: best parameters: 1.0
fold 1: train score 0.39, test score 0.47
fold 2: best parameters: 1000.0
fold 2: train score 0.46, test score 0.37
fold 3: best parameters: 1000.0
fold 3: train score 0.45, test score 0.33
fold 4: best parameters: 0.1
fold 4: train score 0.45, test score 0.18
fold 5: best parameters: 0.1
fold 5: train score 0.46, test score 0.36
fold 1: best parameters: 1.0
fold 1: train score 0.36, test score 0.46
fold 2: best parameters: 1000.0
fold 2: train score 0.43, test score 0.34
fold 3: best parameters: 1000.0
fold 3: train



optimal solution not found!
optimal solution not found!
optimal solution not found!
optimal solution not found!
optimal solution not found!




In [16]:
d = {'RMSE test' : mean_test_scores, 'test variance': variance_test_scores, 'RMSE train' : mean_train_scores, 'train variance' : variance_train_scores}
df = pd.DataFrame(d, index = ['CrispFuzzifier', 'QuantileConstantPiecewiseFuzzifier', 'QuantileLinearPiecewiseFuzzifier','LinearFuzzifier', 'ExponentialFuzzifier'])
df.head()

Unnamed: 0,RMSE test,test variance,RMSE train,train variance
CrispFuzzifier,0.484213,0.05994,0.685165,0.003351
QuantileConstantPiecewiseFuzzifier,0.341939,0.008224,0.443161,0.000801
QuantileLinearPiecewiseFuzzifier,0.348922,0.006325,0.415319,0.000676
LinearFuzzifier,,,,
ExponentialFuzzifier,,,,


In [18]:
#Levenshtein distance based Similarity experiments(generating matrix)
import os
from nltk.metrics.distance import edit_distance as _edit_distance

def get_kernel_and_solver(gram):
    eigvals = np.linalg.eigvals(gram)
    assert(sum([abs(e.imag) for e in eigvals]) < 1e-4)
    abs_neg_eigvals = [-l.real for l in eigvals if l < 0]
    adjustment = max(abs_neg_eigvals) if abs_neg_eigvals else 0

    kernel = PrecomputedKernel(gram)
    solver = GurobiSolver(adjustment=adjustment) if adjustment else GurobiSolver()

    return kernel, solver

def edit_distance(ax1, ax2):
    sign_negated = 1

    ax1_clean = ax1[2:-1]
    if ax1_clean[0] == '-':
        sign_negated *= -1
        ax1_clean = ax1_clean[1:]

    ax2_clean = ax2[2:-1]
    if ax2_clean[0] == '-':
        sign_negated *= -1
        ax2_clean = ax2_clean[1:]
    
    e = _edit_distance(ax1_clean, ax2_clean) / (max(len(ax1), len(ax2)) - 3)
    
    # normalization here follows by the fact that the maximal edit
    # distance between two words is the length of the longest word
    # -3 here stands for "do not consider quotes and initial space"
    assert(0 <= e <= 1)
    
    return e if sign_negated == 1 else 1-e


def get_data_matrix(file, name, function,names):
    if os.path.isfile(file):
        print('revrieving cached {} data matrix'.format(name))
        data_matrix = np.load(file)
    else:
        print('generating and caching {} data matrix'
              ' (could take considerable time)...'.format(name), end=' ')
        data_matrix = np.array([[function(ax1, ax2)
                                for ax1 in names] for ax2 in names])
        np.save(file, data_matrix)
        print('done!')
    return data_matrix

def get_dataset(filename):
    with open(filename) as data_file:
        data = np.array(list(csv.reader(data_file)))

    n = len(data) - 1

    # ## Extract data names, membership values and Gram matrix

    names = np.array(data[0])[1:n+1]
    mu = np.array([float(row[0]) for row in data[1:n+1]])
    gram = get_data_matrix('levenshtein_distance.npy','levenshtein_distance', edit_distance, names)

    assert(len(names.shape) == 1)
    assert(len(mu.shape) == 1)
    assert(len(gram.shape) == 2)
    assert(names.shape[0] == gram.shape[0] == gram.shape[1] == mu.shape[0])

    X = np.array([[x] for x in np.arange(n)])

    return X, gram, mu,names


data_file_name = 'data/data-tettamanzi-complete.csv'
X, gram, mu, names = get_dataset(data_file_name)

out_cv = KFold()

k, solver = get_kernel_and_solver(gram)

fuzzifiers = [CrispFuzzifier(), QuantileConstantPiecewiseFuzzifier(), QuantileLinearPiecewiseFuzzifier(), LinearFuzzifier(), ExponentialFuzzifier(profile = 'alpha', alpha = 0.07)]
mean_test_scores = []
variance_test_scores = []
mean_train_scores = []
variance_train_scores = []

for fuzzifier in fuzzifiers: 
    test_scores = []
    train_scores = []
    i = 1
    
    fi = FuzzyInductor(k=k, solver=solver, fuzzifier= fuzzifier)

    inner_folds = 5
    rmse = make_scorer(mean_squared_error)
    
    gs = GridSearchCV(fi, {'c': np.logspace(-3, 3, 7)},
                        verbose=0, cv=inner_folds,
                        error_score= np.nan, scoring = rmse, n_jobs=-1,
                        pre_dispatch=10, refit = True)

    for train_idx, test_idx in out_cv.split(X):
        X_train = X[train_idx]
        X_test = X[test_idx]
        mu_train = mu[train_idx]
        mu_test = mu[test_idx]

        try:
            gs.fit(X_train, mu_train)
            print(f"fold {i}: best parameters: {gs.best_params_['c']}")
            train_score = gs.score(X_train, mu_train)
            test_score = gs.score(X_test, mu_test)
            print(f'fold {i}: train score {train_score:.2f}, test score {test_score:.2f}')
            test_scores.append(test_score)
            train_scores.append(train_score)
            i += 1
        except ValueError as e:
            print(e)
            test_scores.append(np.nan)
            train_scores.append(np.nan)
            i += 1
            continue
        
    mean_test_scores.append(np.nanmean(test_scores))
    mean_train_scores.append(np.nanmean(train_scores))
    variance_test_scores.append(np.nanvar(test_scores))
    variance_train_scores.append(np.nanvar(train_scores))

generating and caching levenshtein_distance data matrix (could take considerable time)... done!
fold 1: best parameters: 0.01
fold 1: train score 0.74, test score 0.38
fold 2: best parameters: 0.1
fold 2: train score 0.74, test score 0.51
fold 3: best parameters: 0.1
fold 3: train score 0.67, test score 0.22
fold 4: best parameters: 0.1
fold 4: train score 0.75, test score 0.54
fold 5: best parameters: 0.1
fold 5: train score 0.70, test score 0.80
fold 1: best parameters: 100.0
fold 1: train score 0.40, test score 0.41
fold 2: best parameters: 1000.0
fold 2: train score 0.44, test score 0.41
fold 3: best parameters: 1000.0
fold 3: train score 0.45, test score 0.39
fold 4: best parameters: 10.0
fold 4: train score 0.41, test score 0.24
fold 5: best parameters: 0.1
fold 5: train score 0.47, test score 0.39
fold 1: best parameters: 100.0
fold 1: train score 0.37, test score 0.43
fold 2: best parameters: 1000.0
fold 2: train score 0.42, test score 0.36
fold 3: best parameters: 1000.0
fold 



optimal solution not found!
optimal solution not found!
optimal solution not found!
optimal solution not found!
optimal solution not found!




In [19]:
d = {'RMSE test' : mean_test_scores, 'test variance': variance_test_scores, 'RMSE train' : mean_train_scores, 'train variance' : variance_train_scores}
df = pd.DataFrame(d, index = ['CrispFuzzifier', 'QuantileConstantPiecewiseFuzzifier', 'QuantileLinearPiecewiseFuzzifier','LinearFuzzifier', 'ExponentialFuzzifier'])
df.head()

Unnamed: 0,RMSE test,test variance,RMSE train,train variance
CrispFuzzifier,0.488398,0.036885,0.722092,0.00091
QuantileConstantPiecewiseFuzzifier,0.368341,0.003982,0.431938,0.000667
QuantileLinearPiecewiseFuzzifier,0.355627,0.004829,0.404547,0.000526
LinearFuzzifier,,,,
ExponentialFuzzifier,,,,
