In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

from sklearn import tree
from sklearn.tree import _tree

import time
import sklearn

In [17]:
# path of the CSV file with the wine-quality data
filename = "wine-quality.csv"
data = pd.read_csv(filename)

# display the first 10 rows of data as a quick sanity check
data.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,good
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,bad
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,bad
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,good
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,good
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,bad


In [18]:
# the target column ("quality") goes into y_data
y_data = data["quality"]
# every remaining column is a feature; `data` itself is left untouched
datafeatures = data.drop(columns=["quality"])
datafeatures

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [19]:
# Encode the target as integers: "good" -> 1, "bad" -> 0.
# An explicit mapping followed by astype(int) produces an integer Series directly,
# instead of relying on replace() to downcast an object Series (that downcast is
# deprecated in recent pandas). Any label other than "good"/"bad" maps to NaN and
# makes astype(int) fail loudly rather than slipping through as a string.
# The encoding is derived from data["quality"] (the column y_data was taken from),
# so re-running this cell gives the same result.
y_data = data["quality"].map({"good": 1, "bad": 0}).astype(int)

In [20]:
# hold out 30% of the rows as test set; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(datafeatures, y_data, test_size=0.30, random_state=42)

In [21]:
# NumPy copies of the four splits (the *_1 arrays are what the scaler and the first tree use)
X_train_1 = np.array(X_train)
X_test_1 = np.array(X_test)
y_train_1 =np.array(y_train)
y_test_1 = np.array(y_test)

In [22]:
from sklearn import preprocessing
# copy=False asks the scaler to standardize its input in place where possible,
# so X_train_1 / X_test_1 may themselves end up holding the scaled values.
scaler = StandardScaler(copy=False)
# learn mean and standard deviation on the training split only ...
X_train = scaler.fit_transform(X_train_1)
# ... and apply those same statistics to the test split. Re-fitting on the test
# data (fit_transform) would scale the two splits inconsistently and leak
# test-set statistics into the preprocessing.
X_test = scaler.transform(X_test_1)


In [23]:
## fit a decision tree on the training split; it is only used to rank the features by importance
## (X_train_1 is the array the scaler was fitted on with copy=False, so it may already hold standardized values)
clfDT = DecisionTreeClassifier(random_state=12)
clfDT = clfDT.fit(X_train_1, y_train_1)

#### Feature Importance

In [24]:
# pair every importance score of the fitted tree with its feature name and print them
feat_importancesDT = pd.Series(clfDT.feature_importances_, index = datafeatures.columns)
print (feat_importancesDT)

fixed acidity           0.065188
volatile acidity        0.119451
citric acid             0.048375
residual sugar          0.056745
chlorides               0.084038
free sulfur dioxide     0.036976
total sulfur dioxide    0.095882
density                 0.022758
pH                      0.063453
sulphates               0.124692
alcohol                 0.282442
dtype: float64


In [25]:
## alcohol is the most important feature, so its median is used as the split point
alcohol_median = data["alcohol"].median()
print(alcohol_median)

10.2


In [26]:
## split the rows at the alcohol median: values <= median go to list1, everything else to list2
low_alcohol = data["alcohol"] <= alcohol_median
# each list holds one Series per row, in the original row order
list1 = [row for _, row in data[low_alcohol].iterrows()]
list2 = [row for _, row in data[~low_alcohol].iterrows()]
    


In [27]:
# group sizes: list1 holds the rows with alcohol <= median (ties included), list2 the remaining rows
print(len(list1),"= total numbers of less than median")
print(len(list2),"= total numbers of more than median")

842 = total numbers of less than median
757 = total numbers of more than median


In [28]:
# rebuild one DataFrame per group from the collected row Series
less_than_median_data = pd.DataFrame(list1)
more_than_median_data = pd.DataFrame(list2)

In [29]:
# target of the lower-alcohol group
y_data_list1 = less_than_median_data["quality"]
# features of the lower-alcohol group (every column except the target)
datafeatures1 = less_than_median_data.drop(columns=["quality"])
# 70/30 train/test split with a fixed seed
X_train_list1, X_test_list1, y_train_list1, y_test_list1 = train_test_split(
    datafeatures1,
    y_data_list1,
    test_size=0.30,
    random_state=42,
)
datafeatures1

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1572,7.3,0.690,0.32,2.2,0.069,35.0,104.0,0.99632,3.33,0.51,9.5
1583,6.2,0.460,0.29,2.1,0.074,32.0,98.0,0.99578,3.33,0.62,9.8
1589,6.6,0.725,0.20,7.8,0.073,29.0,79.0,0.99770,3.29,0.54,9.2
1593,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5


In [30]:
# target of the higher-alcohol group
y_data_list2 = more_than_median_data["quality"]
# features of the higher-alcohol group (every column except the target)
datafeatures2 = more_than_median_data.drop(columns=["quality"])
# 70/30 train/test split with a fixed seed
X_train_list2, X_test_list2, y_train_list2, y_test_list2 = train_test_split(
    datafeatures2,
    y_data_list2,
    test_size=0.30,
    random_state=42,
)
datafeatures2

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
9,7.5,0.500,0.36,6.1,0.071,17.0,102.0,0.99780,3.35,0.80,10.5
11,7.5,0.500,0.36,6.1,0.071,17.0,102.0,0.99780,3.35,0.80,10.5
16,8.5,0.280,0.56,1.8,0.092,35.0,103.0,0.99690,3.30,0.75,10.5
31,6.9,0.685,0.00,2.5,0.105,22.0,37.0,0.99660,3.46,0.57,10.6
36,7.8,0.600,0.14,2.4,0.086,3.0,15.0,0.99750,3.42,0.60,10.8
...,...,...,...,...,...,...,...,...,...,...,...
1592,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0


### Get rules of Decision Tree

In [31]:
# turns every root-to-leaf path of a fitted decision tree into one IF-THEN sentence
def get_rules(tree, feature_names, class_names, target_label):
    """Return the tree's decision paths as a list of "RULE n: IF ... THEN ..." strings.

    tree: fitted estimator exposing a `tree_` attribute
    feature_names: sequence indexed by the feature ids stored in the tree
    class_names: names indexed by class position, or None to emit
        "response: <value>" (first entry of the leaf value, rounded to 3 decimals)
    target_label: text placed before " = <class name>" in the conclusion

    Rules are ordered from the leaf with the most samples to the one with the
    fewest; thresholds are rounded to 3 decimals.
    """
    fitted = tree.tree_
    node_feature_names = [
        "undefined!" if idx == _tree.TREE_UNDEFINED else feature_names[idx]
        for idx in fitted.feature
    ]

    leaf_paths = []

    def walk(node_id, conditions):
        if fitted.feature[node_id] == _tree.TREE_UNDEFINED:
            # leaf: close the path with (class values, sample count)
            conditions += [(fitted.value[node_id], fitted.n_node_samples[node_id])]
            leaf_paths.append(conditions)
            return
        feat = node_feature_names[node_id]
        cut = np.round(fitted.threshold[node_id], 3)
        # each branch gets its own copy of the conditions collected so far
        walk(fitted.children_left[node_id], list(conditions) + [f"{feat} <= {cut}"])
        walk(fitted.children_right[node_id], list(conditions) + [f"{feat} > {cut}"])

    walk(0, [])

    # largest leaves first
    ascending = list(np.argsort([p[-1][1] for p in leaf_paths]))
    leaf_paths = [leaf_paths[pos] for pos in ascending[::-1]]

    rules = []
    for number, leaf_path in enumerate(leaf_paths, start=1):
        *conditions, (leaf_value, _) = leaf_path
        premise = " AND ".join(str(cond) for cond in conditions)
        rule = "RULE {}".format(number) + ": IF " + premise + " THEN "
        if class_names is None:
            rule += "response: " + str(np.round(leaf_value[0][0], 3))
        else:
            winner = np.argmax(leaf_value[0])
            rule += f"{target_label} = {class_names[winner]}"
        rules.append(rule)

    return rules

In [34]:
## fit a decision tree on the training split of the lower group (list1: alcohol <= median)
clfDT_lower_value = DecisionTreeClassifier(random_state=10)
clfDT_lower_value = clfDT_lower_value.fit(X_train_list1, y_train_list1)

In [35]:
# file the rules of the lower-alcohol tree are saved to (one rule per line)
filename_rulesDT = "DT_rules_lower_value.csv"
rulesDT = get_rules(clfDT_lower_value, datafeatures.columns, ['"0"','"1"'], "quality")
# Open the file once, in write mode: append mode would add a second copy of
# every rule each time this cell is re-run, and the duplicates would then be
# read back as extra rules by the similarity computation.
with open(filename_rulesDT, "w") as rulefile:
    for r in rulesDT:
        rulefile.write(r+"\n")
        print(r)

RULE 1: IF volatile acidity > 0.545 AND sulphates <= 0.535 AND pH <= 3.545 AND residual sugar <= 3.95 AND chlorides > 0.069 AND volatile acidity > 0.585 THEN quality = "0"
RULE 2: IF volatile acidity > 0.545 AND sulphates > 0.535 AND total sulfur dioxide > 46.0 AND fixed acidity <= 9.95 AND sulphates <= 1.32 AND sulphates > 0.585 AND alcohol > 9.15 AND chlorides > 0.08 THEN quality = "0"
RULE 3: IF volatile acidity <= 0.545 AND sulphates <= 0.585 AND alcohol <= 9.75 AND chlorides > 0.08 AND residual sugar > 1.85 THEN quality = "0"
RULE 4: IF volatile acidity <= 0.545 AND sulphates > 0.585 AND total sulfur dioxide <= 62.0 AND volatile acidity <= 0.315 AND density <= 1.0 AND total sulfur dioxide > 17.5 THEN quality = "1"
RULE 5: IF volatile acidity > 0.545 AND sulphates > 0.535 AND total sulfur dioxide <= 46.0 AND fixed acidity > 6.95 AND sulphates <= 0.575 AND volatile acidity <= 0.975 AND total sulfur dioxide > 19.0 AND total sulfur dioxide <= 41.5 THEN quality = "0"
RULE 6: IF volatil

### Train the second Decision Tree on the subset with alcohol above the median

In [36]:
## fit a decision tree on the training split of the higher group (list2: rows not in list1)
clfDT_higher_value = DecisionTreeClassifier(random_state=10)
clfDT_higher_value = clfDT_higher_value.fit(X_train_list2, y_train_list2)

In [37]:
# file the rules of the higher-alcohol tree are saved to (one rule per line)
filename_rulesDT = "DT_rules_higher_value.csv"
rulesDT = get_rules(clfDT_higher_value, datafeatures.columns, ['"0"','"1"'], "quality")
# Open the file once, in write mode: append mode would add a second copy of
# every rule each time this cell is re-run, and the duplicates would then be
# read back as extra rules by the similarity computation.
with open(filename_rulesDT, "w") as rulefile:
    for r in rulesDT:
        rulefile.write(r+"\n")
        print(r)

RULE 1: IF alcohol > 11.45 AND volatile acidity <= 0.515 AND citric acid <= 0.675 AND pH <= 3.395 THEN quality = "1"
RULE 2: IF alcohol <= 11.45 AND sulphates > 0.585 AND total sulfur dioxide <= 77.5 AND sulphates > 0.745 AND volatile acidity > 0.2 AND sulphates <= 1.155 AND total sulfur dioxide <= 56.0 THEN quality = "1"
RULE 3: IF alcohol <= 11.45 AND sulphates > 0.585 AND total sulfur dioxide <= 77.5 AND sulphates <= 0.745 AND alcohol > 10.525 AND chlorides <= 0.098 AND total sulfur dioxide <= 64.0 AND fixed acidity <= 12.95 AND residual sugar > 1.95 AND volatile acidity <= 0.985 AND free sulfur dioxide <= 27.5 THEN quality = "1"
RULE 4: IF alcohol > 11.45 AND volatile acidity > 0.515 AND sulphates > 0.565 AND volatile acidity <= 0.95 AND citric acid <= 0.75 AND free sulfur dioxide > 4.0 AND pH <= 3.565 THEN quality = "1"
RULE 5: IF alcohol <= 11.45 AND sulphates <= 0.585 AND free sulfur dioxide <= 6.5 AND volatile acidity > 0.385 AND density <= 0.999 THEN quality = "0"
RULE 6: IF a

### CONFIG

In [38]:
import os

# input parameters for computing the Bag-of-Words similarity between the two rulesets
# INPUT RULESETS
# paths of the rulesets; each file holds one IF-THEN rule per line, without covering/error statistics
RULEFILE1 = 'DT_rules_higher_value.csv'
RULEFILE2 = 'DT_rules_lower_value.csv'
# names identifying the rulesets (same order as RULEFILE1, RULEFILE2)
RULESET_IDLIST = ['Alcohol High', 'Alcohol Low']
# dict associating the class labels as they appear in the rules with more self-explanatory names;
# if no renaming is needed, set key = value for that class
# NOTE: a python dict is structured as {key:value}
CLASS_LABELS_DICT = {"1":"good", "0":"bad"} 

# CONFIGS TO SAVE THE RESULTS
# folder where the output files are saved (created if missing)
RESULTS_DIR = "results-BoW/"
os.makedirs(RESULTS_DIR, exist_ok=True)

# The output file names below are always defined, whatever the SAVE_* flags say:
# the functions that write them only do so when asked to save, but they default
# to save_res=True and would hit a NameError if a name existed only when its
# flag was True.

# set to True to save the BoW matrix to an excel file (False here: this matrix is an intermediate result)
SAVE_BOW = False
# file where the BoW matrix is saved (if saved)
outputBoW = RESULTS_DIR+"BagOfWords.xlsx"

# set to True to save the BoW similarity matrix (all values for all pairs of rules) to an excel file
SAVE_BOWSIM = True
# file where the similarity matrix is saved (if saved)
outputRuleSim = RESULTS_DIR+"BoWSimilarity.xlsx"

# CONFIGS FOR THE AGGREGATED RULE SIMILARITY VALUES
# set to True to save the intermediate results of the rule-similarity aggregation
SAVE_INTERMEDIATE = False
# base file name of the intermediate results (prefixed with 'same_' / 'diff_' when written)
outfile = "bowMean_"+RULESET_IDLIST[0]+RULESET_IDLIST[1]+".xlsx"


### BOW SIMILARITY

In [39]:
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import os
import itertools as itr
# from config import*

def ProcessRulesets(Ruleset1, Ruleset2, class_labels_dict):
    """Split raw IF-THEN rule strings into premise and class, then stack both rulesets.

    Each input frame must hold the raw rule text in the column labelled 0.
    Both frames are modified in place: a 'Rule' column (the text between "IF "
    and " THEN") and a 'Class' column (the text between the first pair of
    double quotes) are added, and the first column is dropped.

    Returns the row-wise concatenation of the two frames, re-indexed from 0,
    with every 'Class' value translated through class_labels_dict.
    """
    def premise_of(row):
        text = row[0]
        start = text.find('IF') + 3
        stop = text.find('THEN') - 1
        return text[start:stop]

    def quoted_label_of(row):
        return row[0].split('"')[1]

    # same three steps for both rulesets, first ruleset first
    for ruleset in (Ruleset1, Ruleset2):
        ruleset['Rule'] = ruleset.apply(premise_of, axis=1)
        ruleset['Class'] = ruleset.apply(quoted_label_of, axis=1)
        ruleset.drop(ruleset.columns[0], axis=1, inplace=True)

    Ruleset_Tot = pd.concat([Ruleset1, Ruleset2], ignore_index=True)
    Ruleset_Tot['Class'] = Ruleset_Tot['Class'].apply(lambda label: class_labels_dict[label])
    return Ruleset_Tot

def ImportAndProcessRulesets(rulefile1, rulefile2, rulesetIDlist, class_labels_dict):
    """ Import two IF-THEN rulesets, in csv format, and merge them;
    inputs:
    rulefile1, rulefile2: paths of the ruleset files (read without a header row)
    rulesetIDlist: 2-elements list with rulesets IDs (will fill 'Set' column)
    class_labels_dict: forwarded to ProcessRulesets to rename the class labels
    
    output:
    the Dataframe returned by ProcessRulesets for the two tagged rulesets
    """ 
    tagged = []
    for position, path in enumerate((rulefile1, rulefile2)):
        # read the raw rules, then tag every row with its ruleset ID
        frame = pd.read_csv(path, header=None)
        frame['Set'] = rulesetIDlist[position]
        tagged.append(frame)

    # separate premise and consequence
    return ProcessRulesets(tagged[0], tagged[1], class_labels_dict)


def UniqueConditionOccurrences(Ruleset):
    """Count the occurrences of every "feature + operator" term in the 'Rule' column.

    Each premise is split on ' AND '; a condition is reduced to the text up to
    and including the character after its operator, with blanks removed
    (e.g. 'pH <= 3.5' -> 'pH<='). Returns a Counter keyed by those terms, in
    order of first appearance.
    """
    def term_of(condition):
        # '>' is used when it occurs past position 0, otherwise '<' is looked up
        symbol = '>' if condition.find('>') > 0 else '<'
        return condition[:condition.find(symbol) + 2].replace(" ", "")

    return Counter(
        term_of(condition)
        for _, premise in Ruleset.loc[:, 'Rule'].items()
        for condition in premise.split(' AND ')
    )

def CreateFSandTcolumns(Ruleset, counter):
    """Fill, for every term in counter, its presence flag and raw threshold per rule.

    For each rule whose premise contains a condition matching term c
    (feature + operator), column c is set to 1 and column c+'Value' to the
    threshold of that condition; when the same term occurs several times in a
    rule, the last occurrence wins. Ruleset is modified in place and returned.
    """
    def term(condition, symbol):
        # text up to and including the character after the operator, blanks removed
        return condition[:condition.find(symbol) + 2].replace(" ", "")

    # iterate over the unique feature-plus-operator terms
    for fs in counter:
        # row is the index label of the rule, premise the rule text
        for row, premise in Ruleset.loc[:, 'Rule'].items():
            for condition in premise.split(' AND '):
                # the condition matches fs through a '>' operator
                if term(condition, '>') == fs:
                    Ruleset.loc[row, fs] = 1
                    Ruleset.loc[row, fs + 'Value'] = float(condition.split('>')[1].strip())
                # the condition matches fs through a '<=' operator
                if term(condition, '<') == fs:
                    Ruleset.loc[row, fs] = 1
                    Ruleset.loc[row, fs + 'Value'] = float(condition.split('<=')[1].strip())
    return Ruleset

def NormalizeThresholds(Ruleset, counter):
    """Min-max normalize the thresholds of every term and drop the raw values.

    For each term c in counter, over the rules where column c equals 1:
    c+'ValueNorm' = (c+'Value' - min) / (max - min), or 1 when max == min.
    Rules that do not contain c get c+'ValueNorm' = 0; this is written to the
    normalized column itself, so the result holds no NaN even when that column
    did not exist beforehand. The c+'Value' columns are deleted at the end.

    Ruleset is modified in place and returned.
    """
    for c in counter:
        value_col = c + 'Value'
        norm_col = c + 'ValueNorm'

        # rules that contain condition c, and their thresholds
        present = Ruleset[c] == 1
        thresholds = Ruleset.loc[present, value_col]
        lowest = thresholds.min()
        span = thresholds.max() - lowest

        # default for rules without c; rules with c are overwritten below
        Ruleset[norm_col] = 0.0
        if span == 0:
            # max == min: every present threshold normalizes to 1
            Ruleset.loc[present, norm_col] = 1.0
        else:
            # positional assignment: the values are already in row order of `present`
            Ruleset.loc[present, norm_col] = ((thresholds - lowest) / span).to_numpy()

    # once completed, delete the non-normalized values
    for c in counter:
        del Ruleset[c + 'Value']
    return Ruleset

def buildBOW(Ruleset, save_res = True):
    """Build the Bag-of-Words matrix of a processed ruleset.

    Adds to Ruleset (in place), for every unique feature-plus-operator term,
    a presence column and a normalized-threshold column, then returns all
    columns from the fourth one on as a numpy array (the first three are
    expected to be "Set", "Rule" and "Class").

    When save_res is True the enriched frame is also written to the excel file
    named by the module-level `outputBoW`.
    """
    # unique set of rule conditions (no repetitions)
    counter = UniqueConditionOccurrences(Ruleset)

    # 1. initialize, per term: presence flag, raw threshold, normalized threshold
    for term in counter:
        Ruleset.loc[:, term] = 0
        Ruleset[term + 'Value'] = 0.0
        Ruleset[term + 'ValueNorm'] = 0.0

    # 2. fill flags and raw thresholds, 3. normalize the thresholds
    Ruleset = NormalizeThresholds(CreateFSandTcolumns(Ruleset, counter), counter)

    # 4. optionally save to excel
    if save_res:
        Ruleset.to_excel(outputBoW, index=True)

    # 5. everything after the three descriptive columns is the BoW matrix
    return Ruleset.iloc[:, 3:].to_numpy()


def BoW_Similarity(bow_matrix, Ruleset, save_res=True):  
    """Return the pairwise cosine similarity between all rows of bow_matrix.

    The result is a square DataFrame whose rows and columns are labelled
    "<Set> ~ <Class> ~ <Rule>" using the matching columns of Ruleset. When
    save_res is True it is also written to the excel file named by the
    module-level `outputRuleSim`.
    """
    labels = Ruleset['Set'] + ' ~ ' + Ruleset['Class'] + ' ~ ' + Ruleset['Rule']
    similarities = cosine_similarity(bow_matrix, bow_matrix)
    RuleSimilarities = pd.DataFrame(similarities, index=labels, columns=labels)
    if save_res:
        RuleSimilarities.to_excel(outputRuleSim, index=True)
    return RuleSimilarities




### BOW UTILS

In [40]:
# helper: does the text form of s begin with the string n?
def startswith(s, n): 
    """Return True when str(s), cut to the length of n, equals n."""
    head = str(s)[0:len(n)]
    return head == n


### BOW AGGREGATION

In [41]:

def GetIDandClassFromBoW(rulesim):
    """Collect the distinct ruleset IDs and class labels from the column names.

    Every column name must split on "~" into exactly three parts
    (ID, class, premise). The parts are kept as they are, surrounding blanks
    included. Returns (set of IDs, set of class labels).
    """
    rulesetid, classlabels = [], []
    for column_name in rulesim.columns:
        ruleset_id, out_class, _premise = column_name.split("~")
        rulesetid.append(ruleset_id)
        classlabels.append(out_class)
    return set(rulesetid), set(classlabels)

def MeanSimSameIDandClass(rulesim,uniqueIDs,uniqueclasslabels):
    """Mean similarity among the rules sharing the same ruleset ID and class.

    For every (ID, class) pair, the rules whose name starts with
    ID + '~' + class are selected, the column means of their similarity block
    are taken and then averaged. Returns a dict mapping "<ID>-<class>" to a
    one-element list holding that mean.
    """
    names = list(rulesim.columns)
    meanSame = {}
    for idx, out in itr.product(uniqueIDs, uniqueclasslabels):
        prefix = idx + '~' + out
        # rules with the same id and the same class
        members = [name for name in names if startswith(name, prefix)]
        column_means = rulesim.loc[members, members].mean(axis=0)
        meanSame[str(idx) + '-' + str(out)] = [np.mean(list(column_means.values))]
    return meanSame

def MeanSimDiffIDorClass(rulesim,uniqueIDs, uniqueclasslabels):
    """Mean similarity between every two distinct (ruleset ID, class) groups.

    For each unordered pair of groups, the similarity block with the first
    group's rules as rows and the second group's rules as columns is reduced
    to its column means, which are then averaged. Returns a dict mapping
    "<ID1> <class1>-<ID2> <class2>" to a one-element list holding that mean.
    """
    names = list(rulesim.columns)
    groups = itr.product(uniqueIDs, uniqueclasslabels)
    meanDiff = {}
    # each element is a combination ((id, cls), (id, cls))
    for (idx1, out1), (idx2, out2) in itr.combinations(groups, 2):
        rows = [name for name in names if startswith(name, idx1 + '~' + out1)]
        cols = [name for name in names if startswith(name, idx2 + '~' + out2)]
        # mean of the off-diagonal block between the two groups
        column_means = rulesim.loc[rows, cols].mean(axis=0)
        key = str(idx1)+' '+str(out1)+"-"+str(idx2)+' '+str(out2)
        meanDiff[key] = [np.mean(list(column_means.values))]
    return meanDiff

def MeanSimDiffIDSameClass(rulesim,uniqueIDs, uniqueclasslabels):
    """Mean similarity between the two rulesets, computed separately per class.

    Only the first two IDs of uniqueIDs are compared. For each class, the block
    with the first ID's rules as rows and the second ID's rules as columns is
    reduced to its column means, which are then averaged. Returns a list with
    one single-entry dict per class: {"<ID1> <class>-<ID2> <class>": mean}.
    """
    names = list(rulesim.columns)
    meanDiff_outlist = []
    # fix an output class
    for out in uniqueclasslabels:
        # the two rulesets being compared
        ids = list(uniqueIDs)
        idx1, idx2 = ids[0], ids[1]
        # rules of each ruleset that predict this class
        rows = [name for name in names if startswith(name, idx1 + '~' + out)]
        cols = [name for name in names if startswith(name, idx2 + '~' + out)]
        # mean of the cross-ruleset block for this class
        column_means = rulesim.loc[rows, cols].mean(axis=0)
        label = str(idx1)+' '+str(out)+"-"+str(idx2)+' '+str(out)
        # list of dicts; 1 dict for each output class
        meanDiff_outlist.append({label: np.mean(list(column_means.values))})
    return meanDiff_outlist

def GetMeanRulesetSim(meanDiff_out):
    """Average every value stored in a list of dicts (one dict per output class)."""
    per_class_means = [
        mean
        for class_dict in meanDiff_out
        for mean in class_dict.values()
    ]
    return np.mean(per_class_means)




def Aggregate_BoW(rulesim, base_dir, save_res=True):
    """Aggregate the rule-by-rule similarity matrix into one ruleset similarity.

    rulesim: square similarity DataFrame whose labels are "<ID>~<class>~<premise>"
    base_dir: prefix of the paths the intermediate results are written to
    save_res: when True, the same-group and different-group means are saved as
        base_dir + 'same_' / 'diff_' + the module-level `outfile`

    Returns the mean, over the output classes, of the mean similarity between
    the rules of the two rulesets that predict the same class.
    """
    # the column names (equal to the row names) carry the IDs and class labels
    uniqueIDs, uniqueclasslabels = GetIDandClassFromBoW(rulesim)

    # mean similarity within a fixed ruleset AND a fixed output class
    meanSame = MeanSimSameIDandClass(rulesim, uniqueIDs, uniqueclasslabels)
    # mean similarity between groups differing in ruleset OR class
    meanDiff = MeanSimDiffIDorClass(rulesim, uniqueIDs, uniqueclasslabels)

    # convert to dataframe; TODO: decide how these two should be merged
    meanSameDf = pd.DataFrame.from_dict(meanSame)
    meanDiffDf = pd.DataFrame.from_dict(meanDiff)

    if save_res:
        meanSameDf.to_excel(base_dir+'same_'+outfile,index=False)
        meanDiffDf.to_excel(base_dir+'diff_'+outfile,index=False)

    # one mean per output class, each comparing the two rulesets on that class ...
    meanDiff_out = MeanSimDiffIDSameClass(rulesim, uniqueIDs, uniqueclasslabels)
    # ... and the mean of those means is the final value
    return GetMeanRulesetSim(meanDiff_out)

### BOWTEST

In [42]:
# import and pre-process the rulesets before building the BoW matrix
Ruleset = ImportAndProcessRulesets(RULEFILE1, RULEFILE2, RULESET_IDLIST, CLASS_LABELS_DICT)
# compute the BoW matrix (buildBOW also adds the BoW columns to Ruleset itself)
BoW_matrix = buildBOW(Ruleset, save_res = SAVE_BOW)
# compute the rule-by-rule similarity matrix
rulesimilarity = BoW_Similarity(BoW_matrix, Ruleset, save_res= SAVE_BOWSIM)

# aggregate the rule similarities into a single ruleset similarity and report it

ruleset_similarity = Aggregate_BoW(rulesimilarity,RESULTS_DIR, save_res = SAVE_INTERMEDIATE)
print("The Ruleset Similarity between {} and {} is {}".format(RULESET_IDLIST[0],RULESET_IDLIST[1],ruleset_similarity))


The Ruleset Similarity between Alcohol High and Alcohol Low is 0.3157292029192557
