In [1]:
# imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import itertools
import math 
import statistics

In [2]:
# Read the cars table from disk, using the first CSV column as the row index.
cars = pd.read_csv(
    "data/cars/cars.csv",
    index_col=0,
)

In [3]:
# Keep only rows whose horsepower is not the literal string "?" (presumably a
# missing-value marker -- confirm against the data source). The .copy() detaches
# the filtered frame from the original so later column assignments on it do not
# raise pandas' SettingWithCopy warning.
cars = cars[cars.horsepower != "?"].copy()

# FS1

In [4]:
def isfloat(num):
    """Return True when ``float(num)`` succeeds, False otherwise.

    Args:
        num: Any object (typically a string or number read from a table).

    Returns:
        bool: True if ``num`` can be converted with ``float``. Note that
        strings such as ``"nan"`` or ``"inf"`` convert successfully and
        therefore yield True.
    """
    try:
        float(num)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None, lists, ...; ValueError: non-numeric text;
        # OverflowError: integers too large for a float.
        return False
    return True

def calculate_FS1_for_each_column(df, group_label):
    """Average pairwise quadratic-form distance between per-group column means.

    Processing steps:

    1. Every column whose first value (by position) passes ``isfloat`` is cast
       to float; the remaining object-dtype columns are dropped. The group
       column itself is always kept.
    2. Rows are split by the distinct values of ``group_label``. Inside each
       group every feature column is min-max scaled independently and its
       mean is taken.
    3. For each pair of groups, ``d`` is the absolute difference of the two
       mean vectors and the distance is ``sqrt(d @ W @ d)`` with
       ``W[i, j] = 1 - |i - j| / (D - 1)`` for ``D`` feature columns
       (``W = [[1]]`` when ``D == 1``).
    4. The pair distances are averaged with ``statistics.mean``.

    NOTE(review): scaling is fitted per group, and ``W`` depends on the
    positional distance between columns, so column order influences the
    result -- confirm both are intended.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.
        group_label: Name of the column whose distinct values define the groups.

    Returns:
        float: Mean of the pairwise distances.

    Raises:
        statistics.StatisticsError: If ``df`` has fewer than two groups.
        ValueError: If a column's first value parses as float but a later
            value does not (raised by ``astype``).

    Side effects:
        Prints the group label and its number of unique values.
    """
    # Work on a private copy so the float casts never leak into the caller's frame.
    df = df.copy()

    # Inspect the first row by position (.iloc); a label lookup (df[c][0]) fails
    # whenever the index has no label 0.
    numeric_like = [c for c in df.columns if isfloat(df[c].iloc[0])]
    for c in numeric_like:
        df[c] = df[c].astype(float)

    # Object columns cannot be scaled; the group column must survive the drop.
    object_columns = [
        c for c in df.select_dtypes(include=['object']).columns if c != group_label
    ]

    print(f"Group label is: {group_label}")
    print(f"Group label has {len(df[group_label].unique())} unique values")
    df = df.drop(labels=object_columns, axis=1)
    feature_columns = [c for c in df.columns if c != group_label]

    # One vector of column means per group, in order of first appearance.
    # A list (instead of a dict keyed by int(label)) keeps groups with
    # non-integer or non-numeric labels distinct.
    group_means = []
    for value in df[group_label].unique():
        features = df.loc[df[group_label] == value, feature_columns]
        scaled = pd.DataFrame(
            MinMaxScaler().fit_transform(features), columns=feature_columns
        )
        group_means.append(np.array([scaled[c].mean() for c in feature_columns]))

    # Absolute mean difference for every unordered pair of groups.
    diffs = [np.abs(a - b) for a, b in itertools.combinations(group_means, 2)]

    # Similarity matrix W: 1 on the diagonal, falling linearly to 0 with the
    # positional distance between columns.
    D = len(feature_columns)
    if D > 1:
        positions = np.arange(D)
        W = 1 - np.abs(positions[:, None] - positions[None, :]) / (D - 1)
    else:
        # A single feature is only compared with itself; avoids dividing by D - 1 == 0.
        W = np.ones((D, D))

    # Quadratic-form distance per pair, then the plain average over pairs.
    QFD = [math.sqrt(np.dot(np.dot(d, W), d)) for d in diffs]

    return statistics.mean(QFD)

In [5]:
# Compute the FS1 score for the cars table grouped by "origin" and show it.
fs1_score = calculate_FS1_for_each_column(cars, "origin")
print(fs1_score)

Group label is: origin
Group label has 3 unique values
0.6268655640398301


# FS2

In [6]:
# FS2 function

def calculate_FS2_for_each_column(df, group_label):
    """Average quadratic-form distance between per-group (mean - std, mean, mean + std) profiles.

    Processing steps:

    1. Every column whose first value (by position) passes ``isfloat`` is cast
       to float; the remaining object-dtype columns are dropped. The group
       column itself is always kept.
    2. Rows are split by the distinct values of ``group_label``. Inside each
       group every feature column is min-max scaled independently, and a
       ``3 x D`` profile is built with rows ``mean - std``, ``mean`` and
       ``mean + std`` (``D`` = number of feature columns, ``std`` is the
       pandas sample standard deviation).
    3. For each pair of groups, ``d`` is the absolute difference of the two
       profiles and ``sqrt(d @ W @ d.T)`` is evaluated element-wise, giving a
       ``3 x 3`` matrix, with ``W[i, j] = 1 - |i - j| / (D - 1)``
       (``W = [[1]]`` when ``D == 1``).
    4. The matrices are averaged over pairs, then over all nine cells.

    NOTE(review): the nine averaged cells include the off-diagonal terms that
    mix different profile rows (e.g. ``mean - std`` against ``mean``) --
    confirm that including them is intended. Scaling is fitted per group and
    ``W`` depends on column order.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.
        group_label: Name of the column whose distinct values define the groups.

    Returns:
        float (NumPy scalar): The averaged distance.

    Raises:
        ZeroDivisionError: If ``df`` has fewer than two groups.
        ValueError: If a column's first value parses as float but a later
            value does not (raised by ``astype``).

    Side effects:
        Prints the group label and its number of unique values.
    """
    # Work on a private copy so the float casts never leak into the caller's frame.
    df = df.copy()

    # Inspect the first row by position (.iloc); a label lookup (df[c][0]) fails
    # whenever the index has no label 0.
    numeric_like = [c for c in df.columns if isfloat(df[c].iloc[0])]
    for c in numeric_like:
        df[c] = df[c].astype(float)

    # Object columns cannot be scaled; the group column must survive the drop.
    object_columns = [
        c for c in df.select_dtypes(include=['object']).columns if c != group_label
    ]

    print(f"Group label is: {group_label}")
    print(f"Group label has {len(df[group_label].unique())} unique values")
    df = df.drop(labels=object_columns, axis=1)
    feature_columns = [c for c in df.columns if c != group_label]

    # One 3 x D profile per group, in order of first appearance. A list
    # (instead of a dict keyed by int(label)) keeps groups with non-integer
    # or non-numeric labels distinct.
    group_profiles = []
    for value in df[group_label].unique():
        features = df.loc[df[group_label] == value, feature_columns]
        scaled = pd.DataFrame(
            MinMaxScaler().fit_transform(features), columns=feature_columns
        )
        means = np.array([scaled[c].mean() for c in feature_columns])
        stds = np.array([scaled[c].std() for c in feature_columns])
        group_profiles.append(np.array([means - stds, means, means + stds]))

    # Absolute profile difference for every unordered pair of groups.
    diffs = [np.abs(a - b) for a, b in itertools.combinations(group_profiles, 2)]

    # Similarity matrix W: 1 on the diagonal, falling linearly to 0 with the
    # positional distance between columns.
    D = len(feature_columns)
    if D > 1:
        positions = np.arange(D)
        W = 1 - np.abs(positions[:, None] - positions[None, :]) / (D - 1)
    else:
        # A single feature is only compared with itself; avoids dividing by D - 1 == 0.
        W = np.ones((D, D))

    # One 3 x 3 matrix per pair of groups.
    QFD = [np.sqrt(np.dot(np.dot(d, W), np.transpose(d))) for d in diffs]

    # Average over pairs (element-wise), then over rows, then over the
    # remaining three values. The summation order is kept as-is so results
    # stay bit-identical.
    TDS = sum(QFD) / len(QFD)
    TDS = sum(TDS) / len(TDS)
    TDS = sum(TDS) / len(TDS)

    return TDS

In [7]:
# Compute the FS2 score for the cars table grouped by "origin" and show it.
fs2_score = calculate_FS2_for_each_column(cars, "origin")
print(fs2_score)

Group label is: origin
Group label has 3 unique values
0.625254807610549


# FS3 

In [8]:
# FS3 function

def calc_hist(numbers, bins=5):
    """Count how many values fall into each of ``bins`` equal-width bins on [0, 1].

    Bin ``i`` covers ``[i / bins, (i + 1) / bins)``; the last bin also
    includes its right edge ``1.0``. Values outside ``[0, 1]`` and NaN are
    not counted in any bin.

    Args:
        numbers: Iterable of numeric values (list, array, pandas Series, ...).
        bins: Number of equal-width bins. The default of 5 gives the edges
            0, 0.2, 0.4, 0.6, 0.8, 1.0.

    Returns:
        numpy.ndarray: Integer array of length ``bins`` with the per-bin counts.

    Raises:
        ValueError: If ``bins`` is smaller than 1.
    """
    if bins < 1:
        raise ValueError("bins must be at least 1")

    # Explicit edges i / bins reproduce the literals 0.2, 0.4, ... exactly;
    # letting numpy derive them (e.g. via linspace) could shift an edge by one ulp.
    edges = [i / bins for i in range(bins + 1)]

    # With explicit edges np.histogram uses half-open bins, closes the last one
    # on the right, and ignores out-of-range values.
    counts, _ = np.histogram(np.asarray(numbers, dtype=float), bins=edges)
    return counts

def calculate_FS3_for_each_column(df, group_label):
    """Average quadratic-form distance between per-group column histograms.

    Processing steps:

    1. Every column whose first value (by position) passes ``isfloat`` is cast
       to float; the remaining object-dtype columns are dropped. The group
       column itself is always kept.
    2. Rows are split by the distinct values of ``group_label``. Inside each
       group every feature column is min-max scaled independently and passed
       to ``calc_hist``, giving a ``D x B`` array of bin counts (``D`` =
       number of feature columns, ``B`` = bins returned by ``calc_hist``).
    3. For each pair of groups, ``d`` is the absolute difference of the two
       count arrays and ``sqrt(d.T @ W @ d)`` is evaluated element-wise,
       giving a ``B x B`` matrix, with ``W[i, j] = 1 - |i - j| / (D - 1)``
       (``W = [[1]]`` when ``D == 1``).
    4. The matrices are averaged over pairs, then over all cells.

    NOTE(review): the histograms are raw counts, so groups with different
    numbers of rows differ even when their distributions match; ``W`` relates
    columns (not bins) and the average includes cross-bin cells -- confirm
    these are intended.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.
        group_label: Name of the column whose distinct values define the groups.

    Returns:
        float (NumPy scalar): The averaged distance.

    Raises:
        ZeroDivisionError: If ``df`` has fewer than two groups.
        ValueError: If a column's first value parses as float but a later
            value does not (raised by ``astype``).

    Side effects:
        Prints the group label and its number of unique values.
    """
    # Work on a private copy so the float casts never leak into the caller's frame.
    df = df.copy()

    # Inspect the first row by position (.iloc); a label lookup (df[c][0]) fails
    # whenever the index has no label 0.
    numeric_like = [c for c in df.columns if isfloat(df[c].iloc[0])]
    for c in numeric_like:
        df[c] = df[c].astype(float)

    # Object columns cannot be scaled; the group column must survive the drop.
    object_columns = [
        c for c in df.select_dtypes(include=['object']).columns if c != group_label
    ]

    print(f"Group label is: {group_label}")
    print(f"Group label has {len(df[group_label].unique())} unique values")
    df = df.drop(labels=object_columns, axis=1)
    feature_columns = [c for c in df.columns if c != group_label]

    # One D x B array of bin counts per group, in order of first appearance.
    # A list (instead of a dict keyed by int(label)) keeps groups with
    # non-integer or non-numeric labels distinct.
    group_hists = []
    for value in df[group_label].unique():
        features = df.loc[df[group_label] == value, feature_columns]
        scaled = pd.DataFrame(
            MinMaxScaler().fit_transform(features), columns=feature_columns
        )
        group_hists.append(np.array([calc_hist(scaled[c]) for c in feature_columns]))

    # Absolute count difference for every unordered pair of groups.
    diffs = [np.abs(a - b) for a, b in itertools.combinations(group_hists, 2)]

    # Similarity matrix W: 1 on the diagonal, falling linearly to 0 with the
    # positional distance between columns.
    D = len(feature_columns)
    if D > 1:
        positions = np.arange(D)
        W = 1 - np.abs(positions[:, None] - positions[None, :]) / (D - 1)
    else:
        # A single feature is only compared with itself; avoids dividing by D - 1 == 0.
        W = np.ones((D, D))

    # d is D x B, so the quadratic form is d.T @ W @ d (a B x B matrix per pair).
    QFD = [np.sqrt(np.dot(np.dot(np.transpose(d), W), d)) for d in diffs]

    # Average over pairs (element-wise), then over rows, then over the
    # remaining values. The summation order is kept as-is so results stay
    # bit-identical.
    TDS = sum(QFD) / len(QFD)
    TDS = sum(TDS) / len(TDS)
    TDS = sum(TDS) / len(TDS)

    return TDS

In [9]:
# Compute the FS3 score for the cars table grouped by "origin" and show it.
fs3_score = calculate_FS3_for_each_column(cars, "origin")
print(fs3_score)

Group label is: origin
Group label has 3 unique values
150.19531329556122
