## Code to plot the dendrogram.
### This script generates the CSV file with the tree structure. The CSV is then used by the HTML file.
### The coloring of the clusters has to be set manually according to the resulting clusters; the names and structure of the clusters change when the data change.
### The code was taken from the Moral Machine project: https://osf.io/3hvt2/?view_only=4bb49492edee4a8eb1758552a362a2cf

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.font_manager as fm
from scipy.cluster import hierarchy as hch
import matplotlib.patches as mpatches



In [2]:

def vectorize(df, col_start):
    """
        Returns the standardized (z-scored) feature matrix of a DataFrame.

        The columns of ``df`` from position ``col_start`` onwards are cast to
        float and every column is centered on its mean and divided by its
        population standard deviation (``np.std`` default, ddof=0).

        A column whose values are all identical has zero standard deviation;
        it is returned as all zeros instead of NaN so that downstream
        clustering still receives finite values.

        Input:
            df: Pandas DataFrame, one row per observation
            col_start: positional index of the first feature column

        Output: N x F Numpy Array (N rows of ``df``, F feature columns)
    """
    X = df.values[:, col_start:].astype(float)

    # Normalize values column by column.
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant column is already all zeros once centered; dividing by 1
    # keeps it at zero and avoids a 0/0 division.
    std[std == 0] = 1.0

    return (X - mean) / std


def create_d3_dendogram(df, leaf_color=None, save_csv=True, col_start=1):
    """
        Returns schema for D3 Radial Dendrogram Plot

        The rows of ``df`` are clustered with Ward linkage on the standardized
        feature columns (``col_start`` onwards). Every node of the resulting
        tree becomes one output row whose ``id`` is a dot-separated path from
        the root:

            * the root is ``"0"``;
            * an internal node appends ``N - node.id`` to its parent's path,
              where ``N`` is the id of the root node;
            * a leaf appends the ``name`` value of its row in ``df``.

        Input:
            df: Pandas DataFrame with ``name`` and ``family`` columns and the
                feature columns starting at position ``col_start``
            leaf_color: optional label; when truthy the CSV is written to
                ``data/dendrogram_llm_<leaf_color>.csv``, otherwise to
                ``dendrogram_llm.csv``
            save_csv: write the CSV when True
            col_start: positional index of the first feature column

        Output: Pandas DataFrame with columns ``id``, ``family``, ``language``
        (``family`` and ``language`` are None for internal nodes)

        NOTE(review): a ``name`` containing "." would introduce extra path
        levels if the consumer splits ids on "." -- confirm against the
        HTML/JS side.
    """
    import os  # local import: only needed to create the output directory

    X = vectorize(df, col_start)
    Z = hch.linkage(X, method='ward')
    _, node_list = hch.to_tree(Z, rd=True)
    N = len(node_list) - 1

    # Map every child id to its parent id in one pass instead of rescanning
    # the whole node list for each node.
    parent_of = dict()
    for candidate in node_list:
        if not candidate.is_leaf():
            parent_of[candidate.left.id] = candidate.id
            parent_of[candidate.right.id] = candidate.id

    genealogy = dict()
    node_hierarchy = list()
    # A merged cluster always has a larger id than its children, so walking
    # the list backwards visits each parent before its children.
    for node in reversed(node_list):
        id_ = N - node.id
        is_leaf = node.is_leaf()
        # Leaf ids are the row positions of the original observations.
        row = df.iloc[node.id] if is_leaf else None

        if id_ == 0:
            hierarchy = str(id_)
        else:
            parent_path = genealogy[N - parent_of[node.id]]
            label = row["name"] if is_leaf else str(id_)
            hierarchy = parent_path + "." + label
        genealogy[id_] = hierarchy

        if is_leaf:
            node_hierarchy.append([hierarchy, row["family"], row["name"]])
        else:
            node_hierarchy.append([hierarchy, None, None])

    d3_dendo_tree_df = pd.DataFrame(node_hierarchy)
    d3_dendo_tree_df.columns = ["id", "family", "language"]

    if save_csv:
        if leaf_color:
            out_path = 'data/dendrogram_llm_{}.csv'.format(leaf_color)
            # Make sure the target directory exists so the save cannot fail
            # on a fresh checkout.
            os.makedirs(os.path.dirname(out_path), exist_ok=True)
        else:
            out_path = 'dendrogram_llm.csv'
        d3_dendo_tree_df.to_csv(out_path)

    return d3_dendo_tree_df



## read file and drop incomplete rows 

In [3]:
# Load the preference table: one 'criterion' column plus one column per language.
df = pd.read_csv("./cluster_data/model_preferences_by_lang_gpt4.csv")

# Drop every language column that contains the '---' marker in at least one cell.
mask = df.apply(lambda column: column.astype(str).str.contains('---')).any()
df = df.drop(columns=mask[mask].index)

# Transpose so that each row is a language and each column a criterion;
# the former column labels end up in a new 'lang' column.
df = (
    df.set_index('criterion')
      .T
      .reset_index()
      .rename(columns={'index': 'lang'})
)

# Cast the criterion values to float.
df.iloc[:, 1:] = df.iloc[:, 1:].astype(float)

# Remove languages whose six preference values are all exactly zero.
preference_columns = [
    'Species_Humans',
    'Age_Young',
    'Fitness_Fit',
    'Gender_Female',
    'SocialValue_High',
    'Utilitarianism_More',
]
all_zero = (df[preference_columns] == 0).all(axis=1)
df = df.loc[~all_zero].reset_index(drop=True)

## merge with family names which will give the color

In [4]:
# Language metadata; only the family, display name and language code are needed.
family = pd.read_csv("./cluster_data/lang_track_progress_family.csv")

family = family.loc[:, ['family', 'name', 'lang']]

# Shorten display names. The pattern is a raw string so that the escaped
# parentheses reach the regex engine without Python treating "\(" as an
# (invalid) string escape.
family['name'] = family['name'].str.replace(r"Modern Greek \(1453-\)", "Modern Greek", regex=True)

family['name'] = family['name'].str.replace("Traditional", "Trad", regex=True)
family['name'] = family['name'].str.replace("Simplified", "Simp", regex=True)

# Inner merge: languages without a metadata row are dropped.
df = df.merge(family, on=['lang'])

In [5]:
# Put the three identifier columns first, followed by the six preference
# columns, so the features start at positional index 3.
selected_columns = [
    'lang',
    'family',
    'name',
    'Species_Humans',
    'Age_Young',
    'Fitness_Fit',
    'Gender_Female',
    'SocialValue_High',
    'Utilitarianism_More',
]
df = df.loc[:, selected_columns]

In [6]:
# Keep the first row for every display name and renumber the rows from 0.
unique_names = df.drop_duplicates(subset='name')
df = unique_names.reset_index(drop=True)

# Persist the cleaned table (without the index column).
df.to_csv("dendogram_df_gpt.csv", index=False)

In [7]:
# Build the dendrogram table. Features start at column 3 (after lang, family,
# name); with leaf_color="family" the CSV goes to data/dendrogram_llm_family.csv.
den=create_d3_dendogram(df, leaf_color="family",col_start=3)

## Exploration of the clusters, because they are manually entered in the JavaScript plot

In [9]:
# Rough cluster key: the first 7 characters of the hierarchy id. The cut is by
# character count, not by path component, so a key can end in the middle of a
# component (e.g. "0.5.22.").
den['clus']=den.id.str[:7]

In [10]:
# Number of tree nodes (internal nodes and leaves) sharing each 7-character prefix.
den.clus.value_counts()

0.5.7.8    107
0.1.3.4     29
0.1.3.9     15
0.5.7.3     11
0.5.22.     10
0.1.2.2      5
0.1.2.1      3
0            1
0.1          1
0.1.2        1
0.1.3        1
0.5          1
0.5.7        1
0.5.22       1
Name: clus, dtype: int64

In [11]:
# Same rough cluster key, one level finer: the first 9 characters of the id.
den['clus']=den.id.str[:9]

In [12]:
# Number of tree nodes sharing each 9-character prefix.
den.clus.value_counts()

0.5.7.8.1    106
0.1.3.4.6     21
0.1.3.9.1     13
0.5.7.32.     10
0.1.3.4.1      7
0.5.22.30      7
0.1.2.23.      4
0.5.22.28      3
0.1.2.12.      2
0.1.3.9        1
0.5.7.8        1
0.1            1
0.1.2.12       1
0.5.7          1
0.5.22         1
0.1.2.23       1
0.5            1
0.1.3.4        1
0.5.7.32       1
0.1.3          1
0.1.2          1
0.1.3.9.Y      1
0              1
Name: clus, dtype: int64

In [13]:
# Finer still: the first 13 characters of the id. Short leaf paths now include
# the start of the language name in the key.
den['clus']=den.id.str[:13]

In [14]:
# Number of tree nodes sharing each 13-character prefix.
den.clus.value_counts()

0.5.7.8.15.18    63
0.5.7.8.15.47    19
0.1.3.4.6.14.    16
0.5.7.8.16.33    15
0.1.3.9.11.29     7
0.5.7.8.16.26     7
0.1.3.9.11.37     5
0.5.7.32.36.4     5
0.1.3.4.10.13     5
0.5.22.30.41.     4
0.1.3.4.6.17.     2
0.5.22.28.Zul     1
0.5.22.30.41      1
0.5.7.32.51       1
0.1.2.23.91       1
0                 1
0.5.7.32.36       1
0.1.2.23.91.T     1
0.1.2.12.Push     1
0.1.2.23.Panj     1
0.1.2.23.91.B     1
0.5.22.28.Mao     1
0.1.2.12.Kirg     1
0.5.7.32.51.C     1
0.5.7.32.51.G     1
0.5.7.32.36.H     1
0.5.22.30.Haw     1
0.1.3.9.Yiddi     1
0.5.22.30         1
0.5.7.32          1
0.1.3.4.10        1
0.1.2             1
0.1.3             1
0.1.3.4           1
0.5               1
0.1.3.4.6         1
0.5.7             1
0.5.7.8           1
0.1.3.9           1
0.1.3.9.11        1
0.1               1
0.1.2.12          1
0.1.3.4.6.14      1
0.5.7.8.15        1
0.5.7.8.16        1
0.1.3.4.6.17      1
0.5.22            1
0.1.2.23          1
0.5.22.28         1
0.1.3.4.10.Pe     1


In [20]:
# Look up the full hierarchy path and prefix key of a single language.
den.loc[den.language=='Romanian']

Unnamed: 0,id,family,language,clus
122,0.5.7.8.15.47.59.75.86.Romanian,Indo-European,Romanian,0.5.7.8.15.47
