## Code to plot the dendrogram.
### This script generates the CSV file with the tree structure. The CSV is then used by the HTML file.
### The coloring of the clusters has to be set manually according to the resulting clusters; the names and structure of the clusters change when the data change.
### To visualize the dendrogram, run "python3 -m http.server" and open the culture_human.html file.
### The code was taken from the Moral Machine project: https://osf.io/3hvt2/?view_only=4bb49492edee4a8eb1758552a362a2cf

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.font_manager as fm
from scipy.cluster import hierarchy as hch
import matplotlib.patches as mpatches


In [2]:

def vectorize(df, col_start):
    """
    Return the standardized (z-scored) feature matrix of ``df``.

    Columns from position ``col_start`` onward are taken as features,
    cast to float, and each column is centered on its mean and scaled by
    its population standard deviation (``np.std`` default, ddof=0).

    Input:
        df: Pandas DataFrame; columns at positions >= col_start must be
            convertible to float.
        col_start: positional index of the first feature column.

    Output: N x F Numpy Array (N rows of ``df``, F feature columns).
    """

    X = df.values[:, col_start:].astype(float)

    # Normalize values. A constant column has zero standard deviation;
    # dividing by it would yield NaN for every row. Scaling such a column
    # by 1 instead leaves it at exactly 0 after centering, so it simply
    # contributes nothing to distances computed on the result.
    std = np.std(X, axis=0)
    std = np.where(std == 0, 1.0, std)
    X = (X - np.mean(X, axis=0)) / std

    return X


def create_d3_dendogram(df, leaf_color=None, save_csv=True, col_start=1):
    """
    Returns schema for D3 Radial Dendrogram Plot.

    The rows of ``df`` are clustered with Ward linkage on the standardized
    feature columns, and every node of the resulting tree is turned into
    one output row whose ``id`` is the dot-separated path from the root
    (e.g. ``"0.2.6.Malagasy"``). Internal nodes are labelled with their
    renumbered node id; leaves are labelled with the row's ``name``.

    Input:
        df: Pandas DataFrame with ``name`` and ``family`` columns and the
            feature columns starting at position ``col_start``.
        leaf_color: only used as a suffix of the output CSV file name.
        save_csv: when true, the table is written under ``./cluster_data/``.
        col_start: positional index of the first feature column.

    Output: Pandas DataFrame with columns ``id``, ``family``, ``language``
        (``family``/``language`` are None for internal nodes), ordered
        from the root downwards.
    """
    X = vectorize(df, col_start)
    Z = hch.linkage(X, method='ward')
    _, node_list = hch.to_tree(Z, rd=True)

    # The root carries the largest node id, so N - id renumbers the nodes
    # with the root at 0.
    N = len(node_list) - 1

    # One pass to record each node's parent, instead of rescanning the
    # whole node list for every node.
    parent_of = {}
    for candidate in node_list:
        if not candidate.is_leaf():
            parent_of[candidate.left.id] = candidate
            parent_of[candidate.right.id] = candidate

    geneology = {}
    node_hierarchy = []

    # Walk from the highest id (the root) down: a merged cluster always
    # has a larger id than its children, so a parent's path is already in
    # `geneology` when its children are reached.
    for node in reversed(node_list):
        id_ = N - node.id
        # Leaf ids are used as positional row indices into df.
        leaf_row = df.iloc[node.id] if node.is_leaf() else None

        parent = parent_of.get(node.id)
        if parent is None:
            # Only the root has no parent.
            geneology[id_] = str(id_)
        else:
            # NOTE(review): a name containing "." would add an extra level
            # if the consumer splits the id on "." - confirm against the
            # HTML/JS side.
            label = str(id_) if leaf_row is None else leaf_row["name"]
            geneology[id_] = geneology[N - parent.id] + "." + label

        hierarchy = geneology[id_]
        if leaf_row is None:
            node_hierarchy.append([hierarchy, None, None])
        else:
            node_hierarchy.append(
                [hierarchy, leaf_row["family"], leaf_row["name"]]
            )

    d3_dendo_tree_df = pd.DataFrame(node_hierarchy)
    d3_dendo_tree_df.columns = ["id", "family", "language"]

    if save_csv:
        if leaf_color:
            d3_dendo_tree_df.to_csv('./cluster_data/dendrogram_human_{}.csv'.format(leaf_color))
        else:
            d3_dendo_tree_df.to_csv('./cluster_data/dendrogram_human.csv')

    return d3_dendo_tree_df



## Read the file and pivot it to one row per language (note: incomplete rows are not explicitly dropped in this cell)

In [3]:
# Load the long-format estimates and reshape them to one row per language
# with one column per Label; the language column is renamed to `lang`.
df = (
    pd.read_csv("./cluster_data/human_preferences_by_lang_unpivoted.csv")
    .pivot_table(values='final_est', index=['Languages'], columns=['Label'])
    .reset_index()
    .rename(columns={'Languages': 'lang'})
)

# Every column after `lang` is cast to float.
df.iloc[:, 1:] = df.iloc[:, 1:].astype(float)

## Merge with the language-family names, which will give the color

In [4]:
# Language metadata: keep only the family, the display name and the
# language code used as the merge key.
family = pd.read_csv("./cluster_data/lang_track_progress_family.csv")

family = family.loc[:, ['family', 'name', 'lang']].copy()

# Shorten a few display names. The replacements are plain substrings, so
# regex=False is used: the parentheses in "Modern Greek (1453-)" are then
# literal and need no backslash escaping. Bracket access is used for the
# `name` column rather than attribute-style assignment.
family['name'] = (
    family['name']
    .str.replace("Modern Greek (1453-)", "Modern Greek", regex=False)
    .str.replace("Traditional", "Trad", regex=False)
    .str.replace("Simplified", "Simp", regex=False)
)

# merge() defaults to an inner join, so languages present in only one of
# the two tables are dropped here.
df = df.merge(family, on=['lang'])



In [5]:
# Put the three descriptive columns first and the six preference columns
# after them, so the features start at column position 3.
df = df[[
    'lang', 'family', 'name',
    'Species', 'Age', 'Fitness', 'Gender', 'Social Status', 'No. Characters',
]]

In [6]:
# Cluster the languages on the feature columns (position 3 onward) and save
# the tree table; leaf_color="family" selects the "_family" CSV file name.
den = create_d3_dendogram(df, leaf_color="family", save_csv=True, col_start=3)

## Exploration of the clusters, because they are manually entered in the JavaScript plot

In [7]:
# Group nodes by the first 7 characters of their path id. This is a character
# cut, not a level cut: multi-digit node ids can be truncated mid-component.
den['clus'] = den['id'].str.slice(0, 7)

In [8]:
# Number of nodes sharing each id prefix.
den['clus'].value_counts()

clus
0.2.3.4    85
0.2.6.8    31
0.1.5.7    28
0.2.3.1     3
0.1.10.     2
0           1
0.1         1
0.2         1
0.2.3       1
0.1.5       1
0.2.6       1
0.1.10      1
0.2.6.M     1
Name: count, dtype: int64

In [9]:
# Same grouping one step deeper: the first 9 characters of the path id
# (again a character cut, so multi-digit ids may be split).
den['clus'] = den['id'].str.slice(0, 9)

In [10]:
# Number of nodes sharing each 9-character id prefix.
den['clus'].value_counts()

clus
0.2.3.4.9    75
0.2.6.8.1    30
0.1.5.7.1    24
0.2.3.4.2     9
0.2.3.14.     2
0.1.5.76.     2
0             1
0.1.10.Or     1
0.1.10.Sh     1
0.1.5.76      1
0.2.3.14      1
0.1.10        1
0.1           1
0.2.6.8       1
0.1.5.7       1
0.2.6         1
0.1.5         1
0.2.3.4       1
0.2.3         1
0.2           1
0.2.6.Mal     1
Name: count, dtype: int64

In [12]:
# Show the row for a single language to see where it sits in the tree.
den[den['language'] == 'Malagasy']

Unnamed: 0,id,family,language,clus
105,0.3.4.Malagasy,Austronesian,Malagasy,0.3.4.Mal
