## This notebook computes human preferences by language
### The inputs are a CSV with the percentage of speakers of each language per country together with the country's population, and another CSV with the human preferences by country

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Inputs: per-country language shares with population, and per-country
# human preference estimates.
# NOTE(review): the original author hand-edited an input file before loading
# it (presumably the proportions file -- confirm which one and what changed).
df_final = pd.read_csv("./cluster_data/proportions_population.csv")
humans = pd.read_csv("./cluster_data/human_preferences_by_country.csv")

In [3]:
# Scale `Value` by the language share; `percentage` is on a 0-100 scale, hence
# the division by 100. (`Value` is presumably the country population -- confirm.)
df_final['pop_lang'] = df_final['Value'].mul(df_final['percentage']).div(100)

In [4]:
# Rename the country key to `ISO3` so it matches the merge key used with
# df_final, then keep only the six preference labels listed below.
humans = (
    humans
    .rename(columns={'Country': 'ISO3'})
    .loc[lambda frame: frame['Label'].isin([
        'No. Characters', 'Gender', 'Fitness',
        'Social Status', 'Age', 'Species',
    ])]
)

In [5]:
# Inner join on the country code: rows whose ISO3 appears in only one of the
# two frames are dropped.
df_weights = pd.merge(df_final, humans, on='ISO3', how='inner')

In [6]:
# Weighted mean of the country-level estimates for each (language, label):
#   final_est = sum(Estimates * pop_lang) / sum(pop_lang)
# NOTE: df_weights is overwritten with the aggregated frame below, so the merge
# cell must be re-run before this cell can be executed again.
df_weights['est_weighted']=df_weights.Estimates*df_weights.pop_lang

# Total weight of each (language, label) group, broadcast to every row so that
# it survives the aggregation below (the per-group mean of a constant).
df_weights['tot_pop']=df_weights.groupby(['Languages','Label'])['pop_lang'].transform('sum')

# String aggregator names instead of np.sum: passing the NumPy callable to
# groupby().agg is deprecated and emits a FutureWarning in recent pandas.
df_weights=df_weights.groupby(['Languages','Label']).agg({'est_weighted':'sum','tot_pop':'mean'}).reset_index()

df_weights['final_est']=df_weights.est_weighted/df_weights.tot_pop

In [7]:
# Inspect the rows for Spanish.
df_weights[df_weights['Languages'] == 'es']

Unnamed: 0,Languages,Label,est_weighted,tot_pop,final_est
84,es,Age,226610200.0,452317774.5,0.500998
85,es,Fitness,80690030.0,452317774.5,0.178392
86,es,Gender,67422120.0,452317774.5,0.149059
87,es,No. Characters,224108800.0,452317774.5,0.495468
88,es,Social Status,180957500.0,452317774.5,0.400067
89,es,Species,198441400.0,452317774.5,0.438721


In [8]:
# Inspect the rows for English.
df_weights[df_weights['Languages'] == 'en']

Unnamed: 0,Languages,Label,est_weighted,tot_pop,final_est
78,en,Age,609315200.0,1359135000.0,0.448311
79,en,Fitness,206936000.0,1359135000.0,0.152256
80,en,Gender,170011000.0,1359135000.0,0.125088
81,en,No. Characters,689151100.0,1359135000.0,0.507051
82,en,Social Status,439764500.0,1359135000.0,0.323562
83,en,Species,808537100.0,1359135000.0,0.594891


In [9]:
# Rescale the weighted estimate with the affine map x -> 100 * (x + 1) / 2
# (so -1 maps to 0 and +1 maps to 100; presumably the raw estimates lie in
# [-1, 1] -- confirm).
# The value is recomputed from est_weighted / tot_pop rather than updated from
# final_est itself, so re-running this cell does not apply the rescaling twice.
raw_final_est=df_weights.est_weighted/df_weights.tot_pop

df_weights['final_est']=((raw_final_est+1)/2)*100

In [10]:
# Inspect the rows for Spanish again, now with the rescaled final_est.
df_weights[df_weights['Languages'] == 'es']

Unnamed: 0,Languages,Label,est_weighted,tot_pop,final_est
84,es,Age,226610200.0,452317774.5,75.04989
85,es,Fitness,80690030.0,452317774.5,58.919618
86,es,Gender,67422120.0,452317774.5,57.452959
87,es,No. Characters,224108800.0,452317774.5,74.773382
88,es,Social Status,180957500.0,452317774.5,70.003356
89,es,Species,198441400.0,452317774.5,71.936064


In [11]:
# Collapse the per-language results into one weighted figure per label:
#   human_agg_by_pop = sum(final_est * tot_pop) / sum(tot_pop)   over languages
# Work on a copy so df_weights itself is left unchanged.
df_agg_final=df_weights.copy()

df_agg_final['final_est_weighted']=df_agg_final.tot_pop*df_agg_final.final_est

# String aggregator names instead of np.sum: passing the NumPy callable to
# groupby().agg is deprecated and emits a FutureWarning in recent pandas.
df_agg_final=df_agg_final.groupby(['Label']).agg({'final_est_weighted':'sum','tot_pop':'sum'}).reset_index()

df_agg_final['human_agg_by_pop']=df_agg_final.final_est_weighted/df_agg_final.tot_pop

In [12]:
# Display the per-label aggregate (one row per Label).
df_agg_final

Unnamed: 0,Label,final_est_weighted,tot_pop,human_agg_by_pop
0,Age,535307200000.0,7477710000.0,71.587055
1,Fitness,427159000000.0,7477710000.0,57.12431
2,Gender,416405400000.0,7477710000.0,55.686228
3,No. Characters,554002600000.0,7477710000.0,74.087209
4,Social Status,499342300000.0,7477710000.0,66.777441
5,Species,596895800000.0,7477710000.0,79.823346


In [13]:
# Reshape to wide form: one row per Label, one column per language, with
# final_est as the cell value.
df_weights_formatted = (
    df_weights
    .set_index(['Label', 'Languages'])['final_est']
    .unstack('Languages')
)

In [None]:
# Write both layouts to disk: the long table without its integer index, and
# the wide table with its Label index kept as the first column.
df_weights.to_csv(path_or_buf="human_preferences_by_lang_unpivoted.csv", index=False)
df_weights_formatted.to_csv(path_or_buf="human_preferences_by_lang_converted.csv")

In [14]:
# Display the wide table (rows: Label, columns: Languages).
df_weights_formatted

Languages,af,ar,az,be,bg,bn,bs,ca,cs,da,...,tr,uk,ur,uz,vi,xh,yo,zh-cn,zh-tw,zu
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Age,75.062129,71.826936,74.40485,74.618049,76.562218,70.81181,77.53327,74.177547,76.318762,73.645356,...,74.757768,72.762119,69.78406,76.491558,72.725283,75.062129,76.062757,67.698811,67.262089,75.062129
Fitness,59.067847,56.686353,58.850723,58.257658,59.633063,58.612226,57.433867,55.253029,58.937712,57.713595,...,58.993229,58.450191,55.757425,56.53164,57.972196,59.067847,51.548767,56.584632,56.549337,59.067847
Gender,57.032231,54.34377,54.920952,51.363462,54.666919,53.819616,54.702861,58.364513,56.992201,53.382696,...,55.387918,53.084437,54.111415,51.393634,56.82473,57.032231,53.239435,55.18276,54.660676,57.032231
No. Characters,77.067028,74.164764,72.70722,73.332358,75.451048,74.978823,76.327764,75.831528,76.574638,75.553021,...,72.798477,72.152212,75.330536,79.915713,73.62052,77.067028,73.774495,71.164572,69.638113,77.067028
Social Status,69.663674,66.212128,66.063507,66.392904,68.762867,68.115838,65.068156,62.871066,70.815834,65.040542,...,66.70761,66.977199,61.810494,70.931468,68.981515,69.663674,73.72838,66.723166,63.698326,69.663674
Species,79.372922,80.024098,65.105577,81.083481,81.005535,78.501816,79.777624,77.811273,79.709316,84.637363,...,68.850058,80.044235,80.211305,79.628223,77.72253,79.372922,85.857835,82.938047,77.839519,79.372922
