# The Value of Cultural Similarity for Predicting Migration


## Carolina Coimbra Vieira, Sophie Lohmann, Emilio Zagheni

In [1]:
shhh <- suppressPackageStartupMessages # It's a library, so shhh!

shhh(library(tidyr))
shhh(library(dplyr))
shhh(library(readr))
shhh(library(lsa))

"package 'tidyr' was built under R version 4.1.3"
"package 'dplyr' was built under R version 4.1.3"
"package 'readr' was built under R version 4.1.3"
"package 'lsa' was built under R version 4.1.3"
"package 'SnowballC' was built under R version 4.1.1"


## Population 2019
### Datasource: [UN](https://population.un.org/wpp/Download/Standard/Population/)

In [2]:
# Load the 2019 UN population subset (one row per country).
# An empty `cols()` spec lets readr guess the column types.
pop_2019 <- "data/subset-pop-un2019.csv" %>%
  read_csv(col_types = cols())

# Preview the first rows.
pop_2019 %>% head()

country,population
<chr>,<dbl>
Argentina,44780675
Australia,25203200
Brazil,211049519
Chile,18952035
France,65129731
Great Britain,67530161


In [3]:
# Names of the countries in the study subset; used below to filter the
# CEPII geography table.
countries <- pop_2019[["country"]]

countries

## Geographic info: area  
### Datasource: [CEPII](http://www.cepii.fr/CEPII/en/bdd_modele/presentation.asp?id=6)

<b>Country-level variables</b>   
    *`iso2, iso3, cnum`*: ISO codes in two and three characters, and in three numbers respectively.  
    *`country, pays`*: Name of country in English and French respectively.   
    **`area`: Country’s area in $km^2$.**   
    *`dis_int`*: Internal distance of country i, $d_{ii} = .67 \sqrt{area/\pi}$ (an often used measure of average distance between producers and consumers in a country, see Head and Mayer, 2002 for more on this topic).  
    *`landlocked`*: Dummy variable set equal to 1 for landlocked countries.    
    *`continent`*: Continent to which the country belongs.   
    *`langoff_i`*: Official or national languages and languages spoken by at least 20% of the population of the country (and spoken in another country of the world) following the same logic as the "open-circuit languages" in Mélitz (2002).  
    *`lang20_i`*: Languages (mother tongue, lingua francas or second languages) spoken by at least 20% of the population of the country.  
    *`lang9_i`*: Languages (mother tongue, lingua francas or second languages) spoken by between 9% and 20% of the population of the country.  
    *`colonizeri`*: Colonizers of the country for a relatively long period of time and with a substantial participation in the governance of the colonized country.  
    *`short_colonizeri`*: Colonizers of the country for a relatively short period of time or with only low involvement in the governance of the colonized country.
    
<b>Cities variables used in the computation of distances</b>  
    *`city_en, city_fr`*: Names of capitals or main cities of the country in English and French.   
    *`lat, lon`*: Latitude and longitude of the city.  
    *`cap`*: Variable equal to 1 if the city is the capital of the country, to 0 if the city is the most populated city (maincity equal to 1) but not the capital, and to 2 in the cases of two capitals, if the city is the most populated but the “second” capital or the previous capital.  
    *`maincity`*: Variable coded as 1 when the city is the most populated of the country and as 2 otherwise.  
    *`citynum`*: Number of cities for each country used to calculate our weighted distances described in the next section.  

In [4]:
# Load the CEPII country-level geography table.
# NOTE(review): read.csv() parses the literal string "NA" as missing by
# default, so an ISO2 code of "NA" would be read as NA. Harmless here if such
# a country is filtered out below, but worth confirming.
# NOTE(review): `dis_int` is shown as <chr> in the preview, so it is not
# parsed as a number; verify the decimal separator used in the file.
geo <- read.csv(file = "data/cepii_geo.csv")

geo %>% head()

Unnamed: 0_level_0,iso2,iso3,cnum,country,pays,area,dis_int,landlocked,continent,city_en,...,lang9_2,lang9_3,lang9_4,colonizer1,colonizer2,colonizer3,colonizer4,short_colonizer1,short_colonizer2,short_colonizer3
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,...,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,AW,ABW,533,Aruba,Aruba,193,5225315,0,America,Oranjestad,...,.,.,.,NLD,.,.,.,.,.,.
2,AF,AFG,4,Afghanistan,Afghanistan,652225,3037614,1,Asia,Kabul,...,Uzbek,.,.,.,.,.,.,GBR,.,.
3,AO,AGO,24,Angola,Angola,1246700,4199666,0,Africa,Luanda,...,.,.,.,PRT,.,.,.,.,.,.
4,AI,AIA,660,Anguilla,Anguilla,102,379869,0,America,The Valley,...,.,.,.,GBR,.,.,.,.,.,.
5,AL,ALB,8,Albania,Albanie,28748,6377311,0,Europe,Tirana,...,.,.,.,TUR,.,.,.,.,.,.
6,AD,AND,20,Andorra,Andorre,453,8005398,0,Europe,Andorra la Vella,...,.,.,.,.,.,.,.,.,.,.


In [5]:
# List every column available in the CEPII geography table.
names(geo)

In [6]:
# ISO codes and surface area for the study countries only.
# After subsetting to these four columns some rows are identical (the row
# names in the preview skip numbers), so unique() collapses them.
area <- unique(
  select(
    filter(geo, country %in% countries),
    iso2, iso3, country, area
  )
)

area %>% head()

Unnamed: 0_level_0,iso2,iso3,country,area
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>
1,AR,ARG,Argentina,2766889
2,AU,AUS,Australia,7686848
4,BR,BRA,Brazil,8511920
6,CL,CHL,Chile,756945
7,ES,ESP,Spain,505954
8,FR,FRA,France,547026


#### Population + Area

In [7]:
# Natural (inner) join of area and population on their shared column(s);
# countries missing from either table are dropped.
area_pop <- merge(x = area, y = pop_2019)

area_pop

country,iso2,iso3,area,population
<chr>,<chr>,<chr>,<int>,<dbl>
Argentina,AR,ARG,2766889,44780675
Australia,AU,AUS,7686848,25203200
Brazil,BR,BRA,8511920,211049519
Chile,CL,CHL,756945,18952035
France,FR,FRA,547026,65129731
Great Britain,GB,GBR,244110,67530161
Indonesia,ID,IDN,1933658,270625567
Japan,JP,JPN,377801,126860299
Malaysia,MY,MYS,329758,31949789
Mexico,MX,MEX,1967210,127575529


In [8]:
# ISO3 codes of the study countries; used to filter the bilateral tables.
isos3 <- area[["iso3"]]

isos3

## Geographic distance
### Datasource: [CEPII](http://www.cepii.fr/CEPII/en/bdd_modele/presentation.asp?id=6)  

<b>Variables</b>  
    *`iso_o, iso_d`*: ISO codes in three characters, referring to the country of origin and destination, respectively.  
    *`dist`*: Geodesic distances from lat/long of most populous cities (simple distance).   
    *`distcap`*: geodesic distance between capital cities (simple distance).    
    *`distw`*: population weighted distance, theta = 1 (weighted distance).   
    *`distwces`*: population weighted distance, theta = -1, which corresponds to the usual coefficient estimated from gravity models of bilateral trade flows (weighted distance).  
    *`contig`*: dummy variable indicating whether the two countries are contiguous (share a land border).  
    *`comlang_off, comlang_ethno`*: dummy variable indicating whether the two countries share a common language. There are two common languages dummies, the first one based on the fact that two countries share a common official language, and the other one set to one if a language is spoken by at least 9% of the population in both countries.     
    *`colony`*: dummy variable indicating whether the two countries have ever had a colonial link.  
    *`comcol`*: dummy variable indicating whether the two countries have had a common colonizer after 1945.  
    *`curcol`*: dummy variable indicating whether the two countries are currently in a colonial relationship.  
    *`col45`*: dummy variable indicating whether the two countries have had a colonial relationship ~~after~~ 1945 (share common colonizer pre 1945).  
    *`smctry`*: dummy variable indicating whether the two countries were/are the same country.  

**[TO DO]: check Maciej distance**   
<b>Maciej</b>  
    *`dist_pop_weighted`*: population-weighted average distance between biggest cities  
    *`dist_biggest_cities`*: average distance between biggest cities  
^ most similar to distwces  
    *`dist_unweighted`*: average distance between (?) (not population weighted)  

In [9]:
# Load the CEPII bilateral distance table (one row per origin/destination pair).
# NOTE(review): dist, distcap, distw and distwces are shown as <chr> in the
# preview, i.e. they are not parsed as numbers. Check the decimal separator /
# missing-value marker in the file before using them numerically.
dists <- read.csv(file = "data/cepii_dist.csv")

dists %>% head()

Unnamed: 0_level_0,iso_o,iso_d,contig,comlang_off,comlang_ethno,colony,comcol,curcol,col45,smctry,dist,distcap,distw,distwces
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>
1,ABW,ABW,0,0,0,0,0,0,0,0,5225315,5225315,2509354,2304723
2,ABW,AFG,0,0,0,0,0,0,0,0,1325781,1325781,1316822,1316637
3,ABW,AGO,0,0,0,0,0,0,0,0,9516913,9516913,9587316,9584193
4,ABW,AIA,0,0,1,0,0,0,0,0,9832682,9832682,9768974,9768916
5,ABW,ALB,0,0,0,0,0,0,0,0,9091742,9091742,9091576,9091466
6,ABW,AND,0,1,0,0,0,0,0,0,7572788,7572788,7570084,7570083


In [10]:
# Keep only pairs where both origin and destination are study countries.
dists <- filter(dists, iso_o %in% isos3, iso_d %in% isos3)

dists %>% head()

Unnamed: 0_level_0,iso_o,iso_d,contig,comlang_off,comlang_ethno,colony,comcol,curcol,col45,smctry,dist,distcap,distw,distwces
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>
1,ARG,ARG,0,0,0,0,0,0,0,0,6256475,6256475,5339082,9614934
2,ARG,AUS,0,0,0,0,0,0,0,0,1180136,1173388,1204457,1201827
3,ARG,BRA,1,0,0,0,0,0,0,0,1691067,2353257,2391846,2089281
4,ARG,CHL,1,1,1,0,0,0,0,0,1128317,1128317,1156726,9416383
5,ARG,ESP,0,1,1,1,0,0,0,0,1006585,1006585,1007966,100559
6,ARG,FRA,0,0,0,0,0,0,0,0,1107225,1107225,1093234,1092186


**New variable:**  
    *`shared_hist`*: dummy variable set to 1 if at least one of *`colony, comcol, curcol, col45, smctry`* is non-zero for the country pair, and to 0 otherwise.

In [11]:
# Collapse the five historical-link dummies into one indicator: 1 when their
# sum is non-zero, 0 otherwise. `.keep = "unused"` drops the five source
# columns after they have been consumed.
dists <- dists %>%
  mutate(
    shared_hist = as.numeric((colony + comcol + curcol + col45 + smctry) != 0),
    .keep = "unused"
  )

dists %>% head()

Unnamed: 0_level_0,iso_o,iso_d,contig,comlang_off,comlang_ethno,dist,distcap,distw,distwces,shared_hist
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>
1,ARG,ARG,0,0,0,6256475,6256475,5339082,9614934,0
2,ARG,AUS,0,0,0,1180136,1173388,1204457,1201827,0
3,ARG,BRA,1,0,0,1691067,2353257,2391846,2089281,0
4,ARG,CHL,1,1,1,1128317,1128317,1156726,9416383,0
5,ARG,ESP,0,1,1,1006585,1006585,1007966,100559,1
6,ARG,FRA,0,0,0,1107225,1107225,1093234,1092186,0


#### Population + Area + Geographic distance  + Shared history

In [12]:
# Attach destination-side attributes (name, area, population) to every pair.
area_pop_gd_d <- merge(
  select(area_pop, -iso2),
  dists,
  by.x = "iso3", by.y = "iso_d"
) %>%
  rename(destination = country, iso_d = iso3, pop_d = population, area_d = area)

# Same for the origin side.
area_pop_gd_o <- merge(
  select(area_pop, -iso2),
  dists,
  by.x = "iso3", by.y = "iso_o"
) %>%
  rename(origin = country, iso_o = iso3, pop_o = population, area_o = area)

# Natural join on all columns the two tables share (the ISO pair plus the
# distance-table columns), giving one row per pair with both sides attached.
area_pop_gdist <- merge(x = area_pop_gd_d, y = area_pop_gd_o)

area_pop_gdist %>% head()

Unnamed: 0_level_0,iso_d,iso_o,contig,comlang_off,comlang_ethno,dist,distcap,distw,distwces,shared_hist,destination,area_d,pop_d,origin,area_o,pop_o
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<int>,<dbl>,<chr>,<int>,<dbl>
1,ARG,ARG,0,0,0,6256475,6256475,5339082,9614934,0,Argentina,2766889,44780675,Argentina,2766889,44780675
2,ARG,AUS,0,0,0,1180136,1173388,1204457,1201827,0,Argentina,2766889,44780675,Australia,7686848,25203200
3,ARG,BRA,1,0,0,1691067,2353257,2391846,2089281,0,Argentina,2766889,44780675,Brazil,8511920,211049519
4,ARG,CHL,1,1,1,1128317,1128317,1156726,9416383,0,Argentina,2766889,44780675,Chile,756945,18952035
5,ARG,ESP,0,1,1,1006585,1006585,1007966,100559,1,Argentina,2766889,44780675,Spain,505954,46736782
6,ARG,FRA,0,0,0,1107225,1107225,1093234,1092186,0,Argentina,2766889,44780675,France,547026,65129731


## Language  
### Datasource: [CEPII](http://www.cepii.fr/CEPII/en/bdd_modele/presentation.asp?id=19)   

<b>Variables [Sarah's notes]</b>  
    *`col`*: common official language (0 or 1); 19 languages considered.    
    *`csl`*: p(two random people understand a common language) >= *`cnl`*.  
    *`cnl`*: p(two random people share a native language).  
    *`lp`*: lexical closeness of native languages; set to 0 when *`cnl`* is 1 or 0, and also set to 0 if there is no dominant native language (e.g. India).  
    *`lp1`*: tree based. 4 possibilities, 2 languages belonging to:  
        0: separate family trees  
        0.25: different branches of same tree (English and French),  
        0.50: the same branch (English and German),  
        0.75: the same sub-branch (German and Dutch)  
    *`lp2`*: lexical similarity of 200 words, continuous scale 0-100. *`lp1`* and *`lp2`* are normalized so that their coefficients are comparable to each other and to *`col`*; *`prox1`* and *`prox2`* are the unadjusted versions of *`lp1`* and *`lp2`*?  
  

In [13]:
# Load the CEPII bilateral language table.
langs <- read.csv(file = "data/cepii_language.csv")

langs %>% head()

Unnamed: 0_level_0,X,iso_o,country_o,iso_d,country_d,col,csl,cnl,prox1,lp1,prox2,lp2,cl,cle
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,AFG,Afghanistan,ALB,Albania,0,0,0,0.25,,0.07342058,,,0.08662836
2,2,AFG,Afghanistan,DZA,Algeria,0,0,0,0.0,0.0,0.08527792,0.764097,0.09992715,0.10061873
3,3,AFG,Afghanistan,AND,Andorra,0,0,0,0.25,1.945866,0.1202599,1.0775385,0.14091842,0.1418937
4,4,AFG,Afghanistan,AGO,Angola,0,0,0,0.25,,0.11890529,,,0.14029542
5,5,AFG,Afghanistan,AIA,Anguilla,0,0,0,0.25,,0.09291404,,,0.10962854
6,6,AFG,Afghanistan,ATG,Antigua and Barbuda,0,0,0,0.25,1.945866,0.09291404,0.8325174,0.10887503,0.10962854


In [14]:
# Keep only pairs where both origin and destination are study countries.
langs <- filter(langs, iso_o %in% isos3, iso_d %in% isos3)

langs %>% head()

Unnamed: 0_level_0,X,iso_o,country_o,iso_d,country_d,col,csl,cnl,prox1,lp1,prox2,lp2,cl,cle
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1368,ARG,Argentina,AUS,Australia,0,0.02058416,0.0008,0.25,1.945866,0.13052547,1.1695188,0.15362507,0.15468276
2,1383,ARG,Argentina,BRA,Brazil,0,0.06052872,0.0008,0.75,5.837599,0.42439765,3.8026376,0.49770382,0.5011428
3,1396,ARG,Argentina,CHL,Chile,1,0.98010004,0.85439998,0.0,0.0,0.0,0.0,0.87344128,0.87256539
4,1422,ARG,Argentina,FRA,France,0,0.13392781,0.01039232,0.5,3.891733,0.25450581,2.2803929,0.30551848,0.30756098
5,1441,ARG,Argentina,IDN,Indonesia,0,0.0,0.0,0.0,0.0,0.07436915,0.6663537,0.08714446,0.08774757
6,1448,ARG,Argentina,JPN,Japan,0,0.0,0.0,0.0,0.0,0.06896222,0.617907,0.0808087,0.08136797


#### Population + Area + Geographic distance + Shared history +  Language

In [15]:
# Drop the row id and country-name columns from the language table, then
# natural-join it with the geography/distance table on the shared columns.
area_pop_gdist_lang <- merge(
  select(langs, -c(X, country_o, country_d)),
  area_pop_gdist
)

area_pop_gdist_lang %>% head()

Unnamed: 0_level_0,iso_o,iso_d,col,csl,cnl,prox1,lp1,prox2,lp2,cl,...,distcap,distw,distwces,shared_hist,destination,area_d,pop_d,origin,area_o,pop_o
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<chr>,<chr>,<chr>,<dbl>,<chr>,<int>,<dbl>,<chr>,<int>,<dbl>
1,ARG,AUS,0,0.02058416,0.0008,0.25,1.945866,0.1305255,1.169519,0.1536251,...,1173388,1204457,1201827,0,Australia,7686848,25203200,Argentina,2766889,44780675
2,ARG,BRA,0,0.06052872,0.0008,0.75,5.837599,0.4243976,3.802638,0.4977038,...,2353257,2391846,2089281,0,Brazil,8511920,211049519,Argentina,2766889,44780675
3,ARG,CHL,1,0.98010004,0.85439998,0.0,0.0,0.0,0.0,0.8734413,...,1128317,1156726,9416383,0,Chile,756945,18952035,Argentina,2766889,44780675
4,ARG,ESP,1,0.98013985,0.85445821,0.0,0.0,0.0,0.0,0.8734919,...,1006585,1007966,100559,1,Spain,505954,46736782,Argentina,2766889,44780675
5,ARG,FRA,0,0.13392781,0.01039232,0.5,3.891733,0.2545058,2.280393,0.3055185,...,1107225,1093234,1092186,0,France,547026,65129731,Argentina,2766889,44780675
6,ARG,GBR,0,0.08325152,0.0,0.25,1.945866,0.1305255,1.169519,0.1529474,...,1114748,1113696,1112822,0,Great Britain,244110,67530161,Argentina,2766889,44780675


## GDP (constant 2010) 2019  
### Datasource: [World Bank](https://databank.worldbank.org/source/world-development-indicators)

*`GDP`*: GDP (constant 2010) in 2019 in US$

GDP at purchaser's prices is the sum of gross value added by all resident producers in the economy plus any product taxes and minus any subsidies not included in the value of the products. It is calculated without making deductions for depreciation of fabricated assets or for depletion and degradation of natural resources. Data are in constant 2010 U.S. dollars. Dollar figures for GDP are converted from domestic currencies using 2010 official exchange rates. For a few countries where the official exchange rate does not reflect the rate effectively applied to actual foreign exchange transactions, an alternative conversion factor is used.

In [16]:
# Load 2019 GDP (constant 2010 US$) per country, keyed by ISO3 code.
gdp <- "data/subset-GDP_constant2010-wb2019.csv" %>%
  read_csv(col_types = cols())

gdp %>% head()

iso_country,GDP
<chr>,<dbl>
AUS,1450499000000.0
ARG,437813400000.0
BRA,2364446000000.0
CHL,285037000000.0
FRA,2971919000000.0
IDN,1204457000000.0


In [17]:
# Attach GDP twice: once matched on the origin code, once on the destination
# code. The GDP column is renamed before each join so the two copies cannot
# collide.
area_pop_gdist_lang_gdp <- area_pop_gdist_lang %>%
  merge(rename(gdp, GDP_o = GDP), by.x = "iso_o", by.y = "iso_country") %>%
  merge(rename(gdp, GDP_d = GDP), by.x = "iso_d", by.y = "iso_country")

area_pop_gdist_lang_gdp %>% head()

Unnamed: 0_level_0,iso_d,iso_o,col,csl,cnl,prox1,lp1,prox2,lp2,cl,...,distwces,shared_hist,destination,area_d,pop_d,origin,area_o,pop_o,GDP_o,GDP_d
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<chr>,<dbl>,<chr>,<int>,<dbl>,<chr>,<int>,<dbl>,<dbl>,<dbl>
1,ARG,CHL,1,0.98010004,0.8544,0.0,0.0,0.0,0.0,0.8734413,...,9416383,0,Argentina,2766889,44780675,Chile,756945,18952035,285037000000.0,437813398163
2,ARG,GBR,0,0.08325152,0.0,0.25,1.945866,0.1305255,1.169519,0.1529474,...,1112822,0,Argentina,2766889,44780675,Great Britain,244110,67530161,2913557000000.0,437813398163
3,ARG,AUS,0,0.02058416,0.0008,0.25,1.945866,0.1305255,1.169519,0.1536251,...,1201827,0,Argentina,2766889,44780675,Australia,7686848,25203200,1450499000000.0,437813398163
4,ARG,KOR,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1913775,0,Argentina,2766889,44780675,South Korea,99484,51225321,1482760000000.0,437813398163
5,ARG,BRA,0,0.06052872,0.0008,0.75,5.837599,0.4243976,3.802638,0.4977038,...,2089281,0,Argentina,2766889,44780675,Brazil,8511920,211049519,2364446000000.0,437813398163
6,ARG,MEX,1,0.98010004,0.8832,0.0,0.0,0.0,0.0,0.8984749,...,7460572,0,Argentina,2766889,44780675,Mexico,1967210,127575529,1309882000000.0,437813398163


## Migration (stock) 2019  
### Datasource: [UN](https://www.un.org/en/development/desa/population/migration/data/estimates2/estimates19.asp)

The dataset presents estimates of international migrants by age, sex and origin. Estimates are presented for 1990, 1995, 2000, 2005, 2010, 2015 and 2019 and are available for all countries and areas of the world. The estimates are based on official statistics on the foreign-born or the foreign population.   
International migrant stock - Total international migrant stock   

In [18]:
# Load the destination-by-origin migrant stock matrix (wide format).
migration_stock_wide <- read.csv(file = "data/subset-stocks-migration-un2019.csv")
# Missing cells are treated as zero migrants.
migration_stock_wide <- replace(
  migration_stock_wide,
  is.na(migration_stock_wide),
  0
)

migration_stock_wide %>% head()

Unnamed: 0_level_0,destination,Total,Total16,Argentina,Australia,Brazil,Chile,France,Great.Britain,Indonesia,Japan,Malaysia,Mexico,Russia,Singapore,South.Korea,Spain,Turkey,United.States
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,Argentina,2212879,397927,0,279,49647,216855,8303,959,21,4531,2,1475,1917,2,8296,99769,646,5225
2,Australia,7549270,2109825,17583,0,42552,33707,40733,1262204,87274,53175,174136,6760,28735,64739,121204,19594,42880,114549
3,Brazil,807006,229897,39078,791,0,20747,12851,6650,978,65955,163,3601,1865,154,11531,41321,486,23726
4,Chile,939992,177893,72813,3952,20370,0,12562,6668,343,3231,100,6887,2344,237,4247,22842,1019,20278
5,France,8334875,1110686,14253,9192,63208,15857,0,176672,5446,23536,2793,13851,70463,2512,22794,303245,327508,59356
6,Great Britain,9552110,1073094,12239,145692,47487,8668,170142,0,11543,52930,84638,10457,38917,58432,13171,98372,104491,215915


In [19]:
# Reshape to one row per (destination, origin) pair. Every column other than
# destination/Total/Total16 is an origin country; Total16 is dropped and Total
# is kept as the destination's total immigrant stock.
migration_stock <- migration_stock_wide %>%
  pivot_longer(
    -c(destination, Total, Total16),
    names_to = "origin",
    values_to = "m_stock"
  ) %>%
  select(destination, total_imm_in_d = Total, origin, m_stock)
# The origin names come from column headers in which "." stands in for a
# space (e.g. "Great.Britain"); restore the spaces.
migration_stock$origin <- gsub(".", " ", migration_stock$origin, fixed = TRUE)

migration_stock %>% head()

destination,total_imm_in_d,origin,m_stock
<chr>,<int>,<chr>,<int>
Argentina,2212879,Argentina,0
Argentina,2212879,Australia,279
Argentina,2212879,Brazil,49647
Argentina,2212879,Chile,216855
Argentina,2212879,France,8303
Argentina,2212879,Great Britain,959


In [20]:
# Look up the ISO3 code of the destination country by name.
area_pop_mig_d <- merge(
  select(area_pop, country, iso3),
  migration_stock,
  by.x = "country", by.y = "destination"
) %>%
  rename(destination = country, iso_d = iso3)

# Look up the ISO3 code of the origin country by name.
area_pop_mig_o <- merge(
  select(area_pop, country, iso3),
  migration_stock,
  by.x = "country", by.y = "origin"
) %>%
  rename(origin = country, iso_o = iso3)

# Natural join on the shared columns, so each pair carries both ISO codes.
migration_stock <- merge(x = area_pop_mig_d, y = area_pop_mig_o)

migration_stock %>% head()

Unnamed: 0_level_0,destination,total_imm_in_d,origin,m_stock,iso_d,iso_o
Unnamed: 0_level_1,<chr>,<int>,<chr>,<int>,<chr>,<chr>
1,Argentina,2212879,Argentina,0,ARG,ARG
2,Argentina,2212879,Australia,279,ARG,AUS
3,Argentina,2212879,Brazil,49647,ARG,BRA
4,Argentina,2212879,Chile,216855,ARG,CHL
5,Argentina,2212879,France,8303,ARG,FRA
6,Argentina,2212879,Great Britain,959,ARG,GBR


In [21]:
# Add migrant stocks to the covariate table (natural join on the shared
# columns) and drop the destination-level total, which is not needed further.
official_dataset <- select(
  merge(area_pop_gdist_lang_gdp, migration_stock),
  -total_imm_in_d
)

official_dataset %>% head()

Unnamed: 0_level_0,iso_d,iso_o,destination,origin,col,csl,cnl,prox1,lp1,prox2,...,distw,distwces,shared_hist,area_d,pop_d,area_o,pop_o,GDP_o,GDP_d,m_stock
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<chr>,<chr>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>
1,ARG,AUS,Argentina,Australia,0,0.02058416,0.0008,0.25,1.945866,0.1305255,...,1204457,1201827,0,2766889,44780675,7686848,25203200,1450499000000.0,437813398163,279
2,ARG,BRA,Argentina,Brazil,0,0.06052872,0.0008,0.75,5.837599,0.4243976,...,2391846,2089281,0,2766889,44780675,8511920,211049519,2364446000000.0,437813398163,49647
3,ARG,CHL,Argentina,Chile,1,0.98010004,0.85439998,0.0,0.0,0.0,...,1156726,9416383,0,2766889,44780675,756945,18952035,285037000000.0,437813398163,216855
4,ARG,ESP,Argentina,Spain,1,0.98013985,0.85445821,0.0,0.0,0.0,...,1007966,100559,1,2766889,44780675,505954,46736782,1572013000000.0,437813398163,99769
5,ARG,FRA,Argentina,France,0,0.13392781,0.01039232,0.5,3.891733,0.2545058,...,1093234,1092186,0,2766889,44780675,547026,65129731,2971919000000.0,437813398163,8303
6,ARG,GBR,Argentina,Great Britain,0,0.08325152,0.0,0.25,1.945866,0.1305255,...,1113696,1112822,0,2766889,44780675,244110,67530161,2913557000000.0,437813398163,959


In [22]:
# Rows and columns of the dataset after adding migration stocks; a quick
# check that the inner joins did not drop pairs unexpectedly.
dim(official_dataset)

## Migration (flow) 2015-2020  
### Datasource: [Abel, Guy; E. Cohen, Joel (2019)](https://figshare.com/collections/Bilateral_international_migration_flow_estimates_for_200_countries/4470464)

This collection contains bilateral international migration flow estimates for 200 countries for five-year periods between 1990 and 2020. 

In [23]:
# Load the Abel & Cohen bilateral migration flow estimates.
migration_flow <- read.csv(file = "data/abel_cohen_migration_flow.csv")
# Missing cells are treated as zero.
migration_flow <- replace(migration_flow, is.na(migration_flow), 0)

migration_flow %>% head()

Unnamed: 0_level_0,year0,orig,dest,sd_drop_neg,sd_rev_neg,mig_rate,da_min_open,da_min_closed,da_pb_closed
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1,1990,BDI,BDI,0,0,0,0,0,0
2,1990,COM,BDI,0,0,0,0,0,0
3,1990,DJI,BDI,0,0,0,0,0,0
4,1990,ERI,BDI,0,0,0,0,0,90
5,1990,ETH,BDI,0,0,0,0,0,2
6,1990,KEN,BDI,30,30,69,45,29,87


In [24]:
# Keep the period starting in 2015, rename the country codes to match the
# rest of the notebook, and prefix every flow estimate with "m_flow_".
migration_flow <- filter(migration_flow, year0 == 2015) %>%
  select(-year0) %>%
  rename(
    iso_o = orig,
    iso_d = dest,
    m_flow_sd_drop_neg = sd_drop_neg,
    m_flow_sd_rev_neg = sd_rev_neg,
    m_flow_mig_rate = mig_rate,
    m_flow_da_min_open = da_min_open,
    m_flow_da_min_closed = da_min_closed,
    m_flow_da_pb_closed = da_pb_closed
  )

migration_flow %>% head()

Unnamed: 0_level_0,iso_o,iso_d,m_flow_sd_drop_neg,m_flow_sd_rev_neg,m_flow_mig_rate,m_flow_da_min_open,m_flow_da_min_closed,m_flow_da_pb_closed
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>
1,BDI,BDI,0,0,0,0,0,0
2,COM,BDI,0,0,0,0,0,0
3,DJI,BDI,0,0,0,0,0,0
4,ERI,BDI,0,131,0,43,60,183
5,ETH,BDI,0,14,0,9,3,24
6,KEN,BDI,194,194,211,1168,447,1554


In [25]:
# Add the flow estimates; natural (inner) join on the shared columns.
official_dataset <- official_dataset %>% merge(migration_flow)

official_dataset %>% head()

Unnamed: 0_level_0,iso_d,iso_o,destination,origin,col,csl,cnl,prox1,lp1,prox2,...,pop_o,GDP_o,GDP_d,m_stock,m_flow_sd_drop_neg,m_flow_sd_rev_neg,m_flow_mig_rate,m_flow_da_min_open,m_flow_da_min_closed,m_flow_da_pb_closed
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,ARG,AUS,Argentina,Australia,0,0.02058416,0.0008,0.25,1.945866,0.1305255,...,25203200,1450499000000.0,437813398163,279,1,1,56,16,105,2224
2,ARG,BRA,Argentina,Brazil,0,0.06052872,0.0008,0.75,5.837599,0.4243976,...,211049519,2364446000000.0,437813398163,49647,475,475,9997,2332,0,10767
3,ARG,CHL,Argentina,Chile,1,0.98010004,0.85439998,0.0,0.0,0.0,...,18952035,285037000000.0,437813398163,216855,2079,2079,43666,13875,53604,84213
4,ARG,ESP,Argentina,Spain,1,0.98013985,0.85445821,0.0,0.0,0.0,...,46736782,1572013000000.0,437813398163,99769,956,956,20090,6434,5516,49532
5,ARG,FRA,Argentina,France,0,0.13392781,0.01039232,0.5,3.891733,0.2545058,...,65129731,2971919000000.0,437813398163,8303,79,79,1672,467,156,3048
6,ARG,GBR,Argentina,Great Britain,0,0.08325152,0.0,0.25,1.945866,0.1305255,...,67530161,2913557000000.0,437813398163,959,8,8,193,45,126,1559


In [26]:
# Rows and columns after adding migration flows; compare with the previous
# dim() to confirm no pairs were lost in the join.
dim(official_dataset)

## WVS
### Datasource: [WVS (2020)](https://www.worldvaluessurvey.org/wvs.jsp)



In [27]:
# Load the WVS subset: a country column followed by value dimensions.
wvs <- read.csv(file = "data/subset-wvs-wave7-2020.csv")

# Pairwise cosine similarity between countries, computed on columns 2-3 only.
# The matrix is transposed before cosine() so that countries are the columns.
wvs_cosine <- wvs[, 2:3] %>%
  as.matrix() %>%
  t() %>%
  cosine() %>%
  as.data.frame()
names(wvs_cosine) <- wvs$country
wvs_cosine[["origin"]] <- wvs$country

# Long format: one row per (origin, destination) pair.
wvs_cosine_longer <- wvs_cosine %>%
  pivot_longer(-origin, names_to = "destination", values_to = "CS_wvs")
wvs_cosine_longer %>% head()

origin,destination,CS_wvs
<chr>,<chr>,<dbl>
Argentina,Argentina,1.0
Argentina,Australia,0.2579213
Argentina,Brazil,0.7688173
Argentina,Chile,-0.9899189
Argentina,France,0.1697698
Argentina,Great Britain,0.3229716


## Foursquare
### Datasource: [You Are What You Eat (and Drink): Identifying Cultural Boundaries by Analyzing Food and Drink Habits in Foursquare. Thiago H. Silva, Pedro O. S. Vaz de Melo, Jussara M. Almeida, Mirco Musolesi, Antonio A. F. Loureiro (2014)](https://www.aaai.org/ocs/index.php/ICWSM/ICWSM14/paper/viewPaper/8113)



In [28]:
# Load the Foursquare subset: a country column followed by feature columns.
foursquare <- read.csv(file = "data/foursquare.csv")

# Pairwise cosine similarity between countries, computed on columns 2-3 only.
# The matrix is transposed before cosine() so that countries are the columns.
foursquare_cosine <- foursquare[, 2:3] %>%
  as.matrix() %>%
  t() %>%
  cosine() %>%
  as.data.frame()
names(foursquare_cosine) <- foursquare$country
foursquare_cosine[["origin"]] <- foursquare$country

# Long format: one row per (origin, destination) pair.
foursquare_cosine_longer <- foursquare_cosine %>%
  pivot_longer(-origin, names_to = "destination", values_to = "CS_foursquare")
foursquare_cosine_longer %>% head()

origin,destination,CS_foursquare
<chr>,<chr>,<dbl>
Argentina,Argentina,1.0
Argentina,Australia,0.1005334
Argentina,Brazil,0.8402622
Argentina,Chile,0.9548769
Argentina,France,0.8741573
Argentina,Great Britain,0.6263188


In [29]:
# Add both cultural-similarity measures, each joined on the columns it shares
# with the dataset.
official_dataset <- official_dataset %>%
  merge(wvs_cosine_longer) %>%
  merge(foursquare_cosine_longer)

official_dataset %>% head()

Unnamed: 0_level_0,destination,origin,iso_d,iso_o,col,csl,cnl,prox1,lp1,prox2,...,GDP_d,m_stock,m_flow_sd_drop_neg,m_flow_sd_rev_neg,m_flow_mig_rate,m_flow_da_min_open,m_flow_da_min_closed,m_flow_da_pb_closed,CS_wvs,CS_foursquare
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>
1,Argentina,Australia,ARG,AUS,0,0.02058416,0.0008,0.25,1.945866,0.13052547,...,437813398163,279,1,1,56,16,105,2224,0.2579213,0.1005334
2,Argentina,Brazil,ARG,BRA,0,0.06052872,0.0008,0.75,5.837599,0.42439765,...,437813398163,49647,475,475,9997,2332,0,10767,0.7688173,0.8402622
3,Argentina,Chile,ARG,CHL,1,0.98010004,0.85439998,0.0,0.0,0.0,...,437813398163,216855,2079,2079,43666,13875,53604,84213,-0.9899189,0.9548769
4,Argentina,France,ARG,FRA,0,0.13392781,0.01039232,0.5,3.891733,0.25450581,...,437813398163,8303,79,79,1672,467,156,3048,0.1697698,0.8741573
5,Argentina,Great Britain,ARG,GBR,0,0.08325152,0.0,0.25,1.945866,0.13052547,...,437813398163,959,8,8,193,45,126,1559,0.3229716,0.6263188
6,Argentina,Indonesia,ARG,IDN,0,0.0,0.0,0.0,0.0,0.07436915,...,437813398163,21,0,0,4,1,4,7,0.4656905,-0.8793284


In [30]:
# Final rows and columns of the dataset before it is written to disk.
dim(official_dataset)

In [31]:
# Persist the assembled dataset for the downstream analysis.
official_dataset %>% write_csv("data/subset-official-dataset.csv")

## Cultural Similarity 

#### symmetric:   

each country is represented by a vector corresponding to the top (almost) <b>400 dishes</b> (top 50 dishes in all the countries)

![CD](figs/heatmap-similarity-symmetric-tops-topK-cosine-reds.png)

In [32]:
# Load the symmetric food-based similarity matrix. The first column has no
# header in the file, so it is given an explicit name here.
cd_sym <- "data/FB-food-cultural_similarity-symmetric.csv" %>%
  read_csv(col_types = cols())
names(cd_sym)[1] <- "interests_from"

[1m[22mNew names:
[36m*[39m `` -> `...1`


In [33]:
# Display the full symmetric similarity matrix (rows and columns are countries).
cd_sym

interests_from,Argentina,Australia,Brazil,Chile,France,Great Britain,Indonesia,Japan,Malaysia,Mexico,Russia,Singapore,South Korea,Spain,Turkey,United States
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Argentina,1.0,-0.1143361,0.176917369,0.39391861,-0.06134509,-0.1691530891,-0.19920772,-0.224371502,-0.28997372,0.27235146,-0.08055443,-0.2096671516,-0.20142284,0.163591513,-0.0986835,-0.12580614
Australia,-0.11433611,1.0,-0.183166299,-0.21693946,-0.181528839,0.3272786581,-0.42124163,-0.369408331,0.18000864,-0.19258554,-0.48793018,0.2374988871,-0.24214879,-0.213543938,-0.27776537,0.45026896
Brazil,0.17691737,-0.1831663,1.0,0.12196555,-0.009614683,-0.0914496937,0.01228442,0.040389545,-0.27893278,-0.02383473,0.10240044,-0.2598915766,-0.08731596,0.072308406,-0.0317225,-0.19253817
Chile,0.39391861,-0.2169395,0.121965552,1.0,-0.036574753,-0.1427083922,-0.12930396,-0.085777299,-0.32750818,0.34784223,-0.01501271,-0.2553625404,-0.20503456,0.194458741,-0.07814615,-0.14241346
France,-0.06134509,-0.1815288,-0.009614683,-0.03657475,1.0,-0.01645512,0.02915838,0.097488102,-0.29143959,-0.10030008,0.24466473,-0.254724732,-0.12086619,0.232377125,0.13634094,-0.14248141
Great Britain,-0.16915309,0.3272787,-0.091449694,-0.14270839,-0.01645512,1.0,-0.23383791,-0.197735569,-0.06866661,-0.20550659,-0.16012196,0.0005845629,-0.1526223,-0.037024527,-0.16732636,0.23801647
Indonesia,-0.19920772,-0.4212416,0.012284416,-0.12930396,0.029158377,-0.2338379149,1.0,0.364092614,-0.02878002,-0.15926308,0.53888907,-0.2344010923,0.1608252,-0.051447082,0.20489202,-0.31442787
Japan,-0.2243715,-0.3694083,0.040389545,-0.0857773,0.097488102,-0.1977355688,0.36409261,1.0,-0.26822899,-0.17332967,0.47569199,-0.1261969938,0.21536567,-0.006820021,0.20586957,-0.29936526
Malaysia,-0.28997372,0.1800086,-0.278932779,-0.32750818,-0.291439588,-0.0686666064,-0.02878002,-0.268228986,1.0,-0.20696652,-0.44061942,0.5102216287,0.01868495,-0.348760866,-0.21755477,0.036495
Mexico,0.27235146,-0.1925855,-0.023834733,0.34784223,-0.100300082,-0.2055065851,-0.15926308,-0.173329675,-0.20696652,1.0,-0.11925075,-0.2090205572,-0.10892951,0.060798588,-0.1202822,-0.01181777


In [34]:
# Long format: the row label becomes `destination`, the column label becomes
# `origin`, and the cell value becomes `CS_symm`.
cd_sym <- cd_sym %>%
  rename(destination = interests_from) %>%
  pivot_longer(-destination, names_to = "origin", values_to = "CS_symm")

cd_sym %>% head()

destination,origin,CS_symm
<chr>,<chr>,<dbl>
Argentina,Argentina,1.0
Argentina,Australia,-0.11433611
Argentina,Brazil,0.17691737
Argentina,Chile,0.39391861
Argentina,France,-0.06134509
Argentina,Great Britain,-0.16915309


#### nonsymmetric:   
each country is represented by a vector corresponding to the <b>top 50 dishes from origin country</b>

![CDiv](figs/heatmap-similarity-topK-cosine-reds.png)

The **(non-symmetric)** Cultural Similarity between **origin (rows) and destination (columns)** represents how similar origin and destination are in terms of popular dishes from the country of **origin**.   

**Hypothesis:** Immigrants prefer to move to (host) countries culturally similar to their (home/previous) country.

In [35]:
# Load the non-symmetric food-based similarity matrix. The first column has
# no header in the file, so it is given an explicit name here.
cd_nonsym <- "data/FB-food-cultural_similarity-nonsymmetric.csv" %>%
  read_csv(col_types = cols())
names(cd_nonsym)[1] <- "interests_from"

[1m[22mNew names:
[36m*[39m `` -> `...1`


In [36]:
# Long format with matrix rows as `origin` and matrix columns as
# `destination`, then attach the symmetric score for the same country pair
# (merge() joins on the columns the two tables share).
cd_food_o <- cd_nonsym %>%
  rename(origin = interests_from) %>%
  pivot_longer(
    cols = -origin,
    names_to = "destination",
    values_to = "CS_nonsymm_food_o"
  ) %>%
  merge(cd_sym)

head(cd_food_o)

Unnamed: 0_level_0,origin,destination,CS_nonsymm_food_o,CS_symm
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
1,Argentina,Argentina,1.0,1.0
2,Argentina,Australia,0.3185665,-0.11433611
3,Argentina,Brazil,0.1077964,0.17691737
4,Argentina,Chile,0.46673,0.39391861
5,Argentina,France,-0.4108677,-0.06134509
6,Argentina,Great Britain,-0.2072795,-0.16915309


The **(non-symmetric)** Cultural Similarity, now read with **destination as rows and origin as columns**, represents how similar origin and destination are in terms of popular dishes from the country of **destination**. 

In [37]:
# Same matrix as above, read the other way round: matrix rows are labelled
# `destination` and matrix columns `origin`. The symmetric score for the same
# country pair is then attached via merge() on the shared columns.
cd_food_d <- cd_nonsym %>%
  rename(destination = interests_from) %>%
  pivot_longer(
    cols = -destination,
    names_to = "origin",
    values_to = "CS_nonsymm_food_d"
  ) %>%
  merge(cd_sym)

head(cd_food_d)

Unnamed: 0_level_0,destination,origin,CS_nonsymm_food_d,CS_symm
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
1,Argentina,Argentina,1.0,1.0
2,Argentina,Australia,0.3185665,-0.11433611
3,Argentina,Brazil,0.1077964,0.17691737
4,Argentina,Chile,0.46673,0.39391861
5,Argentina,France,-0.4108677,-0.06134509
6,Argentina,Great Britain,-0.2072795,-0.16915309


In [38]:
# Combine both non-symmetric scores into one table, one row per
# (origin, destination) pair.
#
# Both inputs carry a `CS_symm` column. A bare merge() would use every shared
# column as a join key, i.e. it would also match rows on the floating-point
# `CS_symm` values. Join on the country pair only and take `CS_symm` from
# `cd_food_o`, so the result does not depend on comparing doubles.
cd <- merge(
  cd_food_o,
  cd_food_d[, c("origin", "destination", "CS_nonsymm_food_d")],
  by = c("origin", "destination")
)

# Restore the column layout the implicit join used to produce
# (keys, symmetric score, then the two non-symmetric scores).
cd <- cd[, c(
  "origin", "destination",
  "CS_symm", "CS_nonsymm_food_o", "CS_nonsymm_food_d"
)]

head(cd)

Unnamed: 0_level_0,origin,destination,CS_symm,CS_nonsymm_food_o,CS_nonsymm_food_d
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>
1,Argentina,Argentina,1.0,1.0,1.0
2,Argentina,Australia,-0.11433611,0.3185665,-0.2553475
3,Argentina,Brazil,0.17691737,0.1077964,0.2172456
4,Argentina,Chile,0.39391861,0.46673,0.4090194
5,Argentina,France,-0.06134509,-0.4108677,-0.2192713
6,Argentina,Great Britain,-0.16915309,-0.2072795,-0.3668815


In [39]:
pca <- read.csv("data/fb-cultural-similarity-pca.csv")

# Pairwise cosine similarity between countries, computed from columns 2 and 3
# of the PCA file (presumably the first two components -- confirm against the
# CSV). The matrix is transposed so that each country is one column before it
# is handed to cosine().
pca_cosine <- pca[, 2:3] %>%
  as.matrix() %>%
  t() %>%
  cosine() %>%
  as.data.frame()

# Label columns with country names and keep the row country as `origin`.
names(pca_cosine) <- pca$country
pca_cosine$origin <- pca$country

# Wide -> long: one row per (origin, destination) pair.
pca_cosine_longer <- pca_cosine %>%
  pivot_longer(
    cols = !origin,
    names_to = "destination",
    values_to = "CS_FB_top50_pca"
  )
head(pca_cosine_longer)

origin,destination,CS_FB_top50_pca
<chr>,<chr>,<dbl>
Argentina,Argentina,1.0
Argentina,Australia,0.05466153
Argentina,Brazil,0.79585871
Argentina,Chile,0.98090107
Argentina,France,0.6010584
Argentina,Great Britain,0.37417246


#### Dataset: official statistics + FB cultural distance

In [40]:
# Attach the Facebook-based similarity scores to the official statistics.
# Both merge() calls join on whatever columns the two tables have in common
# and keep only rows present on both sides.
official_dataset <- official_dataset %>% merge(cd)
dataset <- official_dataset %>% merge(pca_cosine_longer)

head(dataset)

Unnamed: 0_level_0,destination,origin,iso_d,iso_o,col,csl,cnl,prox1,lp1,prox2,...,m_flow_mig_rate,m_flow_da_min_open,m_flow_da_min_closed,m_flow_da_pb_closed,CS_wvs,CS_foursquare,CS_symm,CS_nonsymm_food_o,CS_nonsymm_food_d,CS_FB_top50_pca
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,Argentina,Australia,ARG,AUS,0,0.02058416,0.0008,0.25,1.945866,0.13052547,...,56,16,105,2224,0.2579213,0.1005334,-0.11433611,-0.2553475,0.3185665,0.05466153
2,Argentina,Brazil,ARG,BRA,0,0.06052872,0.0008,0.75,5.837599,0.42439765,...,9997,2332,0,10767,0.7688173,0.8402622,0.17691737,0.2172456,0.1077964,0.79585871
3,Argentina,Chile,ARG,CHL,1,0.98010004,0.85439998,0.0,0.0,0.0,...,43666,13875,53604,84213,-0.9899189,0.9548769,0.39391861,0.4090194,0.46673,0.98090107
4,Argentina,France,ARG,FRA,0,0.13392781,0.01039232,0.5,3.891733,0.25450581,...,1672,467,156,3048,0.1697698,0.8741573,-0.06134509,-0.2192713,-0.4108677,0.6010584
5,Argentina,Great Britain,ARG,GBR,0,0.08325152,0.0,0.25,1.945866,0.13052547,...,193,45,126,1559,0.3229716,0.6263188,-0.16915309,-0.3668815,-0.2072795,0.37417246
6,Argentina,Indonesia,ARG,IDN,0,0.0,0.0,0.0,0.0,0.07436915,...,4,1,4,7,0.4656905,-0.8793284,-0.19920772,-0.3175994,-0.6779621,-0.3986916


In [41]:
# Number of rows and columns in the final merged dataset.
c(nrow(dataset), ncol(dataset))

In [42]:
# Persist the final merged dataset.
dataset %>% write_csv("data/subset-final-dataset.csv")