# The Value of Cultural Similarity for Predicting Migration


## Carolina Coimbra Vieira, Sophie Lohmann, Emilio Zagheni

In [None]:
shhh <- suppressPackageStartupMessages # It's a library, so shhh!

shhh(library(tidyr))
shhh(library(dplyr))
shhh(library(readr))
shhh(library(lsa))
shhh(library(philentropy))
shhh(library('fastDummies'))

## Population 2019
### Datasource: [UN](https://population.un.org/wpp/Download/Standard/Population/)

In [None]:
# Load the UN 2019 population subset and preview the first rows.
pop_2019 <- read.csv(file = "data/subset-pop-un2019.csv")

head(pop_2019)

In [None]:
# Country names found in the population table; later cells use this vector
# to restrict the other data sources to the same set of countries.
countries <- pop_2019$country

countries

## Geographic info: area  
### Datasource: [CEPII](http://www.cepii.fr/CEPII/en/bdd_modele/presentation.asp?id=6)

<b>Country-level variables</b>   
    *`iso2, iso3, cnum`*: ISO codes in two and three characters, and in three numbers respectively.  
    *`country, pays`*: Name of country in English and French respectively.   
    **`area`: Country’s area in $km^2$.**   
    *`dis_int`*: Internal distance of country i, $d_{ii} = .67 \sqrt{area/\pi}$ (an often used measure of average distance between producers and consumers in a country, see Head and Mayer, 2002 for more on this topic).  
    *`landlocked`*: Dummy variable set equal to 1 for landlocked countries.    
    *`continent`*: Continent to which the country belongs.   
    *`langoff_i`*: Official or national languages and languages spoken by at least 20% of the population of the country (and spoken in another country of the world) following the same logic as the "open-circuit languages" in Mélitz (2002).  
    *`lang20_i`*: Languages (mother tongue, lingua francas or second languages) spoken by at least 20% of the population of the country.  
    *`lang9_i`*: Languages (mother tongue, lingua francas or second languages) spoken by between 9% and 20% of the population of the country.  
    *`colonizeri`*: Colonizers of the country for a relatively long period of time and with a substantial participation in the governance of the colonized country.  
    *`short_colonizeri`*: Colonizers of the country for a relatively short period of time or with only low involvement in the governance of the colonized country.
    
<b>Cities variables used in the computation of distances</b>  
    *`city_en, city_fr`*: Names of capitals or main cities of the country in English and French.   
    *`lat, lon`*: Latitude and longitude of the city.  
    *`cap`*: Variable equal to 1 if the city is the capital of the country, to 0 if the city is the most populated city (maincity equal to 1) but not the capital, and to 2 in the cases of two capitals, if the city is the most populated but the “second” capital or the previous capital.  
    *`maincity`*: Variable coded as 1 when the city is the most populated of the country and as 2 otherwise.  
    *`citynum`*: Number of cities for each country used to calculate our weighted distances described in the next section.  

In [None]:
# Load the CEPII country-level geographic table and preview it.
geo <- read.csv(file = "data/cepii_geo2.csv")

head(geo)

In [None]:
# List the columns available in the CEPII geographic table.
names(geo)

In [None]:
# Keep the ISO codes, name and area of the countries of interest.
# unique() collapses repeated rows left over after dropping the other columns.
area <- geo %>%
  filter(country %in% countries) %>%
  select(iso2, iso3, country, area) %>%
  unique()

area

#### Population + Area

In [None]:
# Combine area and population; merge() joins on the columns the two tables
# have in common.
area_pop <- merge(x = area, y = pop_2019)

area_pop

In [None]:
# Three-letter ISO codes of the selected countries (used to filter the
# language table below).
isos3 <- area[["iso3"]]

isos3

## Geographic distance
### Datasource: [CEPII](http://www.cepii.fr/CEPII/en/bdd_modele/presentation.asp?id=6)  

<b>Variables</b>  
    *`iso_o, iso_d`*: ISO codes in three characters, referring to the country of origin and destination, respectively.  
    *`dist`*: Geodesic distances from lat/long of most populous cities (simple distance).   
    *`distcap`*: geodesic distance between capital cities (simple distance).    
    *`distw`*: population weighted distance, theta = 1 (weighted distance).   
    *`distwces`*: population weighted distance, theta = -1, which corresponds to the usual coefficient estimated from gravity models of bilateral trade flows (weighted distance).  
    *`contig`*: dummy variable indicating whether the two countries are contiguous (share a land border).  
    *`comlang_off, comlang_ethno`*: dummy variable indicating whether the two countries share a common language. There are two common languages dummies, the first one based on the fact that two countries share a common official language, and the other one set to one if a language is spoken by at least 9% of the population in both countries.     
    *`colony`*: dummy variable indicating whether the two countries have ever had a colonial link.  
    *`comcol`*: dummy variable indicating whether the two countries have had a common colonizer after 1945.  
    *`curcol`*: dummy variable indicating whether the two countries are currently in a colonial relationship.  
    *`col45`*: dummy variable indicating whether the two countries have had a colonial relationship ~~after~~ 1945 (share common colonizer pre 1945).  
    *`smctry`*: dummy variable indicating whether the two countries were/are the same country.  

**[TO DO]: check Maciej distance**   
<b>Maciej</b>  
    *`dist_pop_weighted`*: population-weighted average distance between biggest cities  
    *`dist_biggest_cities`*: average distance between biggest cities  
^ most similar to distwces  
    *`dist_unweighted`*: average distance between (?) (not population weighted)  

In [None]:
# Load the CEPII bilateral distance table and preview it.
dists <- read.csv(file = "data/cepii_dist2.csv")

head(dists)

**New variable:**  
    *`shared_hist`*: dummy variable equal to 1 when at least one of *`colony, comcol, curcol, col45, smctry`* is non-zero for the pair of countries, and 0 otherwise.

In [None]:
# shared_hist is 1 when the five historical-link columns sum to a non-zero
# value and 0 otherwise. `.keep = "unused"` drops the five source columns,
# leaving only the combined indicator.
dists <- dists %>%
  mutate(
    shared_hist = as.numeric(colony + comcol + curcol + col45 + smctry != 0),
    .keep = "unused"
  )

head(dists)

#### Population + Area + Geographic distance  + Shared history

In [None]:
# Attach area/population to each side of the bilateral distance table.

# Destination side: match on iso_d and suffix the country attributes with _d.
area_pop_gd_d <- area_pop %>%
  select(-iso2) %>%
  merge(dists, by.x = "iso3", by.y = "iso_d") %>%
  rename(iso_d = iso3, destination = country, area_d = area, pop_d = population)

# Origin side: match on iso_o and suffix the country attributes with _o.
area_pop_gd_o <- area_pop %>%
  select(-iso2) %>%
  merge(dists, by.x = "iso3", by.y = "iso_o") %>%
  rename(iso_o = iso3, origin = country, area_o = area, pop_o = population)

# Join the two sides on every column they share.
area_pop_gdist <- merge(x = area_pop_gd_d, y = area_pop_gd_o)

head(area_pop_gdist)

## Language  
### Datasource: [CEPII](http://www.cepii.fr/CEPII/en/bdd_modele/presentation.asp?id=19)   

<b>Variables [Sarah's notes]</b>  
    *`col`*: common official language (0 or 1); 19 languages considered.    
    *`csl`*: p(two random people understand a common language) >= *`cnl`*.  
    *`cnl`*: p(two random people share a native language).  
    *`lp`*: lexical closeness of native languages; set to 0 when *`cnl`* is 1 or 0; also set to 0 if there is no dominant native language (e.g. India).  
    *`lp1`*: tree based. 4 possibilities, 2 languages belonging to:  
        0: separate family trees  
        0.25: different branches of same tree (English and French),  
        0.50: the same branch (English and German),  
        0.75: the same sub-branch (German and Dutch)  
    *`lp2`*: lexical similarity of 200 words, continuous scale 0-100 normalized *`lp1`*, *`lp2`* so coefficients are comparable to each other and *`col, prox1, prox2`* are unadjusted versions of *`lp1`* and *`lp2`*?  
  

In [None]:
# Load the CEPII bilateral language table and preview it.
langs <- read.csv(file = "data/cepii_language.csv")

head(langs)

In [None]:
# Keep only pairs where both origin and destination are selected countries.
langs <- langs %>%
  filter(iso_o %in% isos3, iso_d %in% isos3)

head(langs)

#### Population + Area + Geographic distance + Shared history +  Language

In [None]:
# Drop the index column `X` and the country-name columns from the language
# table, then join it to the geography/population/distance table on the
# columns they share.
area_pop_gdist_lang <- langs %>%
  select(-c(X, country_o, country_d)) %>%
  merge(area_pop_gdist)

head(area_pop_gdist_lang)

In [None]:
# Row/column count after adding the language variables.
dim(area_pop_gdist_lang)

## GDP (constant 2010) 2019  
### Datasource: [World Bank](https://databank.worldbank.org/source/world-development-indicators)

*`GDP`*: GDP (constant 2010) in 2019 in US$

GDP at purchaser's prices is the sum of gross value added by all resident producers in the economy plus any product taxes and minus any subsidies not included in the value of the products. It is calculated without making deductions for depreciation of fabricated assets or for depletion and degradation of natural resources. Data are in constant 2010 U.S. dollars. Dollar figures for GDP are converted from domestic currencies using 2010 official exchange rates. For a few countries where the official exchange rate does not reflect the rate effectively applied to actual foreign exchange transactions, an alternative conversion factor is used.

In [None]:
# Load the World Bank 2019 GDP (constant 2010) subset and preview it.
gdp <- read.csv(file = "data/subset-GDP_constant2010-wb2019.csv")

head(gdp)

In [None]:
# Attach GDP twice: once matched on the origin ISO code (GDP_o) and once on
# the destination ISO code (GDP_d).
area_pop_gdist_lang_gdp <- area_pop_gdist_lang %>%
  merge(rename(gdp, GDP_o = GDP), by.x = "iso_o", by.y = "iso_country") %>%
  merge(rename(gdp, GDP_d = GDP), by.x = "iso_d", by.y = "iso_country")

head(area_pop_gdist_lang_gdp)

In [None]:
# Row/column count after adding total GDP.
dim(area_pop_gdist_lang_gdp)

In [None]:
# Load the GDP-per-capita subset. This reuses the name `gdp`, replacing the
# total-GDP table loaded earlier.
gdp <- read.csv(file = "data/subset-GDPpercapita_constant2010-wb2019.csv")

head(gdp)

In [None]:
# Attach GDP per capita for the origin (GDP_percapita_o) and the destination
# (GDP_percapita_d); the value column in the per-capita file is named `GDP`.
area_pop_gdist_lang_gdp <- area_pop_gdist_lang_gdp %>%
  merge(
    rename(gdp, GDP_percapita_o = GDP),
    by.x = "iso_o", by.y = "iso_country"
  ) %>%
  merge(
    rename(gdp, GDP_percapita_d = GDP),
    by.x = "iso_d", by.y = "iso_country"
  )

head(area_pop_gdist_lang_gdp)

In [None]:
# Row/column count after adding GDP per capita.
dim(area_pop_gdist_lang_gdp)

## Migration (stock) 2019  
### Datasource: [UN](https://www.un.org/en/development/desa/population/migration/data/estimates2/estimates19.asp)

The dataset presents estimates of international migrants by age, sex and origin. Estimates are presented for 1990, 1995, 2000, 2005, 2010, 2015 and 2019 and are available for all countries and areas of the world. The estimates are based on official statistics on the foreign-born or the foreign population.   
International migrant stock - Total international migrant stock   

In [None]:
# Load the UN 2019 migrant-stock matrix; it is reshaped from wide to long in
# the next step.
migration_stock_wide <- read.csv(file = "data/subset-stocks-migration-un2019.csv")
# Missing cells are treated as zero migrants.
migration_stock_wide[is.na(migration_stock_wide)] <- 0

head(migration_stock_wide)

In [None]:
# Reshape to one row per (destination, origin) pair. Every column except
# destination, Total and Total16 is treated as an origin country.
migration_stock <- migration_stock_wide %>%
  pivot_longer(
    cols = -c(destination, Total, Total16),
    names_to = "origin",
    values_to = "m_stock"
  ) %>%
  select(destination, total_imm_in_d = Total, origin, m_stock) %>%
  # Column headers carry "." where the country name has a space
  # (presumably introduced by read.csv's name checking); turn them back.
  mutate(origin = chartr(".", " ", origin))

head(migration_stock)

In [None]:
# Add ISO3 codes to the migrant-stock pairs by matching country names.

# Destination side: country name -> iso_d.
area_pop_mig_d <- area_pop %>%
  select(country, iso3) %>%
  merge(migration_stock, by.x = "country", by.y = "destination") %>%
  rename(iso_d = iso3, destination = country)

# Origin side: country name -> iso_o.
area_pop_mig_o <- area_pop %>%
  select(country, iso3) %>%
  merge(migration_stock, by.x = "country", by.y = "origin") %>%
  rename(iso_o = iso3, origin = country)

# Join both sides on their shared columns so each pair carries both codes.
migration_stock <- merge(x = area_pop_mig_d, y = area_pop_mig_o)

head(migration_stock)

In [None]:
# Join the covariates with the migrant stocks and drop the destination-level
# immigrant total.
official_dataset <- area_pop_gdist_lang_gdp %>%
  merge(migration_stock) %>%
  select(-total_imm_in_d)

head(official_dataset)

In [None]:
# Row/column count after adding the migrant stocks.
dim(official_dataset)

## Migration (flow) 2015-2020  
### Datasource: [Abel, Guy; E. Cohen, Joel (2019)](https://figshare.com/collections/Bilateral_international_migration_flow_estimates_for_200_countries/4470464)

This collection contains bilateral international migration flow estimates for 200 countries for five-year periods between 1990 and 2020. 

In [None]:
# Load the Abel & Cohen bilateral flow estimates.
migration_flow <- read.csv(file = "data/abel_cohen_migration_flow.csv")
# Missing estimates are treated as zero.
migration_flow[is.na(migration_flow)] <- 0

head(migration_flow)

In [None]:
# Keep the period starting in 2015, align the ISO column names with the rest
# of the dataset, and prefix every flow estimate with "m_flow_".
migration_flow <- migration_flow %>%
  filter(year0 == 2015) %>%
  select(-year0) %>%
  rename(iso_d = dest, iso_o = orig) %>%
  rename_with(
    ~ paste0("m_flow_", .x),
    c(sd_drop_neg, sd_rev_neg, mig_rate, da_min_open, da_min_closed, da_pb_closed)
  )

head(migration_flow)

In [None]:
# Add the flow estimates, joining on the columns shared with the flow table.
official_dataset <- official_dataset %>%
  merge(migration_flow)

head(official_dataset)

In [None]:
# Row/column count after adding the migration flows.
dim(official_dataset)

## WVS
### Datasource: [WVS (2020)](https://www.worldvaluessurvey.org/wvs.jsp)



In [None]:
# Load the World Values Survey wave 7 subset and display it.
wvs <- read.csv(file = "data/subset-wvs-wave7-2020.csv")
wvs

In [None]:
# Cosine similarity between countries, from columns 2-3 of `wvs`.
# The matrix is transposed before cosine() so that each row of `wvs`
# (one country) becomes a column -- assumes cosine() compares columns, as
# lsa's implementation does; confirm if another package masks it.
wvs_cosine <- wvs[, 2:3] %>%
  as.matrix() %>%
  t() %>%
  cosine() %>%
  as.data.frame()
names(wvs_cosine) <- wvs$country
wvs_cosine$origin <- wvs$country


# Long format: one row per (origin, destination) pair.
wvs_cosine_longer <- wvs_cosine %>%
  pivot_longer(cols = !origin, names_to = "destination", values_to = "CS_wvs")
head(wvs_cosine_longer)

In [None]:
wvs <- read.csv("data/subset-wvs-wave7-2020.csv")

# Euclidean similarity: 1 - d / max(d), so the most distant pair gets 0 and
# identical countries get 1. The pairwise distance matrix is computed once
# and reused for both the numerator and the maximum (previously distance()
# was evaluated twice on the same input).
wvs_dist <- distance(as.matrix(wvs[, 2:3]), method = "euclidean")
wvs_euc <- as.data.frame(1 - wvs_dist / max(wvs_dist))
colnames(wvs_euc) <- wvs$country
wvs_euc["origin"] <- wvs$country


# Long format: one row per (origin, destination) pair.
wvs_euc_longer <- pivot_longer(wvs_euc, -c("origin"), values_to = "CS_euc_wvs", names_to = "destination")
head(wvs_euc_longer)

## Foursquare
### Datasource: [You Are What You Eat (and Drink): Identifying Cultural Boundaries by Analyzing Food and Drink Habits in Foursquare. Thiago H. Silva, Pedro O. S. Vaz de Melo, Jussara M. Almeida, Mirco Musolesi, Antonio A. F. Loureiro (2014)](https://www.aaai.org/ocs/index.php/ICWSM/ICWSM14/paper/viewPaper/8113)



In [None]:
# Load the Foursquare food-and-drink data and display it.
foursquare <- read.csv(file = "data/foursquare.csv")
foursquare

In [None]:
# Cosine similarity between countries, from columns 2-3 of `foursquare`.
# The matrix is transposed before cosine() so that each row (one country)
# becomes a column -- assumes cosine() compares columns, as lsa's
# implementation does; confirm if another package masks it.
foursquare_cosine <- foursquare[, 2:3] %>%
  as.matrix() %>%
  t() %>%
  cosine() %>%
  as.data.frame()
names(foursquare_cosine) <- foursquare$country
foursquare_cosine$origin <- foursquare$country


# Long format: one row per (origin, destination) pair.
foursquare_cosine_longer <- foursquare_cosine %>%
  pivot_longer(
    cols = !origin,
    names_to = "destination",
    values_to = "CS_foursquare"
  )
head(foursquare_cosine_longer)

In [None]:
foursquare <- read.csv("data/foursquare.csv")

# Euclidean similarity: 1 - d / max(d), so the most distant pair gets 0 and
# identical countries get 1. The pairwise distance matrix is computed once
# and reused for both the numerator and the maximum (previously distance()
# was evaluated twice on the same input).
foursquare_dist <- distance(as.matrix(foursquare[, 2:3]), method = "euclidean")
foursquare_euc <- as.data.frame(1 - foursquare_dist / max(foursquare_dist))
colnames(foursquare_euc) <- foursquare$country
foursquare_euc["origin"] <- foursquare$country


# Long format: one row per (origin, destination) pair.
foursquare_euc_longer <- pivot_longer(foursquare_euc, -c("origin"), values_to = "CS_euc_foursquare", names_to = "destination")
head(foursquare_euc_longer)

In [None]:
# Append the four survey/Foursquare similarity measures, one merge at a time;
# each merge joins on the columns shared with the accumulated table.
official_dataset <- official_dataset %>%
  merge(wvs_cosine_longer) %>%
  merge(wvs_euc_longer) %>%
  merge(foursquare_cosine_longer) %>%
  merge(foursquare_euc_longer)

head(official_dataset)

In [None]:
# Row/column count after adding the WVS and Foursquare similarities.
dim(official_dataset)

In [None]:
# Persist the official-statistics dataset (row names omitted).
write.csv(
  x = official_dataset,
  file = "data/subset-official-dataset.csv",
  row.names = FALSE
)

## Cultural Similarity 

#### symmetric:   

each country is represented by a vector corresponding to the top (almost) <b>400 dishes</b> (top 50 dishes in all the countries)

![CD](figs/heatmap-similarity-symmetric-tops-topK-cosine-reds.png)

In [None]:
# Load the symmetric similarity matrix. check.names = FALSE keeps the column
# headers exactly as written in the file.
cd_sym <- read.csv(
  file = "data/FB-food-cultural_similarity-symmetric.csv",
  check.names = FALSE
)
# The first column holds the row labels; give it an explicit name.
names(cd_sym)[1] <- "interests_from"

In [None]:
# Display the symmetric similarity matrix as loaded (wide format).
cd_sym

In [None]:
# Long format: the row label becomes `destination`, each former column header
# becomes `origin`, and the cell value becomes CS_symm.
cd_sym <- cd_sym %>%
  rename(destination = interests_from) %>%
  pivot_longer(cols = !destination, names_to = "origin", values_to = "CS_symm")

head(cd_sym)

In [None]:
# Row/column count of the long symmetric similarity table.
dim(cd_sym)

#### nonsymmetric:   
each country is represented by a vector corresponding to the <b>top 50 dishes from origin country</b>

![CDiv](figs/heatmap-similarity-topK-cosine-reds.png)

The **(non-symmetric)** Cultural Similarity between **origin (rows) and destination (columns)** represents how similar origin and destination are in terms of popular dishes from the country of **origin**.   

**Hypothesis:** Immigrants prefer to move to (host) countries culturally similar to their (home/previous) country.

In [None]:
# Load the non-symmetric similarity matrix. check.names = FALSE keeps the
# column headers exactly as written in the file.
cd_nonsym <- read.csv(
  file = "data/FB-food-cultural_similarity-nonsymmetric.csv",
  check.names = FALSE
)
# The first column holds the row labels; give it an explicit name.
names(cd_nonsym)[1] <- "interests_from"

In [None]:
# Read the matrix with rows as origin and columns as destination.
cd_food_o <- cd_nonsym %>%
  rename(origin = interests_from) %>%
  pivot_longer(
    cols = !origin,
    names_to = "destination",
    values_to = "CS_nonsymm_food_o"
  )

head(cd_food_o)

In [None]:
# Row/column count of the origin-based similarity table.
dim(cd_food_o)

The **(non-symmetric)** Cultural Similarity between **origin (rows) and destination (columns)** representing how similar origin and destination are in terms of popular dishes from the country of **destination**. 

In [None]:
# Same matrix read the other way round: rows as destination, columns as
# origin.
cd_food_d <- cd_nonsym %>%
  rename(destination = interests_from) %>%
  pivot_longer(
    cols = !destination,
    names_to = "origin",
    values_to = "CS_nonsymm_food_d"
  )

head(cd_food_d)

In [None]:
# Row/column count of the destination-based similarity table.
dim(cd_food_d)

In [None]:
# Combine the two non-symmetric readings, then add the symmetric measure;
# both merges join on the columns the tables share.
cd <- merge(x = cd_sym, y = merge(x = cd_food_o, y = cd_food_d))

cd

In [None]:
# Row/column count of the combined cultural-similarity table.
dim(cd)

#### Dataset: official statistics + FB cultural distance

In [None]:
# Join official statistics with the cultural-similarity measures on their
# shared columns. An earlier variant that lower-cased origin/destination
# before merging is intentionally not applied.
dataset <- official_dataset %>%
  merge(cd)

head(dataset)

In [None]:
# Row/column count of the merged dataset.
dim(dataset)

In [None]:
# Add indicator columns for each origin, then for each destination.
dataset <- dataset %>%
  dummy_cols(select_columns = "origin") %>%
  dummy_cols(select_columns = "destination")

In [None]:
# Row/column count after adding the origin/destination dummy columns.
dim(dataset)

In [None]:
# Persist the merged dataset with dummy columns (row names omitted).
write.csv(
  x = dataset,
  file = "data/subset-final-dataset.csv",
  row.names = FALSE
)

In [None]:
# Reload the saved dataset from disk and display it.
dataset <- read.csv(file = "data/subset-final-dataset.csv")

dataset

### LOG 10 some variables

**Migration stock/flow** 

In order to calculate `log10()` of my dependent variables, I will add an **offset (`offset=1`)** to all the observations in the *`m_stock`* column and in each of the *`m_flow_*`* columns, so that zero counts remain defined after the log transform.

In [None]:
# Shift the stock and every flow measure by 1 so that zero counts have a
# finite log10 in the next step.
dataset <- dataset %>%
  mutate(across(
    c(
      m_stock,
      m_flow_sd_drop_neg,
      m_flow_sd_rev_neg,
      m_flow_mig_rate,
      m_flow_da_min_open,
      m_flow_da_min_closed,
      m_flow_da_pb_closed
    ),
    ~ .x + 1.0
  ))

head(dataset)

In [None]:
# Append a log10_<name> column for each listed variable; the originals are
# kept. New columns are added in the order the variables are listed.
dataset <- dataset %>%
  mutate(across(
    c(
      m_flow_da_pb_closed,
      m_stock,
      pop_o,
      pop_d,
      area_o,
      area_d,
      GDP_o,
      GDP_d,
      GDP_percapita_o,
      GDP_percapita_d,
      distwces,
      distcap
    ),
    log10,
    .names = "log10_{.col}"
  ))

head(dataset)

In [None]:
# Display the country names taken from the population table.
countries

In [None]:
# Overwrite the saved dataset, now including the +1 offsets and log10 columns.
# NOTE(review): this is the same path the reload cell reads before the +1
# offset is added, so re-running from that cell after this write would apply
# the offset a second time -- confirm whether a separate output file is wanted.
write.csv(
  x = dataset,
  file = "data/subset-final-dataset.csv",
  row.names = FALSE
)