# Exploring Migration Patterns using Digital Trace Data  

Lilongwe, Malawi, 21 May 2024 from 11:00 to 13:00.  

Organized by the [IUSSP Panel in Digital and Computational Demography](https://iussp.org/en/digital-and-computational-demography).

This training workshop will take place at the African Population Conference in Lilongwe, Malawi, on 21 May 2024 from 11:00 to 13:00.   
**Please register for this training workshop only if you are attending the UAPS conference.**  

Register [here](https://docs.google.com/forms/d/e/1FAIpQLSd2hEX9l8FACdzBqtrggkjImEDRz_83ZFnENCpeez_q86mGnw/viewform) for "*Exploring Migration Patterns using Digital Trace Data*" 

## Trainers: 
•    Carolina Coimbra Vieira, Max Planck Institute for Demographic Research (MPIDR)   
•    Ebru Şanlıtürk, Max Planck Institute for Demographic Research (MPIDR) 


In [None]:
#!/usr/bin/python3
import math
import numpy as np
import os
import glob
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mplcolor
import seaborn as sns
from mycolorpy import colorlist as mcp
from statsmodels.tsa.stattools import grangercausalitytests
from scipy.stats import spearmanr

In [None]:
# Notebook-wide matplotlib defaults: font sizes for axis labels, titles,
# tick labels, figure titles and legends, plus the default figure size.
plt.rcParams.update({
    "axes.labelsize": 12,
    "axes.titlesize": 12,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "figure.titlesize": 18,
    "legend.fontsize": 10,
    "figure.figsize": (12, 6),
})

In [None]:
# Plot colours keyed by series name. The two-letter keys are the language
# codes found in the "Language" column of the Wikipedia page-view files
# (the merging cell below also uses these keys as its language filter);
# the remaining keys name the non-Wikipedia series.
colors_langs_dict = {"en": "#666666", 
                    "uk": "#e7298a",
                    "pl": "#d95f02",
                    "ru": "#1b9e77",
                    "Ukrainian refugees": "#000000",
                    "DANE": "#000000"}

In [None]:
def plot_timeseries(df, color, title, note):
    """Draw ``df`` as a line chart, save it as a PNG and display it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; forwarded to ``DataFrame.plot``.
    color
        Colour specification forwarded to ``DataFrame.plot``.
    title : str
        Title placed on the current axes.
    note : str
        Suffix of the output file ``figs/series/timeseries-<note>.png``.
    """
    df.plot(color=color)
    plt.title(title)

    out_file = "figs/series/timeseries-" + note + ".png"
    plt.savefig(out_file, format="png", dpi=300, bbox_inches="tight")
    plt.show()

In [None]:
def subplot_timeseries(df, color, title, note):
    """Plot every column of ``df`` in its own subplot on a common y range.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per series; each column gets one subplot.
    color
        Colour specification forwarded to ``DataFrame.plot.line``.
    title : str
        Figure title; also appended to the output file name.
    note : str
        Part of the output file name
        ``figs/series/subplots-views-<note><title>.png``.
    """
    # DataFrame.max().max() skips NaN column maxima, whereas the builtin
    # max() over them returns an order-dependent result when a NaN is present.
    y_upper = df.max().max()

    axes = df.plot.line(subplots=True, color=color, lw=2, ylim=(0, y_upper))
    axes = axes.flat
    fig = axes[0].get_figure()

    fig.suptitle(title, fontsize=16)

    # Reduce each legend to a bare text label placed above its subplot.
    for ax in axes:
        ax.legend(handletextpad=-2.0, handlelength=0, loc='upper left',
                  bbox_to_anchor=(0.08, 1.1), frameon=False)

    plt.savefig("figs/series/subplots-views-" + note + title + ".png",
                format="png", bbox_inches="tight", dpi=300)
    plt.show()
    

## UNHCR data
The data from UNHCR consist of the daily number of Ukrainian refugees crossing the border from Ukraine to Poland since 24 February 2022.

In [None]:
# Load the UNHCR border-crossing file and index it by its parsed "Date" column.
df_official = pd.read_csv("data/data_unhcr_border_crossings_from_ukraine_day.csv", sep=",")
df_official = df_official.assign(
    Date=df_official["Date"].astype("datetime64[ns]")
).set_index("Date")

df_official

In [None]:
# Plot the UNHCR series; colours are looked up by column name in
# colors_langs_dict and the figure is written to
# figs/series/timeseries-UNHCR-data.png.
plot_timeseries(df_official, 
                color=colors_langs_dict, 
                title="Daily number of Ukrainian refugees crossing the border from Ukraine to Poland", 
                note="UNHCR-data")

## DANE data

In [None]:
# DANE table (presumably weekly applications per city, judging by the file
# name); the Polish column name for Warsaw is replaced by the English one.
df_dane = pd.read_csv(
    "data/data_dane_applications_cities_pol_week.csv", index_col=0
).rename(columns={"WARSZAWA": "WARSAW"})
df_dane

In [None]:
# Same layout as the table above but read from the "stocks" file; Warsaw is
# again renamed to its English spelling.
df_dane_stocks = pd.read_csv(
    "data/data_dane_applications_cities_pol_week_stocks.csv", index_col=0
).rename(columns={"WARSZAWA": "WARSAW"})
df_dane_stocks

In [None]:
# Rank the cities by the largest stock value each one reaches, highest first.
peak_stock_by_city = sorted(dict(df_dane_stocks.max()).items(),
                            key=lambda pair: pair[1], reverse=True)
rank_PESEL = pd.DataFrame.from_dict(
    {name.replace(" DANE", ""): peak for name, peak in peak_stock_by_city},
    orient="index")
rank_PESEL.columns = ["Max PESEL stocks"]
# "WARSAW" -> "Warsaw", so the index can be matched against other city labels.
rank_PESEL.index = [name.capitalize() for name in rank_PESEL.index]
rank_PESEL

## Wikipedia data
For each Polish city, we collected the data on the number of views on Wikipedia pages using the Wikimedia pageviews platform: https://pageviews.wmcloud.org/langviews/


### Biggest Polish cities

https://metropolie.pl/fileadmin/news/2022/07/Urban_hospitality_update.pdf
https://businessinsider-com-pl.translate.goog/wiadomosci/wiemy-do-ktorych-miast-dotarlo-najwiecej-uchodzcow-z-ukrainy/1cqkwe5?_x_tr_sl=pl&_x_tr_tl=en&_x_tr_hl=pl&_x_tr_pto=wapp

In [None]:
# The country-level entry ("Poland") followed by the Polish cities with more
# than 200k inhabitants, spelled with Polish diacritics except for "Warsaw".
# NOTE(review): this list is not referenced by any other cell visible in this
# notebook; the files actually processed come from the glob below.
polish_cities = ["Poland",
                 "Białystok", 
                 "Bydgoszcz", 
                 "Częstochowa", 
                 "Gdańsk", 
                 "Gdynia", 
                 "Gliwice", 
                 "Katowice", 
                 "Kielce", 
                 "Kraków", 
                 "Łódź", 
                 "Lublin", 
                 "Poznań", 
                 "Radom", 
                 "Sosnowiec", 
                 "Szczecin",
                 "Toruń",
                 "Warsaw", 
                 "Wrocław"] 

In [None]:
# Collect every per-city page-view CSV in the data folder.
# The result is de-duplicated and sorted: a bare list(set(...)) yields a
# different order from run to run (string hashing is randomised), which would
# make the order of figures and of merged columns irreproducible.
path = os.getcwd()
csv_files_preprocess = sorted(
    set(glob.glob(os.path.join(path, "data/data_wikipedia_views_cities_pol/*.csv")))
)

In [None]:
# Number of page-view CSV files that were found.
len(csv_files_preprocess)

In [None]:
# Language lookup table, reduced to the two code columns ("alpha3-b",
# "alpha2") and the English language name.
language_codes = pd.read_csv("data/language-codes-full.csv").loc[
    :, ["alpha3-b", "alpha2", "English"]
]

language_codes

In [None]:
def timeseries_relative_change(csv_files_preprocess, same_scale=False):
    """Plot, per city, the weekly relative change in Wikipedia page views.

    For every CSV file the per-date views of each language are summed into
    Monday-anchored weeks, restricted to 2020-02-24 .. 2023-08-24, and turned
    into a percentage change relative to the per-language median of the
    baseline period (weeks from 2020-02-24 up to, but excluding, 2020-08-24).
    One figure per city is saved as
    ``figs/series/relative-change-views-<city>.png`` and shown.

    Parameters
    ----------
    csv_files_preprocess : iterable of str
        Paths of the per-city page-view files. Each file must contain the
        columns ``Title``, ``Badges`` and ``Language``; all other column
        names are parsed as dates.
    same_scale : bool, optional
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    tuple of (dict, dict)
        ``deltas_wiki_views[city][language]`` is the sum of the relative
        change after 2022-02-24 minus the sum before that date.
        ``diff_median[city][language]`` is the median weekly views after
        2022-02-24 minus the median between 2020-08-24 and 2022-02-24.

    Notes
    -----
    Reads the notebook-level ``language_codes`` table and overwrites the
    global matplotlib settings ``figure.figsize`` and ``legend.fontsize``.
    """
    remaining_colors = ['#d95f02', '#d95f02', '#d95f02', '#d95f02', '#d95f02', '#d95f02', '#7570b3', '#1f78b4', '#e31a1c']

    plt.rcParams["figure.figsize"] = (5, 5)
    plt.rcParams['legend.fontsize'] = 14

    deltas_wiki_views = dict()
    diff_median = dict()

    for f in csv_files_preprocess:
        # File name without directory or extension. Both "\" and "/" are
        # treated as separators so the notebook also runs outside Windows.
        city_wikipedia = f.replace("\\", "/").rsplit("/", 1)[-1].split(".")[0]
        city = city_wikipedia.replace(",", " ")

        colors_langs_dict = {"English": "#666666",
                             "Ukrainian": "#e7298a",
                             "Russian": "#1b9e77"}

        df = pd.read_csv(f, low_memory=False)
        df = df.drop(columns=["Title", "Badges"])

        # Translate the language code into an English language name.
        # NOTE(review): the second merge also joins on the "English" column
        # produced by the first one, so it looks unable to resolve codes the
        # first merge missed - confirm the intent.
        df = df.merge(language_codes[["alpha2", "English"]], how="left", left_on="Language", right_on="alpha2").merge(language_codes[["alpha3-b", "English"]], how="left", left_on=["Language", "English"], right_on=["alpha3-b", "English"])
        # Fall back to the raw code when no name was found. Assigning the
        # result (instead of a chained in-place fillna on a column selection)
        # keeps this working under pandas Copy-on-Write.
        df["Wikipedia Page Language"] = df["English"].fillna(df["Language"])
        df = df.drop(columns=["English", "Language", "alpha2", "alpha3-b"])
        df = df.set_index("Wikipedia Page Language")

        official_langs = ["Polish"]
        for i, lang in enumerate(official_langs):
            if lang not in colors_langs_dict:
                colors_langs_dict[lang] = remaining_colors[i]

        # Only the languages that actually occur in this file are plotted.
        languages = [lang for lang in colors_langs_dict if lang in df.index]

        # Aggregate the per-date columns into weeks starting on Monday.
        df_weekly = df.T.reset_index()
        df_weekly = df_weekly.assign(Weeks=df_weekly['index']).drop(columns='index')
        df_weekly['Weeks'] = df_weekly['Weeks'].astype('datetime64[ns]')
        df_weekly = df_weekly.resample('W-Mon', label='left', closed='left', on='Weeks').sum()

        df_plot = df_weekly.T.loc[languages]
        df_plot.columns = df_plot.columns.astype('datetime64[ns]')
        df_plot = df_plot.T

        # Baseline + pre/post war period
        df_plot = df_plot[(df_plot.index >= "2020-02-24") & (df_plot.index <= "2023-08-24")]

        # Baseline period: 2020-02-24 up to (excluding) 2020-08-24. The
        # relative change is computed against its per-language median.
        df_plot_baseline = df_plot[df_plot.index < "2020-08-24"]
        dict_language_baseline = dict(df_plot_baseline.median())
        print("MEDIAN VIEWS BASELINE 24.02-24.08.2020:", dict_language_baseline)

        # A baseline of 0 (no views at all) would divide by zero below.
        dict_language_baseline = {lang: (1 if median == 0 else median)
                                  for lang, median in dict_language_baseline.items()}

        dict_language_median_pre = dict(df_plot[(df_plot.index >= "2020-08-24") & (df_plot.index < "2022-02-24")].median())
        print("MEDIAN VIEWS PRE WAR 24.02.2022:       ", dict_language_median_pre)
        dict_language_median_post = dict(df_plot[(df_plot.index > "2022-02-24")].median())
        print("MEDIAN VIEWS POST WAR 24.02.2022:      ", dict_language_median_post)

        df_plot_proportion = df_plot[df_plot.index >= "2020-08-24"]
        df_plot_proportion = (df_plot_proportion - dict_language_baseline) / dict_language_baseline * 100

        axes = df_plot_proportion.plot.line(subplots=True, color=colors_langs_dict, lw=2)
        axes = axes.flat
        fig = axes[0].get_figure()

        fig.suptitle(city_wikipedia + " - Relative change in the number of views\nBaseline period: 24.02.2020-24.08.2020", fontsize=14)

        # Extent of the dashed marker line. min().min() / max().max() ignore
        # NaN column extremes, and the values are the same for every subplot.
        y_low = int(df_plot_proportion.min().min())
        y_high = int(df_plot_proportion.max().max())

        for ax in axes:
            ax.legend(handletextpad=-2.0, handlelength=0, loc='upper left', bbox_to_anchor=(0.08, 1.1), frameon=False)
            # Dashed line at 2022-02-24, the date separating "pre" and "post".
            ax.vlines(x="2022-02-24", ymin=y_low, ymax=y_high, linestyles="dashed", color="gray")

        plt.savefig("figs/series/relative-change-views-" + city + ".png", format="png", bbox_inches="tight", dpi=300)
        plt.show()

        views_post_war = df_plot_proportion[df_plot_proportion.index > "2022-02-24"].sum()
        views_pre_war = df_plot_proportion[df_plot_proportion.index < "2022-02-24"].sum()

        deltas_wiki_views[city] = dict(views_post_war - views_pre_war)
        diff_median[city] = {
            lang: post - dict_language_median_pre.get(lang, 0)
            for lang, post in dict_language_median_post.items()
        }

    return deltas_wiki_views, diff_median

In [None]:
# Number of files that will be processed by the next cell.
len(csv_files_preprocess)

In [None]:
# Produce one relative-change figure per file and keep the two per-city,
# per-language summary dictionaries for the ranking cells below.
deltas_wiki_views, diffs_median = timeseries_relative_change(csv_files_preprocess, same_scale=True)

In [None]:
# Hide all four axes spines and set the figure size for the ranking plots.
plt.rcParams.update({
    "axes.spines.left": False,
    "axes.spines.right": False,
    "axes.spines.top": False,
    "axes.spines.bottom": False,
    "figure.figsize": (8, 6.5),
})

In [None]:
def plot_ranking(ranking_1, ranking_2, language="Ukrainian"):
    """Draw a slope chart comparing two rankings of the same locations.

    Each location is drawn as a line from its position in ``ranking_1``
    (left, labelled "PESEL stocks") to its position in ``ranking_2`` (right).
    The first entry of a ranking is drawn at the top. The figure is saved as
    ``figs/ranks/rank_polish_cities_<language>.png`` and shown.

    Parameters
    ----------
    ranking_1 : list of str
        Locations ordered from first to last; defines the y tick labels.
    ranking_2 : list of str
        The same locations in a second order. Every entry must also occur
        in ``ranking_1`` (a ``KeyError`` is raised otherwise).
    language : str, optional
        Used in the right-hand x tick label and in the output file name.
    """
    # The first five locations of ranking_1 are highlighted; the rest are grey.
    highlight_colors = ['#8e203c', '#c0272c', '#d8361a', '#ef7b00', '#faae3b']
    grey = '#808080'

    ranks = _rank_positions(ranking_1, ranking_2)

    # Keep the original 0..16 range for short rankings but grow it when there
    # are more locations, so the top-ranked lines are not cut off.
    plt.ylim(0, max(16, len(ranking_1) + 1))
    plt.xlim(1, 1.5)

    # One y tick per location at its left-hand position, lowest first.
    y_ticks_label, y_ticks = zip(*sorted(((location, position[0])
                                          for location, position in ranks.items()),
                                         key=lambda kv: kv[1]))
    plt.yticks(y_ticks, y_ticks_label, fontsize=18)
    plt.xticks([1, 1.5], ["PESEL stocks",
                          "Delta median relative change \n " + language + " Wikipedia views"], fontsize=18)

    # Remove the tick marks but keep the labels. Booleans are required here:
    # the strings "off"/"on" are both truthy and would leave the ticks visible.
    plt.tick_params(axis="both", which="both", bottom=False, top=False,
                    labelbottom=True, left=False, right=False, labelleft=True)

    for i, location in enumerate(ranking_1):
        color = highlight_colors[i] if i < len(highlight_colors) else grey
        # The first ten lines are drawn thicker than the rest.
        plt.plot([1, 1.5], ranks[location], lw=4 if i < 10 else 2, color=color)

        # Repeat the location name at the right end of its line.
        plt.text(1.55, ranks[location][1], location, fontsize=18, color=color)

    plt.tight_layout()
    plt.savefig("figs/ranks/rank_polish_cities_" + language + ".png", bbox_inches="tight", format="png", dpi=300)
    plt.show()


def _rank_positions(ranking_1, ranking_2):
    """Map each location to ``[left_y, right_y]``; the first entry of a ranking gets the largest y."""
    positions = {location: [len(ranking_1) - i] for i, location in enumerate(ranking_1)}
    for i, location in enumerate(ranking_2):
        positions[location].append(len(ranking_2) - i)
    return positions

In [None]:
# For every language: compare the PESEL ranking of the cities with the two
# Wikipedia-based measures (Spearman rank correlation) and draw the slope chart.
for language in ["Ukrainian", "Russian", "English", "Polish"]:
    # Cities by decreasing value for this language; the country-level
    # "Poland" entry is left out.
    ordered_deltas = sorted(deltas_wiki_views.items(),
                            key=lambda item: item[1][language], reverse=True)
    rank_wiki_deltas = pd.DataFrame.from_dict(
        {name: values[language] for name, values in ordered_deltas if name != 'Poland'},
        orient="index")
    rank_wiki_deltas.columns = ["Delta sum relative change"]

    ordered_medians = sorted(diffs_median.items(),
                             key=lambda item: item[1][language], reverse=True)
    rank_wiki_medians = pd.DataFrame.from_dict(
        {name: values[language] for name, values in ordered_medians if name != 'Poland'},
        orient="index")
    rank_wiki_medians.columns = ["Delta median relative change"]

    # Keep only the cities present in all three tables.
    ranks = rank_PESEL.merge(rank_wiki_deltas, left_index=True, right_index=True)
    ranks = ranks.merge(rank_wiki_medians, left_index=True, right_index=True)

    print(language)
    rho, p = spearmanr(ranks["Max PESEL stocks"], ranks["Delta sum relative change"])
    print("Delta sum", rho, p)

    rho, p = spearmanr(ranks["Max PESEL stocks"], ranks["Delta median relative change"])
    print("Delta median", rho, p)

    cities_by_pesel = ranks["Max PESEL stocks"].sort_values(ascending=False).index.tolist()
    cities_by_median = ranks["Delta median relative change"].sort_values(ascending=False).index.tolist()
    plot_ranking(cities_by_pesel, cities_by_median, language)

In [None]:
# Switch the four axes spines back on for the remaining figures.
plt.rcParams.update({
    "axes.spines.left": True,
    "axes.spines.right": True,
    "axes.spines.top": True,
    "axes.spines.bottom": True,
    "figure.figsize": (8, 6.5),
})

### PESEL

In [None]:
# Remove the row carrying the first index label, in place.
# NOTE: this cell is not idempotent - every re-run drops one more row.
df_dane.drop(index=df_dane.index[0], axis=0, inplace=True)
df_dane

In [None]:
# Label every city column as "<City> DANE" and replace the index by its
# datetime version, named "Date".
df_dane.columns = [f"{name.capitalize()} DANE" for name in df_dane.columns]
df_dane = df_dane.assign(
    Date=df_dane.index.astype("datetime64[ns]")
).set_index("Date", drop=True)
df_dane

In [None]:
# Largest single value across all DANE columns (maximum of the column maxima).
max(df_dane.max())

## Merging datasets: 
## Wikipedia views in Polish cities and Ukrainian refugees crossing the border to Poland 

In [None]:
# Start from the UNHCR series and attach, city by city, the page views of the
# languages listed in colors_langs_dict as columns named "<city> <code>".
df_merged = df_official

for f in csv_files_preprocess:
    # File name without directory or extension. Both "\" and "/" are treated
    # as separators so the notebook also runs outside Windows.
    city = f.replace("\\", "/").rsplit("/", 1)[-1].split(".")[0]

    df = pd.read_csv(f, low_memory=False)
    df = df.drop(columns=["Title", "Badges"])
    # Keep the selected languages only. Non-inplace calls avoid modifying a
    # filtered slice in place.
    df = df[df["Language"].isin(colors_langs_dict.keys())].set_index("Language")

    # Remaining column names are dates; after transposing, rows are dates
    # and columns are language codes.
    df.columns = df.columns.astype('datetime64[ns]')
    df = df.T

    ## COMBINING DATASETS: UNHCR + WIKIPEDIA VIEWS
    # how="right" keeps every date of the Wikipedia data, so UNHCR values
    # are NaN on dates without an official figure.
    df_merged = df_merged.merge(df.add_prefix(city + " "), how="right", left_index=True, right_index=True)

In [None]:
# Inspect the merged table (UNHCR series plus one column per city and language).
df_merged

In [None]:
# Column totals over all dates in the merged table.
df_merged.sum()

In [None]:
# UNHCR series on its own, restricted to the dates that have a value.
df_merged_official = df_merged[["Ukrainian refugees"]].dropna()
df_merged_official

In [None]:
# Quick look at the UNHCR series on a wide figure (no file is written).
plt.rcParams["figure.figsize"] = (15, 5)
plt.plot(df_merged_official["Ukrainian refugees"])

### Correlations

In [None]:
# One large annotated heatmap of the pairwise correlations between all
# columns of the merged table. The colour scale covers 0..1 only.
plt.rcParams["figure.figsize"] = (30, 30)

heatmap_title = (
    "Correlation between number of views on Wikipedia page \n"
    " dedicated to Polish cities across different languages and \n"
    " the number of Ukrainian refugees crossing the border \n"
    " from Ukraine to Poland during the war"
)

corr = df_merged.corr().round(2)
sns.heatmap(corr, annot=True, cmap="YlGnBu", vmin=0, vmax=1)
plt.title(heatmap_title)
plt.savefig("figs/corr/correlations.png", format="png", bbox_inches="tight", dpi=300)
plt.show()

In [None]:
# Black-and-white styling shared by all boxplots.
PROPS = {
    'boxprops':{'facecolor':'white', 'edgecolor':'black'},
    'medianprops':{'color':'black'},
    'whiskerprops':{'color':'black'},
    'capprops':{'color':'black'}
}

plt.rcParams["figure.figsize"] = (8, 4)

# Language codes (column suffixes) to plot and their display names
suffix_columns_to_plot = ['en', 'pl', 'ru', 'uk']
language_names = {'en': "English", 'pl': "Polish", 'ru': "Russian", 'uk': "Ukrainian"}

# One subplot per language, side by side
fig, axes = plt.subplots(ncols=len(suffix_columns_to_plot))

# For each language: distribution over cities of the correlation between the
# city's page views and the number of refugees.
for position, (c, axis) in enumerate(zip(suffix_columns_to_plot, axes)):

    # Columns are named "<city> <code>". Matching the suffix avoids picking
    # up columns whose *city name* merely contains the code (e.g. "ru" in
    # "Toruń en").
    language_columns = [col for col in df_merged if col.endswith(" " + c)]
    df_aux = df_merged[["Ukrainian refugees"] + language_columns].dropna()

    # First column of the correlation matrix without its first row (the
    # self-correlation); every city is kept, including the last one.
    corrs = df_aux.corr().iloc[1:, 0].tolist()
    sns.boxplot(data=corrs, ax=axis, **PROPS)

    name = language_names[c]
    if position == 0:
        # Only the left-most subplot carries the y axis label and ticks.
        axis.set(ylabel="Correlation", xlabel=name, xticklabels=[name], xticks=[], ylim=(-0.3, 0.7))
    else:
        axis.set(yticks=[], xlabel=name, xticklabels=[name], xticks=[], ylim=(-0.3, 0.7))

    axis.axhline(0, ls='--', c='gray')


plt.savefig("figs/corr/corr-boxplot.png", format="png", bbox_inches="tight", dpi=300)
plt.show()

In [None]:
# Correlation matrix of the refugee series and the English page views of every
# city. Columns are matched on the " en" suffix, not on the substring "en",
# so a city name that happens to contain those letters is not selected.
df_merged[["Ukrainian refugees"] + [col for col in df_merged if col.endswith(" en")]].dropna().corr()

In [None]:
# Correlation of each city's English page views with the refugee series:
# first column of the matrix without the self-correlation in row 0. The slice
# is [1:], not [1:-1], so the last city is not dropped.
df_merged[["Ukrainian refugees"] + [col for col in df_merged if col.endswith(" en")]].dropna().corr().iloc[1:, 0].tolist()

In [None]:
# For every city: (1) a four-row figure overlaying the refugee series with
# the page views in Polish, English, Russian and Ukrainian, and (2) a
# correlation heatmap of those five series.
for f in csv_files_preprocess:

    plt.rcParams["figure.figsize"] = (8, 8)

    # File name without directory or extension. Both "\" and "/" are treated
    # as separators so the notebook also runs outside Windows.
    city = f.replace("\\", "/").rsplit("/", 1)[-1].split(".")[0]
    # Merged columns are named "<city> <code>". Matching the full prefix
    # (including the space) keeps cities apart when one name starts with
    # another, and stripping it by length works for multi-word names.
    prefix = city + " "

    filter_col = ["Ukrainian refugees"] + [col for col in df_merged if col.startswith(prefix)]

    df = df_merged[filter_col]
    df.columns = [c[len(prefix):] if c.startswith(prefix) else c for c in df.columns]

    # Plotting range: from 2022-01-01 onwards
    df_plot = df["2022-01-01":]

    fig, axes1 = plt.subplots(4, 1)

    # Left y axis of every row: the refugee series on a fixed 0..150000 scale;
    # the first row carries the figure title.
    for row, ax in enumerate(axes1):
        title_kwargs = {"title": "Wikipedia page: " + city} if row == 0 else {}
        df_plot[["Ukrainian refugees"]].plot.line(
            ax=ax, color=colors_langs_dict, ylim=(0, 150000), legend=None, **title_kwargs)

    axes1[3].set_xlabel('Time (days)')
    axes1[1].set_ylabel('Number of Ukrainian refugees (source: UNHCR)', color=colors_langs_dict["Ukrainian refugees"])

    # Right y axis of every row: a twin axes sharing the same x axis.
    twin_axes = [ax.twinx() for ax in axes1]

    views_by_row = [("pl", "Polish"), ("en", "English"), ("ru", "Russian"), ("uk", "Ukrainian")]
    for row, ((code, language_name), twin) in enumerate(zip(views_by_row, twin_axes)):
        # x ticks are hidden on all rows except the bottom one.
        tick_kwargs = {} if row == len(views_by_row) - 1 else {"xticks": []}
        df_plot[[code]].plot.line(
            ax=twin, color=colors_langs_dict, ylim=(0, df_plot[code].max()*1.1), legend=None, **tick_kwargs)
        twin.set_ylabel('Number of views \n (' + language_name + ')', color=colors_langs_dict[code])
        twin.tick_params(axis='y', labelcolor=colors_langs_dict[code])

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.savefig("figs/series/plot" + city + ".png", format="png", bbox_inches="tight", dpi=300)
    plt.show()


    plt.rcParams["figure.figsize"] = (5, 4)
    # Non-inplace rename: df_plot is a slice of df, and renaming a slice in
    # place triggers pandas' SettingWithCopy warning.
    df_plot = df_plot.rename(columns={"Ukrainian refugees": "Ukrainian refugees \n in Poland (UNHCR)",
                                      "pl": "Wikipedia views \n in Polish",
                                      "en": "Wikipedia views \n in English",
                                      "ru": "Wikipedia views \n in Russian",
                                      "uk": "Wikipedia views \n in Ukrainian"})

    # Correlations over the dates where all five series have a value.
    corr = df_plot.dropna().corr()
    sns.heatmap(corr, cmap="YlGnBu", annot=True, vmin=0, vmax=1)
    plt.savefig("figs/corr/corr-" + city + ".png", format="png", bbox_inches="tight", dpi=300)
    plt.show()

### Granger causality

The null hypothesis for `grangercausalitytests` is that the time series in the second column, x2, does NOT Granger-cause the time series in the first column, x1. Granger causality means that past values of x2 have a statistically significant effect on the current value of x1, taking past values of x1 into account as regressors. We reject the null hypothesis that x2 does not Granger-cause x1 if the p-values are below the chosen significance level.

https://www.machinelearningplus.com/time-series/granger-causality-test-in-python/


The Granger Causality test is used to determine whether or not one time series is useful for forecasting another.

This test uses the following null and alternative hypotheses:

Null Hypothesis (H0): Time series x does not Granger-cause time series y

Alternative Hypothesis (HA): Time series x Granger-causes time series y

The term “Granger-causes” means that knowing the value of time series x at a certain lag is useful for predicting the value of time series y at a later time period.

This test produces an F test statistic with a corresponding p-value. If the p-value is less than a chosen significance level (e.g. α = .05), then we can reject the null hypothesis and conclude that we have sufficient evidence to say that time series x Granger-causes time series y.

https://www.statology.org/granger-causality-test-in-python/


In [None]:
# For every city: pairwise Granger-causality F-tests (lag 3) between the
# refugee series and the page views per language during the war period,
# shown as a heatmap. Cell (row, column) tests whether the *column* series
# Granger-causes the *row* series.
for f in csv_files_preprocess:

    plt.rcParams["figure.figsize"] = (5, 4)

    # File name without directory or extension. Both "\" and "/" are treated
    # as separators so the notebook also runs outside Windows.
    city = f.replace("\\", "/").rsplit("/", 1)[-1].split(".")[0]
    # Merged columns are named "<city> <code>". Match and strip the full
    # prefix so multi-word city names and shared name prefixes are handled.
    prefix = city + " "

    filter_col = ["Ukrainian refugees"] + [col for col in df_merged if col.startswith(prefix)]

    df = df_merged[filter_col]
    df.columns = [c[len(prefix):] if c.startswith(prefix) else c for c in df.columns]

    ## WAR PERIOD
    # Non-inplace rename: renaming a slice in place triggers pandas'
    # SettingWithCopy warning.
    df = df["2022-02-24":"2023-03-07"].rename(
        columns={"Ukrainian refugees": "Ukrainian refugees \n in Poland (UNHCR)",
                 "pl": "Wikipedia views \n in Polish",
                 "en": "Wikipedia views \n in English",
                 "ru": "Wikipedia views \n in Russian",
                 "uk": "Wikipedia views \n in Ukrainian"})

    # Square result tables labelled by series name: F statistics (NaN on the
    # diagonal) and the annotation text. The text table is created with
    # object dtype, so no strings are written into a float frame.
    series_names = df.columns
    df_dot_f = pd.DataFrame(np.nan, index=series_names, columns=series_names)
    labels = pd.DataFrame("", index=series_names, columns=series_names)

    for r in series_names:
        for c in series_names:
            if r == c:
                continue

            # Result for lag 3; "params_ftest" starts with (F, p-value).
            g_causality = grangercausalitytests(df[[r, c]], maxlag=[3], verbose=False)[3][0]["params_ftest"]
            fscore, pvalue = g_causality[0], g_causality[1]

            df_dot_f.loc[r, c] = fscore
            labels.loc[r, c] = str(round(fscore, 2)) + "\n(" + str(round(pvalue, 2)) + ")"  # F on top, p-value in brackets

    # max().max() ignores the NaN diagonal when scaling the colour bar.
    sns.heatmap(df_dot_f, cmap="YlGnBu", annot=labels, fmt='', vmin=0, vmax=df_dot_f.max().max() + 1)
    plt.savefig("figs/granger/granger-f-" + city + ".png", format="png", bbox_inches="tight", dpi=300)
    plt.show()
