In [267]:
import numpy as np
import pandas as pd
#import plotly.plotly as py
#import plotly.graph_objs as go
import warnings
warnings.filterwarnings('ignore')

In [268]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [269]:
# Switch plotly into offline notebook mode so the iplot() calls below can render inline.
init_notebook_mode(connected=True)

For my final project, I will explore a dataset on the number of refugees around the world from 1975 until 2016, which I obtained from the United Nations data portal. I chose to analyze data on refugees because I majored in international relations as an undergraduate and studied the history of political regimes, conflict and war, which are the main reasons people become refugees. In addition, I feel very strongly about the refugee crisis because I worked directly with refugees from multiple countries in Africa, Europe and Asia to help them rebuild their lives in the United States. As I explore this data, I will also use auxiliary datasets, including world population and 2016 GDP data from the World Bank's website and geographic coordinates data which I found on simplemap.com. I believe these supplementary datasets will help me make more interesting and insightful plots.

# What types of questions were you hoping to explore with this data?

First of all, I would like to take a look at the historic trend for the number of refugees worldwide.

In [270]:
# Load the UN refugee records. The preview below shows the columns used throughout:
# country of asylum, country of origin, Year and Total Refugees.
df = pd.read_csv('UN_refugees_data.csv')

In [271]:
df.head()

Unnamed: 0,Country or territory of asylum or residence,Country or territory of origin,Year,Total Refugees
0,Afghanistan,Iraq,2016,1.0
1,Afghanistan,Islamic Rep. of Iran,2016,33.0
2,Afghanistan,Pakistan,2016,59737.0
3,Albania,China,2016,11.0
4,Albania,Dem. Rep. of the Congo,2016,3.0


In [272]:
# Worldwide refugee total per year, as a flat two-column frame (Year, Total Refugees).
df_tseries = df.groupby('Year', as_index=False)['Total Refugees'].sum()

In [273]:
# Line chart of the yearly worldwide totals held in df_tseries.
axis_title_font = dict(size=18, color='#7f7f7f')  # shared by both axis titles

trace = go.Scatter(
    x=df_tseries['Year'],
    y=df_tseries['Total Refugees'],
    mode='lines',
    line={'color': "#F28D28"},
    opacity=1,
)
data = [trace]

layout = {
    'title': '<b>Number of Refugees Worldwide </b>',
    'xaxis': go.layout.XAxis(
        title=go.layout.xaxis.Title(text='Year', font=axis_title_font)
    ),
    'yaxis': go.layout.YAxis(
        title=go.layout.yaxis.Title(text='Number of Refugees', font=axis_title_font)
    ),
}

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='Refugees in the World')


Although there has been an upward trend in the number of refugees since 2015, we also observe that the number of refugees in the world was at its maximum in the early 1990s.

Next, I will take a look at the distribution of refugees from 1995 to 2015 by country of origin. 

In [274]:
# Yearly totals seen from both sides of each refugee flow, joined per (Country, Year).
origin_key = 'Country or territory of origin'
asylum_key = 'Country or territory of asylum or residence'

# Refugees who left each country, per year.
total_ref_by_origin = (
    df.groupby([origin_key, 'Year'])['Total Refugees']
    .sum()
    .reset_index()
    .rename(index=str, columns={'Total Refugees': 'Total Refugees Abroad',
                                origin_key: 'Country'})
)

# Refugees hosted by each country, per year.
total_ref_by_asylum = (
    df.groupby([asylum_key, 'Year'])['Total Refugees']
    .sum()
    .reset_index()
    .rename(index=str, columns={'Total Refugees': 'Total Refugees in Country',
                                asylum_key: 'Country'})
)

# NOTE(review): the inner join keeps only (Country, Year) pairs that appear both as an
# origin and as an asylum country; pairs present on one side only are dropped.
combined = total_ref_by_asylum.merge(total_ref_by_origin, on=['Country', 'Year'], how='inner')

In [275]:
# One box per snapshot year, limited to countries with more than 100,000 refugees abroad.
snapshot_years = (1995, 2000, 2005, 2010, 2015)
over_100k = combined[combined['Total Refugees Abroad'] > 100000]

# b1..b5 hold the 'Total Refugees Abroad' values for each snapshot year, in order.
b1, b2, b3, b4, b5 = [
    over_100k.loc[over_100k['Year'] == yr, 'Total Refugees Abroad']
    for yr in snapshot_years
]

# Each box is labelled with its year.
trace1, trace2, trace3, trace4, trace5 = [
    go.Box(y=values, name=str(yr))
    for yr, values in zip(snapshot_years, (b1, b2, b3, b4, b5))
]

layout = go.Layout(
    title=go.layout.Title(
        text='<b>Distribution of Refugees Between 1995 and 2015</b>'
    ),
    xaxis={'title': 'Year'},
    yaxis={'title': 'Number of Refugees from Each Country'},
)

data = [trace1, trace2, trace3, trace4, trace5]

fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='boxplot')

Based on the boxplot above, the year 2000 has no outliers, suggesting that there was no major influx of refugees from any single country in 2000. Looking at the outliers for the rest of the years, we observe that there were almost 5 million refugees from one country in 2015, which demonstrates that the refugee crisis has worsened since 2000.

Let's focus now on countries of origin (where refugees came from) and countries of asylum (where refugees reside). I would like to explore the spread of refugees throughout the world. Specifically, I am interested in finding out which countries of asylum accepted refugees from the most countries in the world and which countries of origin have their denizens spread to the most countries in the world. 

In [276]:
# For every asylum country, count how many distinct origin countries its refugees
# came from, then keep the 20 asylum countries with the highest counts.
is_named_pair = ((df['Country or territory of asylum or residence'] != "Various")
                 & (df['Country or territory of origin'] != "Various"))
df_barplot = df[is_named_pair]  # the "Various" placeholder is excluded on both sides

countries_origin = df_barplot['Country or territory of origin'].unique().tolist()
countries_asylum = df_barplot['Country or territory of asylum or residence'].unique().tolist()

# (asylum country, number of distinct origin countries) pairs.
num_countries = [
    (host,
     df_barplot.loc[df_barplot['Country or territory of asylum or residence'] == host,
                    'Country or territory of origin'].unique().shape[0])
    for host in countries_asylum
]

num_countries_asylum = sorted(num_countries, key=lambda pair: pair[1], reverse=True)
top_20_asylum = num_countries_asylum[:20]  # Looking at the top 20 countries only

# Split the pairs into the parallel lists used as bar-chart axes.
country_asylum = [name for name, _ in top_20_asylum]
num_asylum = [count for _, count in top_20_asylum]

In [277]:
# Bar chart: distinct origin countries for each of the top-20 asylum countries.
asylum_bars = go.Bar(
    x=country_asylum,
    y=num_asylum,
    marker={'color': 'rgb(95, 149, 111)'},
)
data = [asylum_bars]

layout = go.Layout(
    title='<b>Countries that Accepted Refugees from the Most Countries of Origin</b>',
    xaxis={'title': 'Country of Asylum'},
    yaxis={'title': 'Number of Countries Refugees Came from'},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='barplot')

In [278]:
# For every origin country, count how many distinct asylum countries host its
# refugees, then keep the 20 origin countries with the highest counts.
num_countries = [
    (source,
     df_barplot.loc[df_barplot['Country or territory of origin'] == source,
                    'Country or territory of asylum or residence'].unique().shape[0])
    for source in countries_origin
]

num_countries_origin = sorted(num_countries, key=lambda pair: pair[1], reverse=True)
top_20_origin = num_countries_origin[:20]  # Looking at the top 20 countries only

# Parallel lists for the bar chart; the long DRC name is shortened for the axis label.
country_origin = [
    'DRC (Congo)' if name == 'Dem. Rep. of the Congo' else name
    for name, _ in top_20_origin
]
num_origin = [count for _, count in top_20_origin]

In [279]:
# Bar chart: distinct asylum countries for each of the top-20 origin countries.
origin_bars = go.Bar(
    x=country_origin,
    y=num_origin,
    marker={'color': 'rgb(95, 149, 111)'},
)
data = [origin_bars]

layout = go.Layout(
    title='<b>Countries Refugees from which Reside in the Most Countries of Asylum </b>',
    xaxis={'title': 'Country of Origin'},
    yaxis={'title': 'Number of Countries Refugees Live in'},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='barplot')

Let's now focus on the latest data from 2016. Is there a relationship between the number of refugees living in a country of asylum and the GDP of that country?

In [280]:
# Attach GDP figures to the 2016 per-country totals; rows with any missing value are dropped.
gdp = pd.read_csv('gdp_data.csv')
combined_2016 = combined[combined['Year'] == 2016]
gdp_2016 = combined_2016.merge(gdp, how='inner', on='Country').dropna()

In [281]:
gdp_2016.head()

Unnamed: 0,Country,Year,Total Refugees in Country,Total Refugees Abroad,GDP
0,Afghanistan,2016,59771.0,2488689.0,19543980000.0
1,Albania,2016,138.0,11065.0,13038540000.0
2,Algeria,2016,94232.0,3726.0,168000000000.0
3,Angola,2016,15555.0,8404.0,122000000000.0
4,Antigua and Barbuda,2016,4.0,83.0,1510085000.0


In [282]:
# Scatter of hosted refugees (x) against GDP (y) for every country in gdp_2016.
trace = go.Scatter(
    x=gdp_2016['Total Refugees in Country'],
    y=gdp_2016['GDP'],
    mode='markers',
)
data = [trace]

layout = go.Layout(
    title='<b> Relationship between GDP and Number of Refugees in Countries of Asylum </b>',
    xaxis={'title': 'Number of Refugees'},
    yaxis={'title': 'GDP'},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter')

In [283]:
# Rank countries by GDP, largest first. reset_index() relabels the rows 0..n-1 in that
# order and keeps the previous index as a column.
removed_outliers = gdp_2016.sort_values('GDP', ascending = False).reset_index()

In [284]:
# Label-based slice from label 4 onward: drops the rows labelled 0-3, i.e. the four
# largest economies after the GDP sort and index reset in the previous cell.
removed_outliers = removed_outliers.loc[4:,]

In [285]:
# Same scatter as before, drawn from removed_outliers (largest economies excluded).
trace = go.Scatter(
    x=removed_outliers['Total Refugees in Country'],
    y=removed_outliers['GDP'],
    mode='markers',
)
data = [trace]

layout = go.Layout(
    title='<b> Relationship between GDP and Number of Refugees in Countries of Asylum </b>',
    xaxis={'title': 'Number of Refugees'},
    yaxis={'title': 'GDP'},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-scatter')

We observe that even after removing the largest economies, there doesn't seem to be a clear linear relationship.

Next, I will look at the frequency distribution of refugees in 2016 across different countries. 

In [286]:
# 2016 host countries ordered by refugees hosted, keeping only those above 10,000.
df_hist = (
    combined_2016
    .sort_values('Total Refugees in Country', ascending=False)
    .loc[lambda frame: frame['Total Refugees in Country'] > 10000]
)


In [287]:
# Histogram of the per-country refugee totals held in df_hist.
refugees_per_host = df_hist['Total Refugees in Country']

histogram = go.Histogram(
    x=refugees_per_host,
    marker={'color': 'rgb(95, 149, 111)'},
)
data = [histogram]

layout = go.Layout(
    title='<b>Frequency Distribution of Refugees in 2016 Across Countries</b>',
    xaxis={
        'title': 'Number of Refugees',
        # The axis starts at the 10,000 cut-off and ends at the largest observed total.
        'range': [10000, refugees_per_host.max()],
    },
    yaxis={'title': 'Number of Countries Refugees Reside in'},
)

# TODO (original author's note): x axis should start from 10000; add text to display country names
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic histogram')

As we can see, there are approximately 5 countries where more than half a million refugees lived in 2016, while in most countries of asylum
the number of refugees is significantly below half a million.

Now, let's look at five countries with a historically high influx of refugees.

In [288]:
# Population table; the preview below shows one row per country and one column per year.
df_popul = pd.read_csv('population_by_country.csv')
df_popul.head()

Unnamed: 0,Country,1975,1976,1977,1978,1979,1980,1981,1982,1983,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Aruba,60657.0,60586.0,60366.0,60103.0,59980.0,60096.0,60567.0,61345.0,62201.0,...,101220.0,101353.0,101453.0,101669.0,102053.0,102577.0,103187.0,103795.0,104341.0,104822.0
1,Afghanistan,12590286.0,12840299.0,13067538.0,13237734.0,13306695.0,13248370.0,13053954.0,12749645.0,12389269.0,...,26616792.0,27294031.0,28004331.0,28803167.0,29708599.0,30696958.0,31731688.0,32758020.0,33736494.0,34656032.0
2,Angola,7682479.0,7900997.0,8130988.0,8376147.0,8641521.0,8929900.0,9244507.0,9582156.0,9931562.0,...,20997687.0,21759420.0,22549547.0,23369131.0,24218565.0,25096150.0,25998340.0,26920466.0,27859305.0,28813463.0
3,Albania,2404831.0,2458526.0,2513546.0,2566266.0,2617832.0,2671997.0,2726056.0,2784278.0,2843960.0,...,2970017.0,2947314.0,2927519.0,2913021.0,2905195.0,2900401.0,2895092.0,2889104.0,2880703.0,2876101.0
4,Andorra,30705.0,31777.0,32771.0,33737.0,34818.0,36067.0,37500.0,39114.0,40867.0,...,82683.0,83861.0,84462.0,84449.0,83751.0,82431.0,80788.0,79223.0,78014.0,77281.0


In [289]:
#df_popul.Country.unique()

In [290]:
# Shorten three country names in the population table (old name -> new name).
popul_renames = {
    'Iran, Islamic Rep.': 'Iran',
    'Syrian Arab Republic': 'Syria',
    'Congo, Dem. Rep.': 'Congo (DR)',
}
df_popul = df_popul.replace(popul_renames)

In [291]:
# Copy of the UN data with four long country names shortened (old name -> new name).
un_renames = {
    'Islamic Rep. of Iran': 'Iran',
    'Syrian Arab Rep.': 'Syria',
    'Dem. Rep. of the Congo': 'Congo (DR)',
    'United Rep. of Tanzania': 'Tanzania',
}
df_temp = df.replace(un_renames)

In [292]:
# Refugees hosted per (asylum country, year), ordered from the largest total down.
hosted_per_year = df_temp.groupby(
    ['Country or territory of asylum or residence', 'Year']
)['Total Refugees'].sum()
top_asylum = hosted_per_year.sort_values(ascending=False).reset_index()

# First 15 distinct asylum names in that ordering, i.e. ranked by their single largest yearly total.
top_15 = top_asylum['Country or territory of asylum or residence'].unique()[:15]

top_15

array(['Iran', 'Pakistan', 'Turkey', 'Somalia', 'Congo (DR)', 'Syria',
       'Germany', 'Sudan', 'Lebanon', 'Malawi', 'Uganda', 'Tanzania',
       'United States', 'Ethiopia', 'Various'], dtype=object)

In [293]:
# Yearly totals restricted to the asylum countries listed in top_15.
df_stack_area = top_asylum[top_asylum['Country or territory of asylum or residence'].isin(top_15)]

#year = [1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015]
# NOTE(review): unique() returns years in order of first appearance. top_asylum is sorted
# by refugee total, so this array is not chronological, and it pools the years of all
# selected countries rather than those of any single one.
year = df_stack_area.Year.unique()

In [294]:
# Attach each asylum country's population columns to its yearly refugee totals.
# The left join keeps every row of df_stack_area, even when no population row matches.
df_merged_pop = df_stack_area.merge(
    df_popul,
    how='left',
    left_on='Country or territory of asylum or residence',
    right_on='Country',
)



In [295]:
# Stacked area chart of refugees hosted over time by five asylum countries.
#
# Each trace takes its x (years) and y (totals) from the SAME per-country slice, sorted
# by year. Previously every trace used the shared `year` array as x. That array is ordered
# by first appearance in a frame sorted by refugee total (not chronologically) and pools
# the years of all countries, so it did not line up with the year-sorted y values.

# Palette the fill colours below were taken from; not referenced by the traces themselves.
colorscale=[[0, 'rgb(166,206,227)'], [0.25, 'rgb(31,120,180)'], [0.45, 'rgb(178,223,138)'],
                    [0.65, 'rgb(51,160,44)'], [0.85, 'rgb(251,154,153)'], [1, 'rgb(227,26,28)']]


def _asylum_area_trace(country, line_color, fill_color):
    """Build one stacked-area trace for `country` from df_stack_area.

    Parameters
    ----------
    country : str
        Value of 'Country or territory of asylum or residence' to plot.
    line_color, fill_color : str
        Colour strings for the outline and the filled area.

    Returns
    -------
    dict
        Trace description whose x (years) and y (totals) come from the same
        chronologically sorted rows, so they are always aligned and of equal length.
    """
    yearly = df_stack_area[
        df_stack_area['Country or territory of asylum or residence'] == country
    ].sort_values('Year')
    return dict(
        x=yearly['Year'].tolist(),
        y=yearly['Total Refugees'].tolist(),
        hoverinfo='x+y',
        mode='lines',
        line=dict(width=0.5, color=line_color),
        stackgroup='one',  # all five traces share one stack
        name=country,
        fillcolor=fill_color,
    )


trace0 = _asylum_area_trace('Germany', 'rgb(131, 90, 241)', 'rgb(166,206,227)')
trace1 = _asylum_area_trace('Congo (DR)', 'rgb(111, 231, 219)', 'rgb(31,120,180)')
trace2 = _asylum_area_trace('United States', 'rgb(184, 247, 212)', 'rgb(178,223,138)')
trace3 = _asylum_area_trace('Iran', 'rgb(184, 247, 212)', 'rgb(51,160,44)')
trace4 = _asylum_area_trace('Pakistan', 'rgb(184, 247, 212)', 'rgb(251,154,153)')


layout = go.Layout(
    title='<b>Countries of Asylum with Highest Number of Refugees</b>',
    xaxis=dict(
        title='Year'
    ),
    yaxis=dict(
        title='Number of Refugees'
    ),

)


# List order is the stacking order: the first trace is drawn at the bottom.
data = [trace1, trace2, trace0, trace4, trace3]

fig = dict(data=data,layout=layout)
iplot(fig, filename='stacked-area-plot-hover')

Based on the stacked area graph, we see that for all the countries except Iran, 2000 was the year with the highest number of refugees recorded in these countries. For Iran, the number of refugees has been consistently higher than for the other 4 countries.

Next, I will look at European countries.

In [296]:
# The file is read with ';' as separator and no header row, so columns are numbered
# from 0; the first column is taken as the list of European country names.
europe = pd.read_csv('europe_countries.csv', sep=';', header=None)

country_list = europe.iloc[:, 0].tolist()

In [297]:
# Fresh copy of the UN data with three European names shortened (old name -> new name).
europe_renames = {
    'The former Yugoslav Rep. of Macedonia': 'Macedonia',
    'Serbia (and Kosovo: S/RES/1244 (1999))': 'Serbia',
    'Russian Federation': 'Russia',
}
df_temp = df.replace(europe_renames)

In [298]:
# Keep only the rows whose asylum country appears in the European country list.
hosted_in_europe = df_temp['Country or territory of asylum or residence'].isin(country_list)
df_eur = df_temp[hosted_in_europe]

In [299]:
# Narrow the European rows to 2016 only.
df_eur = df_eur[(df_eur['Year'] == 2016)]

In [300]:
# Total refugees hosted by each European country (summed over all origins in df_eur).
country_refugee = df_eur.groupby(
    'Country or territory of asylum or residence', as_index=False
)['Total Refugees'].sum()

In [301]:
# Coordinates lookup table; later cells read its 'Country', 'Longitude' and 'Latitude' columns.
df_lon_lat = pd.read_excel('lon_lat.xlsx', sheet_name='Sheet1')

In [302]:
# Add coordinates to each European host country. The left join keeps countries that have
# no match in the lookup table (their coordinate columns are then missing).
df_bubble = country_refugee.merge(
    df_lon_lat,
    how='left',
    left_on='Country or territory of asylum or residence',
    right_on='Country',
)


In [347]:
# Bubble map of refugees hosted by European countries, grouped into size bands.
#
# Hover labels are built as a Series indexed like df_bubble and then selected per band
# with the band's own index. Previously the labels were written into the unrelated `df`
# frame and the whole `df['text']` column was passed to every trace, so labels did not
# correspond to the plotted points and `df` gained a stray column.
hover_text = ('Country: ' + df_bubble['Country'].astype(str)
              + '<br>Number of Refugees: ' + df_bubble['Total Refugees'].astype(str))

# (low, high) refugee-count bands, one colour and one legend entry per band.
limits = [(10000,30000),(30001,60000),(60001,100000),(100001,250000),(250001, 700000)]
colors = ["rgb(0,116,217)","rgb(255,65,54)","rgb(133,20,75)","rgb(255,133,27)","rgb(34,139,34)"]
cities = []
scale = 3000  # divisor turning refugee counts into marker sizes

for lim, color in zip(limits, colors):
    # between() includes both bounds by default; the boolean `inclusive=True` argument
    # is no longer accepted by recent pandas releases, so it is omitted.
    df_sub = df_bubble[df_bubble['Total Refugees'].between(lim[0], lim[1])]
    city = go.Scattergeo(
        locationmode = 'country names',
        lon = df_sub['Longitude'],
        lat = df_sub['Latitude'],
        text = hover_text.loc[df_sub.index],  # labels for exactly the rows in this band
        marker = go.scattergeo.Marker(
            size = df_sub['Total Refugees']/scale,
            color = color,
            line = go.scattergeo.marker.Line(
                width=0.5, color='rgb(40,40,40)'
            ),
            sizemode = 'area' ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    cities.append(city)

layout = go.Layout(
        title = go.layout.Title(
            text = 'Refugee Population in Europe in 2016 by Country'
        ),
        showlegend = True,
        geo = go.layout.Geo(
            projection = go.layout.geo.Projection(
                type='equirectangular'
            ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        )
    )

fig = go.Figure(data=cities, layout=layout)
# plot() writes the figure to a standalone HTML file instead of rendering it inline.
plot(fig, filename='bubble_map')

'file:///Users/anushkocharyan/Desktop/datavis_final/bubble_map.html'

In [352]:
# Heatmap of 2016 refugee counts between five selected countries
# (rows: country of origin, columns: country of asylum).

# Re-read the raw data so this cell does not depend on changes earlier cells made to df.
df = pd.read_csv('UN_refugees_data.csv')

df_2016 = df[(df['Year'] == 2016)]

# Drop the placeholder categories that do not name a specific country.
df_2016 = df_2016[df_2016['Country or territory of asylum or residence'] != 'Various']

df_2016 = df_2016[(df_2016['Country or territory of origin'] != 'Stateless') &
                 (df_2016['Country or territory of origin'] != 'Various')]

countries = ['Pakistan', 'Afghanistan', 'Islamic Rep. of Iran', 'Iraq', 'Turkey']

# Keep rows whose origin AND asylum are both in the list. The original applied the origin
# filter and then overwrote it with the asylum filter; only rows matching both are ever
# looked up below, so a single combined filter states the intent without changing the
# resulting matrix.
hm = df_2016[df_2016['Country or territory of origin'].isin(countries) &
             df_2016['Country or territory of asylum or residence'].isin(countries)]

hm = hm.replace(['Islamic Rep. of Iran'], ['Iran'])

countries = ['Pakistan', 'Afghanistan', 'Iran', 'Iraq', 'Turkey']

heatmap_matrix = []

for c_origin in countries:
    matrix = []  # one row of the heatmap: flows from c_origin to each asylum country
    for c_asylum in countries:
        # Look the pair up once and reuse the result for both the test and the value.
        flow = hm.loc[(hm['Country or territory of asylum or residence'] == c_asylum) &
                      (hm['Country or territory of origin'] == c_origin), 'Total Refugees']
        if c_origin == c_asylum or flow.empty:
            # The diagonal and pairs without a record are shown as 0.
            matrix.append(0)
        else:
            # First matching record, as in the original lookup.
            matrix.append(flow.tolist()[0])

    heatmap_matrix.append(matrix)


trace = go.Heatmap(z=heatmap_matrix,
                   x= countries,

                   y= countries)
data=[trace]

layout = go.Layout(
    title = go.layout.Title(
        text = '<b>Number of Refugees Across Countries of Asylums and Origins </b>'
    ),
    xaxis=dict(
        title='Country of Asylum',
        #tickangle=-45
    ),
    yaxis=dict(
        title='Country of Origin',
        #tickangle=-45
    ),

)


fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='labelled-heatmap')