In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.express as px

<h4>Данные</h4>

<p>UNESCO Bulk Data Download Service can be accessed here: <a href>https://uis.unesco.org/bdds</a> 
<br> Labels list is analyzed and students inbound indicators are extracted into distinct csv file named <i>OPRI_student_labels.csv</i>
<br>UNESCO Data Service is also availabla at https://data.uis.unesco.org/#
<p>UNESCO UIS DataSet for 2017 and 2022 lacks data from USA, Mexico, Netherlands, Iran, Kazakhstan and some other countries. They are taken from the following sources:
<br> USA, 2022 - <a href>https://opendoorsdata.org</a> (by source countries)
<br>China, 2022 - http://www.moe.gov.cn/jyb_xwfb/gzdt_gzdt/s5987/201904/t20190412_377692.html taken pre-pandemic for distinct countries, total from http://en.moe.gov.cn/documents/statistics/2022/national/202401/t20240110_1099530.html for 2022
<br>China, 2017 - are from 2018 https://docs.aiddata.org/reports/corridors-of-power.html#figure4
<br>Philippines
<br>Israel
<br> Netherlands, 2022 - https://www.nuffic.nl/sites/default/files/2023-06/incoming-degree-mobility-in-dutch-higher-education-2022-2023.pdf (total and by source countries where provided)
<br> Mexico, 2022 - https://www.iie.org/research-initiatives/project-atlas/explore-data/mexico-2/ (total inbound students)
<br> Iran, 2022 - https://tehrantimes.com/news/483478/Students-from-91-countries-studying-in-Iranian-universities (total inbound students)
<br> Kazakhstan, 2022 - https://enic-kazakhstan.edu.kz/uploads/additional_files_items/193/file/tadzhikistan.pdf?cache=1691397957, https://enic-kazakhstan.edu.kz/uploads/additional_files_items/156/file/mezhdunarodnoe-sotrudnichestvo-sng.pdf?cache=1677496628, https://enic-kazakhstan.edu.kz/uploads/additional_files_items/138/file/3-1-monitoring-akademicheskoy-mobilnosti-2022-rus.pdf?cache=1672983807 (total and by source countries where provided)
<br> Egypt, 2017 - https://enterprise.press/stories/2019/12/16/how-egypt-is-positioning-itself-as-an-educational-hub-for-international-students-8512/ (total inbound students)
<br> Thailand, 2017 - https://apheit.bu.ac.th/jounal/Inter-vol8-1/นานาชาติ_บทความวิจัย_2.pdf (total inbound students)


In [415]:
country_names = pd.read_csv(r"C:\Users\yuliy\Documents\UNESCO Education\OPRI_COUNTRY.csv")
print(country_names.columns)
country_names.head(2)

Index(['COUNTRY_ID', 'COUNTRY_NAME_EN'], dtype='object')


Unnamed: 0,COUNTRY_ID,COUNTRY_NAME_EN
0,AFG,Afghanistan
1,ALB,Albania


In [416]:
labels = pd.read_csv(r"C:\Users\yuliy\Documents\UNESCO Education\OPRI_students_label.csv")
print(labels.columns)
labels.head(2)

Index(['INDICATOR_ID', 'INDICATOR_LABEL_EN'], dtype='object')


Unnamed: 0,INDICATOR_ID,INDICATOR_LABEL_EN
0,26421,"Africa: Students from Algeria, both sexes (num..."
1,26422,"Africa: Students from Angola, both sexes (number)"


In [417]:
national = pd.read_csv(r"C:\Users\yuliy\Documents\UNESCO Education\OPRI_DATA_NATIONAL.csv")
print(national.columns)
national.head(2)


Columns (0,4) have mixed types. Specify dtype option on import or set low_memory=False.



Index(['indicator_id', 'country_id', 'year', 'value', 'magnitude',
       'qualifier'],
      dtype='object')


Unnamed: 0,indicator_id,country_id,year,value,magnitude,qualifier
0,10,ABW,1998,0.0,,
1,10,ABW,1999,0.0,,


In [418]:
#Add USA inbound students
usa_add_long = pd.read_csv("usa_inbound.csv")
usa_add_long['to'] = 'United States of America'
usa_add_long['year'] = '2022'
usa_add_long = usa_add_long.rename(columns={'unesco':'from'})
usa_add_long = usa_add_long[['from','to','year','2022']]
usa_add_long.columns = ['from','to','year','value']
usa_add_long = usa_add_long[usa_add_long['from']!='#Н/Д']
usa_add_long['value'] = usa_add_long['value'].fillna(0).astype(int)
usa_add_long['country_id'] = 'USA'
usa_add_long.head(2)

Unnamed: 0,from,to,year,value,country_id
0,Afghanistan,United States of America,2022,682,USA
1,Albania,United States of America,2022,1087,USA


In [419]:
#Add Kazakhstan inbound students
kaz_add_long = pd.read_csv("kazakhstan_inbound.csv")
kaz_add_long['year'] = kaz_add_long['year'].astype(str)
kaz_add_long['country_id'] = 'KAZ'
kaz_add_long.head(2)

Unnamed: 0,from,to,year,value,country_id
0,Uzbekistan,Kazakhstan,2022,9571.0,KAZ
1,Tajikistan,Kazakhstan,2022,361.0,KAZ


In [None]:
#Addd Netherlands inbound students
nld_add_long = pd.read_csv("nld_inbound.csv")
nld_add_long  = nld_add_long.dropna()
nld_add_long['year'] = nld_add_long['year'].astype(str)
nld_add_long['country_id'] = 'NLD'
nld_add_long.head(2)

Unnamed: 0,from,to,year,value,country_id
18,Belgium,Netherlands,2022,4780.0,NLD
29,Bulgaria,Netherlands,2022,5112.0,NLD


In [444]:
china_add_long = pd.read_csv("china_inbound.csv")
china_add_long  = china_add_long.dropna()
china_add_long['year'] = china_add_long['year'].astype(str)
china_add_long['country_id'] = 'CHN'
china_add_long.head(2)

Unnamed: 0,from,to,year,value,country_id
0,India,China,2017,23198,CHN
1,India,China,2022,23198,CHN


inbound отчеты - обработка и добавление отсутствующих стран

In [421]:
#Extract labels and countru names from UIS dataset
from_to_labels = labels[~labels['INDICATOR_ID'].isin([26637,26638,43188])]
from_to_labels['from'] = from_to_labels['INDICATOR_LABEL_EN'].apply(lambda x:x.split('from ')[1].split(', both sexes (number)')[0].replace('the ',''))
from_to_labels.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,INDICATOR_ID,INDICATOR_LABEL_EN,from
0,26421,"Africa: Students from Algeria, both sexes (num...",Algeria
1,26422,"Africa: Students from Angola, both sexes (number)",Angola
2,26423,"Africa: Students from Benin, both sexes (number)",Benin
3,26424,"Africa: Students from Botswana, both sexes (nu...",Botswana
4,26425,"Africa: Students from Burkina Faso, both sexes...",Burkina Faso


In [422]:
total_from_to = national[national['indicator_id'].isin(from_to_labels['INDICATOR_ID'].unique())]
total_from_to = pd.merge(total_from_to,from_to_labels,left_on='indicator_id',right_on='INDICATOR_ID',how='left')
total_from_to = pd.merge(total_from_to,country_names,left_on='country_id',right_on='COUNTRY_ID',how='left')
total_from_to = total_from_to.rename(columns={'COUNTRY_NAME_EN':'to'})
total_from_to =total_from_to[['indicator_id','country_id','year','value','from','to']]
total_from_to['value'] = total_from_to['value'].astype(int)
print(total_from_to.shape)
total_from_to.head()

(453280, 6)


Unnamed: 0,indicator_id,country_id,year,value,from,to
0,26421,ABW,2003,0,Algeria,Aruba
1,26421,ABW,2004,0,Algeria,Aruba
2,26421,ABW,2006,0,Algeria,Aruba
3,26421,ABW,2009,0,Algeria,Aruba
4,26421,ABW,2010,0,Algeria,Aruba


In [423]:
canada_in = total_from_to[((total_from_to['to'] == 'Canada') & (total_from_to['year'] == 2022))]

In [424]:
total_from_to[((total_from_to['to'] == 'Netherlands') & (total_from_to['year'] == 2022))]

Unnamed: 0,indicator_id,country_id,year,value,from,to
372156,26599,NLD,2022,0,Netherlands,Netherlands


In [425]:
total_from_to[((total_from_to['to'] == 'Kazakhstan') & (total_from_to['year'] == 2022))]

Unnamed: 0,indicator_id,country_id,year,value,from,to


In [426]:
#delete empty 2022 rows for the USA, Netherlands, Mexico
total_from_to['year'] = total_from_to['year'].astype(str)
total_from_to = total_from_to[~((total_from_to['to'] == 'United States of America') & (total_from_to['year'] == '2022'))]
total_from_to = total_from_to[~((total_from_to['to']=='Netherlands') & (total_from_to['year']=='2022'))]
total_from_to = total_from_to[~((total_from_to['to']=='Kazakhstan') & (total_from_to['year']=='2022'))]
print(total_from_to.shape)
total_from_to.head(2)

(453278, 6)


Unnamed: 0,indicator_id,country_id,year,value,from,to
0,26421,ABW,2003,0,Algeria,Aruba
1,26421,ABW,2004,0,Algeria,Aruba


In [427]:
print(total_from_to.shape)
#Add absent countries into df
total_from_to = pd.concat([total_from_to,usa_add_long,kaz_add_long,nld_add_long])
print(total_from_to.shape)
total_from_to.head(2)

(453278, 6)
(453549, 6)


Unnamed: 0,indicator_id,country_id,year,value,from,to
0,26421,ABW,2003,0.0,Algeria,Aruba
1,26421,ABW,2004,0.0,Algeria,Aruba


International Education Market

In [None]:
total_years = total_from_to[~total_from_to['year'].isin(['1998','1999'])].groupby(['year','to'])['value'].sum().reset_index()

new_data = [
    {'to': 'Egypt', 'year': '2017', 'value': 51162},
    {'to': 'Thailand', 'year': '2017', 'value': 43821},
    {'to': 'Mexico', 'year': '2022', 'value': 51659},
    {'to': 'Iran', 'year': '2022', 'value': 94406},
    {'to': 'Ukraine', 'year': '2022', 'value': 50870},
]

new_data_df = pd.DataFrame(new_data)

total_years = pd.concat([total_years,new_data_df])

total_years.loc[(total_years['to']=='Kazakhstan') & (total_years['year'] == '2022'),'value']=26080 #replace as inbound by countries is less than known sum
total_years.loc[(total_years['to']=='China') & (total_years['year'] == '2022'),'value'] = 253177 #replace as inbound by countries is less than known sum http://en.moe.gov.cn/documents/statistics/2022/national/202401/t20240110_1099530.html
total_years.loc[(total_years['to']=='China') & (total_years['year'] == '2017'),'value'] = 489172
total_years.loc[(total_years['to']=='Netherlands') & (total_years['year'] == '2022'),'value']=122287 #replace as inbound by countries is less than known sum
# china = pd.DataFrame({'year':['2017','2022'],'to':['China','China'],'value':[]})

# <br>China, 2017
# <br>Philippines
# <br>Israel
total_years[total_years['year'].isin(['2017','2022'])]

Unnamed: 0,year,to,value
2091,2017,Algeria,0.0
2092,2017,Andorra,182.0
2093,2017,Argentina,85945.0
2094,2017,Armenia,4138.0
2095,2017,Australia,370409.0
...,...,...,...
0,2017,Egypt,51162.0
1,2017,Thailand,43821.0
2,2022,Mexico,51659.0
3,2022,Iran,94406.0


<h4>Inbound by Countries</h4>

На основании отчетов inbound в разрезе страна-страна

In [433]:
#Filter and unpivot df to compare countries year to year, and do some cleaning
countries = total_from_to[total_from_to['year'].isin(['2017','2022'])]
countries = countries[countries['from']!=countries['to']]
countries = countries.drop_duplicates()
countries = countries.pivot(index=['from','to'], columns='year', values='value').reset_index()
countries = countries[countries['from']!='unknown countries']
countries['from'] = countries['from'].apply(lambda x:x.strip())
countries['to'] = countries['to'].apply(lambda x:x.strip())
countries.loc[countries['from']=='Iran (Islamic Republic of)','from']='Iran'
countries.loc[countries['from']=='Islamic Republic of Iran','from']='Iran'
countries.loc[countries['from']=='United Kingdom of Great Britain and Northern Ireland','from']='United Kingdom'
countries.loc[countries['from']=='China, Hong Kong Special Administrative Region','from']='Hong Kong'
countries.loc[countries['from']=='Hong Kong, Special Administrative Region of China','from']='Hong Kong'
countries.loc[countries['from']=='China, Macao Special Administrative Region','from']='Macao'
countries.loc[countries['from']=='Macao, Special Administrative Region of China','from']='Macao'
countries.loc[countries['from']=='United States of America','from']='USA'
countries.loc[countries['from']=='United States','from']='USA'
countries.loc[countries['from']=='United Republic of Tanzania','from']='Tanzania'
countries.loc[countries['from']=="Lao People's Democratic Republic",'from'] = 'Lao'
countries.loc[countries['from']=="Turks and Caicos Islands",'from'] = 'Turks and Caicos'
countries.loc[countries['from']=="Russian Federation",'from'] = 'Russia'
countries.loc[countries['from']=="Republic of Moldova",'from'] = 'Moldova'
countries.loc[countries['to']=='Iran (Islamic Republic of)','to']='Iran'
countries.loc[countries['to']=='Islamic Republic of Iran','to']='Iran'
countries.loc[countries['to']=='United Kingdom of Great Britain and Northern Ireland','to']='United Kingdom'
countries.loc[countries['to']=='China, Hong Kong Special Administrative Region','to']='Hong Kong'
countries.loc[countries['to']=='Hong Kong, Special Administrative Region of China','to']='Hong Kong'
countries.loc[countries['to']=='China, Macao Special Administrative Region','to']='Macao'
countries.loc[countries['to']=='Macao, Special Administrative Region of China','to']='Macao'
countries.loc[countries['to']=='United States of America','to']='USA'
countries.loc[countries['to']=='United States','to']='USA'
countries.loc[countries['to']=='United Republic of Tanzania','to']='Tanzania'
countries.loc[countries['to']=="Lao People's Democratic Republic",'to'] = 'Lao'
countries.loc[countries['to']=="Turks and Caicos Islands",'to'] = 'Turks and Caicos'
countries.loc[countries['to']=="Russian Federation",'to'] = 'Russia'
countries.loc[countries['to']=="Republic of Moldova",'to'] = 'Moldova'
countries.head()

year,from,to,2017,2022
0,Afghanistan,Albania,,0.0
1,Afghanistan,Andorra,0.0,
2,Afghanistan,Argentina,15.0,16.0
3,Afghanistan,Armenia,0.0,0.0
4,Afghanistan,Australia,44.0,73.0


In [434]:
countries.sum()

year
from    AfghanistanAfghanistanAfghanistanAfghanistanAf...
to      AlbaniaAndorraArgentinaArmeniaAustraliaAustria...
2017                                            4494782.0
2022                                            5691192.0
dtype: object

In [None]:
#Add total inbound students for absent countries where we don't have disctinct values by source countries
inbound = pd.merge(countries.groupby(['to'])['2017'].sum().reset_index(),
                 countries.groupby(['to'])['2022'].sum().reset_index(),
                 on = 'to',
                 how='left')
new_data = [
    {'to': 'Egypt', '2017': 51162},
    {'to': 'Thailand', '2017': 43821},
    {'to': 'Mexico', '2022': 51659},
    {'to': 'Iran', '2022': 94406},
    {'to': 'Ukraine', '2022': 50870},
]
new_data_df = pd.DataFrame(new_data)
inbound = pd.concat([inbound,new_data_df])

# inbound.loc[inbound['to']=='Egypt','2017']=51162
# inbound.loc[inbound['to']=='Thailand','2017'] = 43821
# inbound.loc[inbound['to']=='Mexico','2022']=51659
# inbound.loc[inbound['to']=='Iran','2022']=94406
# inbound.loc[inbound['to']=='Ukraine','2022']=50870

inbound.loc[inbound['to']=='Kazakhstan','2022']=26080 #add values by countries and total, as not every source is known
inbound.loc[inbound['to']=='China','2022']=253177
inbound.loc[inbound['to']=='China','2017']=489172
inbound.loc[inbound['to']=='Netherlands','2022']=122287

# inbound = inbound[~(inbound['2017'].isna()) & ~(inbound['2022'].isna())]
# inbound = inbound[~(inbound['2017']==0) & ~(inbound['2022']==0)]
# inbound['2022/2017,%'] = round(inbound['2022']/inbound['2017']*100,0).astype(int)-100
# inbound['2022/2017,%'] = inbound['2022/2017,%'].apply(lambda x:'+'+str(x) if x>0 else str(x))
# #shorten the countries list
# inbound = inbound[inbound['2022']>5000]

# #And assign groups to make understandable charts
# def group(x):
#     if x<20000:
#         group_name = '5-20 тыс.'
#     elif 20000<=x<50000:
#         group_name = '20-50 тыс.'
#     elif 50000<=x<100000:
#         group_name = '50-100 тыс.'
#     else:
#         group_name = 'Больше 100 тыс.'
#     return group_name
# inbound['group'] = inbound['2022'].apply(lambda x: group(x))
# inbound.set_index('to', inplace=True)
# inbound = inbound.sort_values(by='2022',ascending=False)
inbound.head()

Unnamed: 0,to,2017,2022
0,Albania,0.0,1345.0
1,Andorra,182.0,0.0
2,Argentina,85945.0,135754.0
3,Armenia,4138.0,6635.0
4,Australia,368594.0,364473.0


In [None]:
world = pd.DataFrame({'2017':[inbound['2017'].sum()],'2022':[inbound['2022'].sum()]})
world['2022/2017,%'] = round(world['2022']/world['2017']*100,0).astype(int)-100
world['2022/2017,%'] = world['2022/2017,%'].apply(lambda x:'+'+str(x) if x>0 else str(x))
fig = 
trace1 = go.Bar(x=filtered_df.index,
    y=filtered_df['2017'],
    name=f'2017 - Group {group}',
    text=[f"{value/1000:.1f}k" if value < 1000 else f"{int(value/1000)}k" for value in filtered_df['2017']],  # Value labels in thousands
    textposition='auto',
    textfont=dict(size=16),
    marker_color='blue',
    marker_line=dict(width=3, color=['red' if country == 'Russia' else 'blue' for country in filtered_df.index]) 
    )
trace2 = go.Bar(x=filtered_df.index,
    y=filtered_df['2022'],
    name=f'2022 - Group {group}',
    text=[f"{value/1000:.1f}k\n({filtered_df['2022/2017,%'].values[j]}%)" if value < 1000 else f"{int(value/1000)}k\n({filtered_df['2022/2017,%'].values[j]}%)" for j, value in enumerate(filtered_df['2022'])],  # Value labels in thousands
    textposition='auto',
    textfont=dict(size=16),
    marker_color='green',
    marker_line=dict(width=3, color=['red' if country == 'Russia' else 'green' for country in filtered_df.index]) 
    )

fig.add_trace(trace1)
fig.add_trace(trace2)
fig.update_layout(
    title='Входящий поток иностранных студентов по группам стран с 2017 по 2022 г.',
    # xaxis_title='Country',
    yaxis_title='Число студентов, тыс.',
    barmode='group',  # Group bars together
    height=1400,
    width=2600,
    plot_bgcolor='white'
)
for i in range(1, num_rows + 1):
    for j in range(1, num_cols + 1):
        fig.update_xaxes(tickangle=45, row=i, col=j)
        fig.update_yaxes(gridcolor='lightgrey', row=i, col=j)
pio.write_image(fig, "Входящий поток иностранных студентов по группам стран с 2017 по 2022 г.png")
fig.show()
print(world)

Unnamed: 0,2017,2022,"2022/2017,%"
0,4456691.0,5829508.0,31


Inbound students

In [20]:
n_cols = 2  # Number of columns in subplot grid
# unique_groups  = diff['group'].unique()
unique_groups = ['Больше 100 тыс.', '50-100 тыс.', '20-50 тыс.','5-20 тыс.']
num_plots = len(unique_groups)
num_cols = 2  # Maximum of 2 columns
num_rows = (num_plots + num_cols - 1) // num_cols  # Calculate the number of rows
fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=unique_groups)

for i, group in enumerate(unique_groups):
    filtered_df = inbound[inbound['group'] == group]
    row_index = i // n_cols + 1
    col_index = i % n_cols + 1
    
    trace1 = go.Bar(x=filtered_df.index,
        y=filtered_df['2017'],
        name=f'2017 - Group {group}',
        text=[f"{value/1000:.1f}k" if value < 1000 else f"{int(value/1000)}k" for value in filtered_df['2017']],  # Value labels in thousands
        textposition='auto',
        textfont=dict(size=16),
        marker_color='blue',
        marker_line=dict(width=3, color=['red' if country == 'Russia' else 'blue' for country in filtered_df.index]) 
        )
    trace2 = go.Bar(x=filtered_df.index,
        y=filtered_df['2022'],
        name=f'2022 - Group {group}',
        text=[f"{value/1000:.1f}k\n({filtered_df['2022/2017,%'].values[j]}%)" if value < 1000 else f"{int(value/1000)}k\n({filtered_df['2022/2017,%'].values[j]}%)" for j, value in enumerate(filtered_df['2022'])],  # Value labels in thousands
        textposition='auto',
        textfont=dict(size=16),
        marker_color='green',
        marker_line=dict(width=3, color=['red' if country == 'Russia' else 'green' for country in filtered_df.index]) 
        )
    
    fig.add_trace(trace1, row=row_index, col=col_index)
    fig.add_trace(trace2, row=row_index, col=col_index)
        
fig.update_layout(
title='Входящий поток иностранных студентов по группам стран с 2017 по 2022 г.',
# xaxis_title='Country',
yaxis_title='Число студентов, тыс.',
barmode='group',  # Group bars together
height=1400,
width=2600,
plot_bgcolor='white'
)
for i in range(1, num_rows + 1):
    for j in range(1, num_cols + 1):
        fig.update_xaxes(tickangle=45, row=i, col=j)
        fig.update_yaxes(gridcolor='lightgrey', row=i, col=j)
pio.write_image(fig, "Входящий поток иностранных студентов по группам стран с 2017 по 2022 г.png")
fig.show()


На основании отчетов inbound в разрезе весь мир -страна

In [101]:
# total_inbound = national[national['indicator_id'].isin([26637])]
# total_inbound['from'] = 'World'
# total_inbound.head()

<h4>Исходящие по странам</h4>

OE.5T8.40510 
Total outbound internationally mobile tertiary students studying abroad, all countries, both sexes (UIS estimate) (number)

In [318]:
outbound = pd.merge(countries.groupby(['from'])['2017'].sum().reset_index(),
                 countries.groupby(['from'])['2022'].sum().reset_index(),
                 on = 'from',
                 how='left')
outbound = outbound[~(outbound['2017'].isna()) & ~(outbound['2022'].isna())]
outbound = outbound[~(outbound['2017']==0) & ~(outbound['2022']==0)]
outbound['2022/2017,%'] = round(outbound['2022']/outbound['2017']*100,0).astype(int)-100
outbound['2022/2017,%'] = outbound['2022/2017,%'].apply(lambda x:'+'+str(x) if x>0 else str(x))
outbound = outbound[outbound['2022']>5000]
def group(x):
    if x<20000:
        group_name = '5-20 тыс.'
    elif 20000<=x<50000:
        group_name = '20-50 тыс.'
    elif 50000<=x<100000:
        group_name = '50-100 тыс.'
    elif 100000<=x<200000:
        group_name = '100-200 тыс.'
    else:
        group_name = 'Больше 200 тыс.'
    return group_name
outbound['group'] = outbound['2022'].apply(lambda x: group(x))
outbound.set_index('from', inplace=True)
outbound = outbound.sort_values(by='2022',ascending=False)
outbound.head()

Unnamed: 0_level_0,2017,2022,"2022/2017,%",group
from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
China,839754.0,960174.0,14,Больше 200 тыс.
India,327376.0,709884.0,117,Больше 200 тыс.
Viet Nam,93589.0,133922.0,43,100-200 тыс.
Uzbekistan,34689.0,132963.0,283,100-200 тыс.
Germany,122284.0,129970.0,6,100-200 тыс.


In [22]:
n_cols = 2  # Number of columns in subplot grid
# unique_groups  = diff['group'].unique()
unique_groups = ['Больше 200 тыс.','100-200 тыс.', '50-100 тыс.', '20-50 тыс.','5-20 тыс.']
num_plots = len(unique_groups)
num_cols = 2  # Maximum of 2 columns
num_rows = (num_plots + num_cols - 1) // num_cols  # Calculate the number of rows
fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=unique_groups)
outbound = outbound.sort_values(by=['2022'],ascending=False)
for i, group in enumerate(unique_groups):
    filtered_df = outbound[outbound['group'] == group]
    row_index = i // n_cols + 1
    col_index = i % n_cols + 1
    
    trace1 = go.Bar(x=filtered_df.index,
        y=filtered_df['2017'],
        name=f'2017 - Group {group}',
        text=[f"{value/1000:.1f}k" if value < 1000 else f"{int(value/1000)}k" for value in filtered_df['2017']],  # Value labels in thousands
        textposition='auto',
        textfont=dict(size=12),
        marker_color='brown',
        marker_line=dict(width=3, color=['black' if country == 'Russia' else 'brown' for country in filtered_df.index]) 
        )
    trace2 = go.Bar(x=filtered_df.index,
        y=filtered_df['2022'],
        name=f'2022 - Group {group}',
        text=[f"{value/1000:.1f}k<br>({filtered_df['2022/2017,%'].values[j]}%)" if value < 1000 else f"{int(value/1000)}k<br>({filtered_df['2022/2017,%'].values[j]}%)" for j, value in enumerate(filtered_df['2022'])],  # Value labels in thousands
        textposition='auto',
        textfont=dict(size=12),
        marker_color='orange',
        marker_line=dict(width=3, color=['black' if country == 'Russia' else 'orange' for country in filtered_df.index]) 
        )
    
    fig.add_trace(trace1, row=row_index, col=col_index)
    fig.add_trace(trace2, row=row_index, col=col_index)
        
fig.update_layout(
title='Исходящий поток иностранных студентов по группам стран с 2017 по 2022 г.',
# xaxis_title='Country',
yaxis_title='Число студентов, тыс.',
barmode='group',  # Group bars together
height=1600,
width=2600,
plot_bgcolor='white'
)
for i in range(1, num_rows + 1):
    for j in range(1, num_cols + 1):
        fig.update_xaxes(tickangle=45, row=i, col=j)
        fig.update_yaxes(gridcolor='lightgrey', row=i, col=j)
pio.write_image(fig, "Исходящий поток иностранных студентов по группам стран с 2017 по 2022 г.png")
fig.show()

<h4>Student flows inside CIS</h4>

In [None]:
countries2 = countries[~(countries['2017'].isna()) & ~(countries['2022'].isna())]
countries2 = countries2[~(countries2['2017']==0) & ~(countries2['2022']==0)]

#some tweaks for Sankey diagram that demands different names for sources and targets
countries2['from'] = countries2['from']+ ' Out'
countries2['to'] = countries2['to']+ ' In'
countries2.head(2)

year,from,to,2017,2022
2,Afghanistan Out,Argentina In,15.0,16.0
4,Afghanistan Out,Australia In,44.0,73.0


In [327]:
cis_c = ['Russia','Moldova','Belarus','Kazakhstan','Uzbekistan',
          'Kyrgyzstan','Tajikistan','Turkmenistan',
          'Armenia','Azerbaijan','Ukraine']
cis = [item + ' Out' for item in cis_c] + [item + ' In' for item in cis_c]
cis_countries_from = countries2[countries2['from'].isin(cis)]
cis_countries_from.head(2)

year,from,to,2017,2022
953,Armenia Out,Argentina In,31.0,65.0
955,Armenia Out,Austria In,60.0,66.0


In [328]:
#Extract top 5 target countries for every source country in 2017 and 2022 
cis_countries_from_2022_top10 = cis_countries_from.groupby('from').apply(lambda x: x.nlargest(5, '2022')).reset_index(drop=True)
cis_countries_from_2022_top10.drop(columns='2017',inplace=True)
cis_countries_from_2017_top10 = cis_countries_from.groupby('from').apply(lambda x: x.nlargest(5, '2017')).reset_index(drop=True)
cis_countries_from_2017_top10.drop(columns='2022',inplace=True)
cis_countries_from_top = pd.concat([cis_countries_from_2017_top10,cis_countries_from_2022_top10])
cis_countries_from_top['to'].unique()







array(['Russia In', 'France In', 'Germany In', 'USA In',
       'United Kingdom In', 'Turkey In', 'Georgia In', 'Poland In',
       'Lithuania In', 'Czechia In', 'Kyrgyzstan In', 'Kazakhstan In',
       'Saudi Arabia In', 'Romania In', 'Italy In', 'Belarus In',
       'Malaysia In', 'Republic of Korea In', 'Latvia In', 'Bulgaria In',
       'Uzbekistan In', 'Slovakia In'], dtype=object)

In [336]:
#Draw Sankey for every year
c_a = ('Kazakhstan Out','Kyrgyzstan Out','Tajikistan Out','Turkmenistan Out','Uzbekistan Out')
e_c = ('Russia Out','Moldova Out','Belarus Out','Ukraine Out','Armenia Out','Azerbaijan Out')
regions = {c_a:'Central Asia',
           e_c:'Europe & Caucasus'}

for key,value in regions.items():
    print(value)
    df = cis_countries_from_top[cis_countries_from_top['from'].isin(key)]
    unique_labels = list(set(df['from'].tolist() + df['to'].tolist()))
    other_color = 'rgba(189, 195, 199, 0.8)'
    country_colors = {
        'Armenia': 'rgba(31, 119, 180, 0.8)',  # Blue
        'Azerbaijan': 'rgba(255, 127, 14, 0.8)',  # Orange
        'Belarus': 'rgba(44, 162, 44, 0.8)',  # Green
        'Kazakhstan': 'rgba(214, 39, 40, 0.8)',  # Red
        'Kyrgyzstan': 'rgba(148, 103, 189, 0.8)',  # Purple
        'Moldova': 'rgba(140, 86, 75, 0.8)',  # Brown
        'Russia': 'rgba(227, 119, 194, 0.8)',  # Pink
        'Tajikistan': 'rgba(127, 127, 127, 0.8)',  # Gray
        'Turkmenistan': 'rgba(188, 189, 34, 0.8)',  # Yellow-green
        'Ukraine': 'rgba(23, 190, 239, 0.8)',  # Cyan
        'Uzbekistan': 'rgba(255, 187, 120, 0.8)',   # Light Orange (to make it distinct)
        'France':other_color, 
        'Germany':other_color, 
        'USA':other_color, 
        'United Kingdom':other_color, 
        'Turkey':other_color, 
        'Georgia':other_color, 
        'Poland':other_color, 
        'Lithuania':other_color, 
        'Romania':other_color, 
        'Italy':other_color, 
        'Czechia':other_color, 
        'Republic of Korea':other_color,
        'Saudi Arabia':other_color,
        'Latvia':other_color,
        'Malaysia':other_color,
        'Bulgaria':other_color,
        'Slovakia':other_color,
    }

    # You can use color mapping for countries. Change i and coeffs to get distinct colors.
    # country_colors = {country: f'rgba({(i+100) * 20 % 255}, {((i+50) * 200 + 100) % 255}, {((i+200) * 380) % 255}, 1)'
    #                   for i, country in enumerate(set([label.split()[0] for label in unique_labels]))}
    # Function to get sorted unique labels based on total link values

    def get_sorted_labels(df, year):
        link_sums = pd.concat([
            df.groupby('from')[year].sum(),
            df.groupby('to')[year].sum()
        ]).groupby(level=0).sum()
        return link_sums.sort_values(ascending=False).index.tolist()

    # Function to build a Sankey trace for a specific year
    def build_sankey_trace(df, year,pad_size):
        sorted_labels = get_sorted_labels(df, year)
        unique_labels_sorted = [label for label in unique_labels if label in sorted_labels]
        
        # Map source and target to indices
        source_indices = [unique_labels_sorted.index(src) for src in df['from']]
        target_indices = [unique_labels_sorted.index(tgt) for tgt in df['to']]
        
        # Assign link colors based on source country
        link_colors = [country_colors[" ".join(src.split()[:-1])].replace('1)', '0.8)') for src in df['from']]
        
        return go.Sankey(
            node=dict(
                pad=pad_size,
                thickness=50,
                line=dict(color="black", width=0.5),
                label=[f"{label} ({int(df[df['from'] == label][year].sum() + df[df['to'] == label][year].sum())})"
                    for label in unique_labels_sorted],
                color=[country_colors[" ".join(label.split()[:-1])] for label in unique_labels_sorted]
            ),
            link=dict(
                source=source_indices,
                target=target_indices,
                value=df[year],
                color=link_colors,
                label=[f"{df.iloc[i]['from']} → {df.iloc[i]['to']}: {df.iloc[i][year]}"
                    for i in range(len(df))]
            )
        )

    def generate_annotations(df, unique_labels_sorted, year):
        annotations = []
        df=df[['from','to',year]].dropna()
        for i, row in df.iterrows():
            source_idx = unique_labels_sorted.index(row['from'])
            target_idx = unique_labels_sorted.index(row['to'])
            annotations.append(dict(
                # x=x_pos, y=y_pos,
                text=f"{int(row[year])}",  # Display value as an integer
                showarrow=False,
                font=dict(size=10, color="black"),
                align="center"
            ))
        return annotations

    fig = make_subplots(
        rows=1, cols=2, 
        specs=[[{'type': 'domain'}, {'type': 'domain'}]]  # Specify 'domain' for Sankey
    )
    fig.add_trace(build_sankey_trace(df, "2017",48), row=1, col=1)
    fig.add_trace(build_sankey_trace(df, "2022",18), row=1, col=2)
    fig.update_layout(
        title_text=f"Students Flows from {value} CIS Countries, Top 5 destinations for every country",
        font=dict(size=12, color='black', family='Times'),
        height=600,
        width=1000,
        plot_bgcolor='white',
        annotations=[
            dict(
                x=0.2,  # Center of first subplot
                y=1.05, 
                text="2017",
                showarrow=False,
                font=dict(size=16, color="black")
            ),
            dict(
                x=0.78,  # Center of second subplot
                y=1.05, 
                text="2022",
                showarrow=False,
                font=dict(size=16, color="black"),
            ),
            dict(
                x=0.002,  # Comments
                y=-0.1, 
                text="@Conspect Labs, Sources: UNESCO, OpenDoorsData, Nuffic, IIE, Statista, MoE of Kazakhstan, Iran, Egypt, Thailand",
                showarrow=False,
                font=dict(size=12, color="black")
            )
        ]
    )

    output_file = f"top_5_destinations_sankey_diagrams _{value}.png"  # You can change this to .png, .pdf, or .jpeg
    pio.write_image(fig, output_file, format="png", width=1600, height=800)
    # Show the figure
    fig.show()

Central Asia


Europe & Caucasus


Top 20 source countries for Russia

In [353]:
#Extract all source countries for Russia
inbound_country='Russia In'
inbound_to_country = countries2[countries2['to']==inbound_country]
inbound_to_country.head()

year,from,to,2017,2022
91,Afghanistan Out,Russia In,188.0,1602.0
210,Albania Out,Russia In,4.0,92.0
329,Algeria Out,Russia In,246.0,1524.0
567,Angola Out,Russia In,865.0,805.0
923,Argentina Out,Russia In,6.0,46.0


In [372]:
#Extract top 20 source countries for Russia in 2017 and 2022 
inbound_to_country_from_2017_top20 = inbound_to_country.sort_values(by='2017', ascending=False).head(10)
inbound_to_country_from_2017_top20 = inbound_to_country_from_2017_top20[['from','to','2017']]
inbound_to_country_from_2017_top20


year,from,to,2017
11602,Kazakhstan Out,Russia In,65237.0
23822,Uzbekistan Out,Russia In,20862.0
22631,Turkmenistan Out,Russia In,17457.0
23106,Ukraine Out,Russia In,15263.0
21563,Tajikistan Out,Russia In,14204.0
4836,China Out,Russia In,11950.0
2107,Belarus Out,Russia In,11600.0
1514,Azerbaijan Out,Russia In,11269.0
10417,India Out,Russia In,6544.0
12076,Kyrgyzstan Out,Russia In,5523.0


In [373]:
inbound_to_country_from_2022_top20 = inbound_to_country.sort_values(by='2022', ascending=False).head(10)
inbound_to_country_from_2022_top20 = inbound_to_country_from_2022_top20[['from','to','2022']]
inbound_to_country_from_2022_top20

year,from,to,2022
11602,Kazakhstan Out,Russia In,53935.0
23822,Uzbekistan Out,Russia In,48430.0
4836,China Out,Russia In,40797.0
22631,Turkmenistan Out,Russia In,27095.0
21563,Tajikistan Out,Russia In,27086.0
10417,India Out,Russia In,19734.0
6973,Egypt Out,Russia In,15616.0
2107,Belarus Out,Russia In,9821.0
23106,Ukraine Out,Russia In,9446.0
12076,Kyrgyzstan Out,Russia In,9345.0


Российские вузы зависят в основном от стран Центральной Азии и Китая - 60% студентов приезжают из Казахстана, Узбекистана, Туркменистана, Таджикистана и Китая. Диверсификация рынка происходит, но не быстро: в 2017 году топ-10 стран-доноров России составляли 85% въезда студентов, в 2022-м - 78%. 

In [364]:
inbound_to_country.sort_values(by='2022', ascending=False).head(5)['2022'].sum()/inbound_to_country['2022'].sum()

np.float64(0.5901387854701391)

In [363]:
inbound_to_country.sort_values(by='2022', ascending=False).head(10)['2022'].sum()/inbound_to_country['2022'].sum()

np.float64(0.7814121369254279)

In [365]:
inbound_to_country.sort_values(by='2017', ascending=False).head(5)['2017'].sum()/inbound_to_country['2017'].sum()

np.float64(0.6303152926905545)

In [366]:
inbound_to_country.sort_values(by='2017', ascending=False).head(10)['2017'].sum()/inbound_to_country['2017'].sum()

np.float64(0.8524796012168194)

Наибольший вклад в прирост въезда за 2017-2022 годы внесли Китай, Египет, Индия, 