In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.express as px

<h2>Initial Data Load</h2>

<h3>Data Sources</h3>

<p>UNESCO Bulk Data Download Service can be accessed here: <a href>https://uis.unesco.org/bdds</a> 
<br> Labels list is analyzed and students inbound indicators are extracted into distinct csv file named <i>OPRI_student_labels.csv</i>
<br>UNESCO Data Service is also availabla at <a href>https://data.uis.unesco.org/#</a> 
<p>UNESCO UIS DataSet for 2017 and 2022 lacks reports, for several years, from USA, Mexico, Netherlands, Iran, Kazakhstan and some other countries. They are taken from the following sources:
<br> USA, 2022 - <a href>https://opendoorsdata.org</a> (by source countries)
<br>China, 2022 - <a href>http://www.moe.gov.cn/jyb_xwfb/gzdt_gzdt/s5987/201904/t20190412_377692.html</a>  taken pre-pandemic for distinct countries, in total from <a href>http://en.moe.gov.cn/documents/statistics/2022/national/202401/t20240110_1099530.html</a>  for the 2022
<br>China, 2017 - are from 2018 <a href>https://docs.aiddata.org/reports/corridors-of-power.html#figure4</a> 
<br>Philippines - still absent
<br>Israel - still absent
<br> Netherlands, 2022 - <a href>https://www.nuffic.nl/sites/default/files/2023-06/incoming-degree-mobility-in-dutch-higher-education-2022-2023.pdf</a>  (total and by source countries where provided)
<br> Mexico, 2022 - <a href>https://www.iie.org/research-initiatives/project-atlas/explore-data/mexico-2/</a>  (total inbound students)
<br> Iran, 2022 - <a href>https://tehrantimes.com/news/483478/Students-from-91-countries-studying-in-Iranian-universities</a>  (total inbound students)
<br> Kazakhstan, 2022 - <a href>https://enic-kazakhstan.edu.kz/uploads/additional_files_items/193/file/tadzhikistan.pdf?cache=1691397957</a> , https://enic-kazakhstan.edu.kz/uploads/additional_files_items/156/file/</a> mezhdunarodnoe-sotrudnichestvo-sng.pdf?cache=1677496628</a> , <a href>https://enic-kazakhstan.edu.kz/uploads/additional_files_items/138/file/3-1-monitoring-akademicheskoy-mobilnosti-2022-rus.pdf?cache=1672983807</a>  (total and by source countries where provided)
<br> Egypt, 2017 - <a href>https://enterprise.press/stories/2019/12/16/how-egypt-is-positioning-itself-as-an-educational-hub-for-international-students-8512/</a>  (total inbound students)
<br> Thailand, 2017 - <a href>https://apheit.bu.ac.th/jounal/Inter-vol8-1/นานาชาติ_บทความวิจัย_2.pdf</a>  (total inbound students)


<h3>Data Load</h3>

Opening and exploring UNESCO datasets

In [2]:
country_names = pd.read_csv(r"C:\Users\yuliy\Documents\UNESCO Education\OPRI_COUNTRY.csv")#change the file location
print(country_names.columns)
country_names.head(2)

Index(['COUNTRY_ID', 'COUNTRY_NAME_EN'], dtype='object')


Unnamed: 0,COUNTRY_ID,COUNTRY_NAME_EN
0,AFG,Afghanistan
1,ALB,Albania


In [3]:
labels = pd.read_csv(r"C:\Users\yuliy\Documents\UNESCO Education\OPRI_students_label.csv")#change the file location
print(labels.columns)
labels.head(2)

Index(['INDICATOR_ID', 'INDICATOR_LABEL_EN'], dtype='object')


Unnamed: 0,INDICATOR_ID,INDICATOR_LABEL_EN
0,26421,"Africa: Students from Algeria, both sexes (num..."
1,26422,"Africa: Students from Angola, both sexes (number)"


In [4]:
national = pd.read_csv(r"C:\Users\yuliy\Documents\UNESCO Education\OPRI_DATA_NATIONAL.csv") #change the file location
print(national.columns)
national.head(2)

  national = pd.read_csv(r"C:\Users\yuliy\Documents\UNESCO Education\OPRI_DATA_NATIONAL.csv") #change the file location


Index(['indicator_id', 'country_id', 'year', 'value', 'magnitude',
       'qualifier'],
      dtype='object')


Unnamed: 0,indicator_id,country_id,year,value,magnitude,qualifier
0,10,ABW,1998,0.0,,
1,10,ABW,1999,0.0,,


Add USA and other countries inbound students, taken from another sources

In [5]:
usa_add_long = pd.read_csv("usa_inbound.csv")
usa_add_long['to'] = 'United States of America'
usa_add_long['year'] = '2022'
usa_add_long = usa_add_long.rename(columns={'unesco':'from'})
usa_add_long = usa_add_long[['from','to','year','2022']] #we need year 2022
usa_add_long.columns = ['from','to','year','value']
usa_add_long = usa_add_long[usa_add_long['from']!='#Н/Д']
usa_add_long['value'] = usa_add_long['value'].fillna(0).astype(int)
usa_add_long['country_id'] = 'USA'
print(usa_add_long.value.sum())
usa_add_long.head(2)

1035161


Unnamed: 0,from,to,year,value,country_id
0,Afghanistan,United States of America,2022,682,USA
1,Albania,United States of America,2022,1087,USA


Add Kazakhstan inbound students

In [6]:
kaz_add_long = pd.read_csv("kazakhstan_inbound.csv")
kaz_add_long['year'] = kaz_add_long['year'].astype(str)
kaz_add_long['country_id'] = 'KAZ'
kaz_add_long.head(2)

Unnamed: 0,from,to,year,value,country_id
0,Uzbekistan,Kazakhstan,2022,9571.0,KAZ
1,Tajikistan,Kazakhstan,2022,361.0,KAZ


Add Netherlands inbound students

In [7]:
nld_add_long = pd.read_csv("nld_inbound.csv")
nld_add_long  = nld_add_long.dropna()
nld_add_long['year'] = nld_add_long['year'].astype(str)
nld_add_long['country_id'] = 'NLD'
nld_add_long.head(2)

Unnamed: 0,from,to,year,value,country_id
18,Belgium,Netherlands,2022,4780.0,NLD
29,Bulgaria,Netherlands,2022,5112.0,NLD


Add China inbound students

In [8]:
china_add_long = pd.read_csv("china_inbound.csv")
china_add_long  = china_add_long.dropna()
china_add_long['year'] = china_add_long['year'].astype(str)
china_add_long['country_id'] = 'CHN'
china_add_long.head(2)

Unnamed: 0,from,to,year,value,country_id
0,India,China,2017,23198,CHN
1,India,China,2022,23198,CHN


Extract labels and country names from the UIS dataset

In [9]:
from_to_labels = labels[~labels['INDICATOR_ID'].isin([26637,26638,43188])]
from_to_labels['from'] = from_to_labels['INDICATOR_LABEL_EN'].apply(lambda x:x.split('from ')[1].split(', both sexes (number)')[0].replace('the ',''))
from_to_labels.head(1)
total_from_to = national[national['indicator_id'].isin(from_to_labels['INDICATOR_ID'].unique())]
total_from_to = pd.merge(total_from_to,from_to_labels,left_on='indicator_id',right_on='INDICATOR_ID',how='left')
total_from_to = pd.merge(total_from_to,country_names,left_on='country_id',right_on='COUNTRY_ID',how='left')
total_from_to = total_from_to.rename(columns={'COUNTRY_NAME_EN':'to'})
total_from_to =total_from_to[['indicator_id','country_id','year','value','from','to']]
total_from_to['value'] = total_from_to['value'].astype(int)
total_from_to['year'] = total_from_to['year'].astype(str)
print(total_from_to.shape)
total_from_to.head(1)


(463913, 6)


Unnamed: 0,indicator_id,country_id,year,value,from,to
0,26421,ABW,2003,0,Algeria,Aruba


Delete empty 2022 rows for the Netherlands

In [10]:
total_from_to = total_from_to[~((total_from_to['to']=='Netherlands') & (total_from_to['year']=='2022'))]
print(total_from_to.shape)
total_from_to.head(2)

(463912, 6)


Unnamed: 0,indicator_id,country_id,year,value,from,to
0,26421,ABW,2003,0,Algeria,Aruba
1,26421,ABW,2004,0,Algeria,Aruba


Union initial UIS dataset and absent countries datasets

In [11]:
print(total_from_to.shape)
total_from_to = pd.concat([total_from_to,usa_add_long,kaz_add_long,nld_add_long,china_add_long])
print(total_from_to.shape)
total_from_to.head(2)

(463912, 6)
(464215, 6)


Unnamed: 0,indicator_id,country_id,year,value,from,to
0,26421,ABW,2003,0.0,Algeria,Aruba
1,26421,ABW,2004,0.0,Algeria,Aruba


Replace long country names by the shorter ones to save some space in charts

In [12]:
total_from_to.loc[total_from_to['from']=='Iran (Islamic Republic of)','from']='Iran'
total_from_to.loc[total_from_to['from']=='Islamic Republic of Iran','from']='Iran'
total_from_to.loc[total_from_to['from']=='United Kingdom of Great Britain and Northern Ireland','from']='United Kingdom'
total_from_to.loc[total_from_to['from']=='China, Hong Kong Special Administrative Region','from']='Hong Kong'
total_from_to.loc[total_from_to['from']=='Hong Kong, Special Administrative Region of China','from']='Hong Kong'
total_from_to.loc[total_from_to['from']=='China, Macao Special Administrative Region','from']='Macao'
total_from_to.loc[total_from_to['from']=='Macao, Special Administrative Region of China','from']='Macao'
total_from_to.loc[total_from_to['from']=='United States of America','from']='USA'
total_from_to.loc[total_from_to['from']=='United States','from']='USA'
total_from_to.loc[total_from_to['from']=='United Republic of Tanzania','from']='Tanzania'
total_from_to.loc[total_from_to['from']=="Lao People's Democratic Republic",'from'] = 'Lao'
total_from_to.loc[total_from_to['from']=="Turks and Caicos Islands",'from'] = 'Turks and Caicos'
total_from_to.loc[total_from_to['from']=="Russian Federation",'from'] = 'Russia'
total_from_to.loc[total_from_to['from']=="Republic of Moldova",'from'] = 'Moldova'
total_from_to.loc[total_from_to['to']=='Iran (Islamic Republic of)','to']='Iran'
total_from_to.loc[total_from_to['to']=='Islamic Republic of Iran','to']='Iran'
total_from_to.loc[total_from_to['to']=='United Kingdom of Great Britain and Northern Ireland','to']='United Kingdom'
total_from_to.loc[total_from_to['to']=='China, Hong Kong Special Administrative Region','to']='Hong Kong'
total_from_to.loc[total_from_to['to']=='Hong Kong, Special Administrative Region of China','to']='Hong Kong'
total_from_to.loc[total_from_to['to']=='China, Macao Special Administrative Region','to']='Macao'
total_from_to.loc[total_from_to['to']=='Macao, Special Administrative Region of China','to']='Macao'
total_from_to.loc[total_from_to['to']=='United States of America','to']='USA'
total_from_to.loc[total_from_to['to']=='United States','to']='USA'
total_from_to.loc[total_from_to['to']=='United Republic of Tanzania','to']='Tanzania'
total_from_to.loc[total_from_to['to']=="Lao People's Democratic Republic",'to'] = 'Lao'
total_from_to.loc[total_from_to['to']=="Turks and Caicos Islands",'to'] = 'Turks and Caicos'
total_from_to.loc[total_from_to['to']=="Russian Federation",'to'] = 'Russia'
total_from_to.loc[total_from_to['to']=="Republic of Moldova",'to'] = 'Moldova'

Calculating numbers for International Education Market and drawing sharts

In [13]:
total_years = total_from_to.groupby(['year','to'])['value'].sum().reset_index()
#raw data from Unesco and additional by-outbound-countries datasets. Data for country level will be added or corrected later
#this df allows to see if we have data for a country in a distinct year
df_country = total_years.pivot(index=['to'], columns='year', values='value').reset_index()
df_country = df_country[['to','2017','2019','2020','2021','2022']]
df_country.head(2)

year,to,2017,2019,2020,2021,2022
0,Afghanistan,,,0.0,,
1,Albania,,2156.0,2146.0,1944.0,1731.0


<h4>Inbound by Countries</h4>

By inbound reports, country-to-country

In [14]:
countries = total_from_to[total_from_to['year'].isin(['2017','2022'])]
countries['year'] = countries['year'].astype(str)
countries = countries[countries['from']!=countries['to']]
countries = countries.groupby(['year','from','to'])['value'].sum().reset_index()
countries = countries.pivot(index=['from','to'], columns='year', values='value').reset_index()
countries = countries.fillna(0)
countries['from'] = countries['from'].apply(lambda x:x.strip())
countries['to'] = countries['to'].apply(lambda x:x.strip())
countries.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  countries['year'] = countries['year'].astype(str)


year,from,to,2017,2022
0,Afghanistan,Albania,0.0,0.0
1,Afghanistan,Andorra,0.0,0.0


In [16]:
# After exploring dataset we can see that data for some years for some countries ar ealso absent. 
# Earlier we added some critical countries where we can get full data in country-to-country level
# That is not possible for every absent country, so for several countries we only add a total sum of inbound students 
# countries with large inbound flows that are absent in the UNESCO dataset:
# Egypt: 2017, 2022
# Iran: 2022
# Israel: 2017, 2022
# Panama: 2017
# Singapore: 2017, 2022
# Philippines: 2017, 2022
# Thailand: 2017
# Togo: 2017,2022
# Ukraine: 2022
# Uruguay: 2017,2022
# Mexico:2022
# Cuba: 2022
# Yemen
# Zambia
# Zimbabwe
#Indonesia 2022

Add total inbound students for absent countries where we don't have disctinct values by source countries

In [15]:
#Data sources are cited before
inbound = pd.merge(countries.groupby(['to'])['2017'].sum().reset_index(),
                 countries.groupby(['to'])['2022'].sum().reset_index(),
                 on = 'to',
                 how='left')
inbound.fillna(0,inplace=True)

new_data = [
    {'to': 'Egypt', '2017': 51162,'2022':69021} #2021 data https://www.statista.com/statistics/1193448/number-of-students-in-higher-education-in-egypt-by-nationality/
]
new_data_df = pd.DataFrame(new_data)
inbound = pd.concat([inbound,new_data_df])

inbound.loc[inbound['to']=='Thailand','2017']=43821
inbound.loc[inbound['to']=='Mexico','2022']=51659
inbound.loc[inbound['to']=='Iran','2022']=94406
inbound.loc[inbound['to']=='Ukraine','2022']=50870
inbound.loc[inbound['to']=='Cuba','2022']=7773
inbound.loc[inbound['to']=='Kazakhstan','2022']=26080 #add values by countries and also in total, as not every source is known
inbound.loc[inbound['to']=='China','2022']=253177
inbound.loc[inbound['to']=='China','2017']=489172
inbound.loc[inbound['to']=='Netherlands','2022']=122287
inbound.head(1)


Unnamed: 0,to,2017,2022
0,Albania,0.0,1731.0


Calc total sum for each year including rows where one of the years is empty

In [16]:
world = pd.DataFrame({'2017':[inbound['2017'].sum()],'2022':[inbound['2022'].sum()]})
#calc total change
world['2022/2017,%'] = round(world['2022']/world['2017']*100,0).astype(int)-100
world['2022/2017,%'] = world['2022/2017,%'].apply(lambda x:'+'+str(x) if x>0 else str(x))
world


Unnamed: 0,2017,2022,"2022/2017,%"
0,5147503.0,6283374.0,22


Draw total sums chart

In [17]:
color_2017 = '#686D76'
color_2022 = '#373A40'
fig = go.Figure()
trace1 = go.Bar(x=[world.columns[0]],
    y=world['2017'],
    name=f'2017',
    text=[f"{value/1000000:.1f}m" if value < 1000 else f"{value/1000000:.1f}m" for value in world['2017']],  # Value labels in thousands
    textposition='auto',
    textfont=dict(size=14),
    marker_color=color_2017,
    width=[0.4]
    )
trace2 = go.Bar(x=[world.columns[1]],
    y=world['2022'],
    name=f'2022',
    text=[f"{value/1000000:.1f}m\n({world['2022/2017,%'].values[j]}%)" if value < 1000 
          else f"{value/1000000:.1f}m<br><b>{world['2022/2017,%'].values[j]}%</b>" for j, value in enumerate(world['2022'])],  # Value labels in thousands
    textposition='auto',
    textfont=dict(size=14),
    marker_color=color_2022,
    width=[0.4]
    )
years = ['2017','2022']
fig.add_trace(trace1)
fig.add_trace(trace2)
fig.update_layout(
    yaxis_title='Число студентов, тыс.',
    barmode='group',  # Group bars together
    height=400,
    width=600,
    plot_bgcolor='white',
    title=dict(text="Входящий поток иностранных студентов по группам стран с 2017 по 2022 г.", 
               font=dict(size=14), 
               yref='paper'),
    showlegend=False,
    bargap=0.04,
    annotations=[
            dict(
                x=0.35,  # Center the annotation
                y=-0.24,  # Move it below the chart
                text="@Conspect Labs, Sources: UNESCO, OpenDoorsData, Nuffic, IIE, Statista, <br>MoE of Kazakhstan, Iran, Egypt, Thailand",
                showarrow=False,
                xref="paper",  # Use paper coordinates
                yref="paper",
                font=dict(size=10, color="grey"),
                align="left"
        )
        ]
)

pio.write_image(fig, "world_higher_ed_market_2017_2022.png")
# fig.show()
print(world)

        2017       2022 2022/2017,%
0  5147503.0  6283374.0         +22


![Alt Text](world_higher_ed_market_2017_2022.png)

Inbound students

Return to the country-level analysis. Remove rows where there is at least one zero value, as we need year-to-year comparisons. # This could be a real zero, but small countries are still not needed, so get rid of them too. Then calculate the change and draw a chart.

In [18]:
inbound_no_zero = inbound[(inbound['2017'] != 0) & (inbound['2022'] != 0)]

inbound_no_zero['2022/2017,%'] = inbound_no_zero.apply(
    lambda row: int(round(row['2022'] / row['2017'] * 100, 0)) - 100
    if row['2022'] != 0 and row['2017'] != 0 else None,
    axis=1
)

# Format the output to add '+' sign for positive values
inbound_no_zero['2022/2017,%'] = inbound_no_zero['2022/2017,%'].apply(lambda x: '+' + str(x) if x > 0 else str(x))

#shorten the countries list by inbound numbers
inbound_no_zero = inbound_no_zero[inbound_no_zero['2022']>5000]

#And assign groups to make clear charts
def group(x):
    if x<20000:
        group_name = '5-20 тыс.'
    elif 20000<=x<50000:
        group_name = '20-50 тыс.'
    elif 50000<=x<100000:
        group_name = '50-100 тыс.'
    else:
        group_name = 'Больше 100 тыс.'
    return group_name
inbound_no_zero['group'] = inbound_no_zero['2022'].apply(lambda x: group(x))
inbound_no_zero.set_index('to', inplace=True)
inbound_no_zero = inbound_no_zero.sort_values(by='2022',ascending=False)
inbound_no_zero.head(2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,2017,2022,"2022/2017,%",group
to,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
USA,984170.0,1035161.0,5,Больше 100 тыс.
United Kingdom,435389.0,674440.0,55,Больше 100 тыс.


Draw a chart for inbound students for every country

In [19]:
n_cols = 2 
color_2017 = '#686D76'
color_2022 = '#373A40'
distinct_color = '#DC5F00'
unique_groups = ['Больше 100 тыс.', '50-100 тыс.', '20-50 тыс.','5-20 тыс.']
num_plots = len(unique_groups)
num_cols = 2  # Maximum of 2 columns
num_rows = (num_plots + num_cols - 1) // num_cols  # Calculate the number of rows
fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=unique_groups,vertical_spacing=0.08 )

for i, group in enumerate(unique_groups):
    filtered_df = inbound_no_zero[inbound_no_zero['group'] == group]
    row_index = i // n_cols + 1
    col_index = i % n_cols + 1
    
    trace1 = go.Bar(x=filtered_df.index,
        y=filtered_df['2017'],
        name=f'2017 - {group}',
        text=[f"{value/1000:.1f}k" if value < 1000 else f"{int(value/1000)}k" for value in filtered_df['2017']],  # Value labels in thousands
        textposition='auto',
        textfont=dict(size=12),
        marker_color=color_2017,
        marker_line=dict(width=6, color=[distinct_color if country == 'Russia' else color_2017 for country in filtered_df.index]) 
        )
    trace2 = go.Bar(x=filtered_df.index,
        y=filtered_df['2022'],
        name=f'2022 - {group}',
        text=[f"{value/1000:.1f}k {filtered_df['2022/2017,%'].values[j]}%" if value < 1000 else f"{int(value/1000)}k {filtered_df['2022/2017,%'].values[j]}%" for j, value in enumerate(filtered_df['2022'])],  # Value labels in thousands
        textposition='auto',
        textfont=dict(size=12),
        marker_color=color_2022,
        marker_line=dict(width=6, color=[distinct_color if country == 'Russia' else color_2022 for country in filtered_df.index]) 
        )
    
    fig.add_trace(trace1, row=row_index, col=col_index)
    fig.add_trace(trace2, row=row_index, col=col_index)

original_annotations = list(fig.layout.annotations if 'annotations' in fig.layout else [])
custom_annotation = dict(
    x=0.01,
    y=-0.1,
    text="@Conspect Labs, Sources: UNESCO, OpenDoorsData, Nuffic, IIE, Statista, MoE of Kazakhstan, Iran, Egypt, Thailand",
    showarrow=False,
    xref="paper",
    yref="paper",
    font=dict(size=12, color="grey"),
    align="center"
)       
fig.update_layout(
        title=dict(text="Входящий поток иностранных студентов по группам стран, 2017 и 2022 гг.", 
                   font=dict(size=20,color='black'), 
                #    automargin=True, 
                   yref='paper'),
        # xaxis_title='Country',
        yaxis_title='Число студентов, тыс.',
        barmode='group',  # Group bars together
        height=1400,
        width=2000,
        plot_bgcolor='white',
        annotations=original_annotations + [custom_annotation]
)
for i in range(1, num_rows + 1):
    for j in range(1, num_cols + 1):
        fig.update_xaxes(tickangle=45, row=i, col=j)
        fig.update_yaxes(gridcolor='lightgrey', row=i, col=j)
        # fig.update_xaxes(tickfont=dict(size=14,family='Times'), row=i, col=j)
        # fig.update_yaxes(tickfont=dict(size=14, family='Times'), row=i, col=j)
        fig.update_xaxes(tickfont=dict(size=12, color='black'), row=i, col=j)
        fig.update_yaxes(tickfont=dict(size=12, color='black'), row=i, col=j)
fig.update_traces(textfont=dict(size=10),row=2,col=2)
fig.update_traces(constraintext='none')
pio.write_image(fig, "total_inbound_students_by_countries_2017_2022_ru.png")
fig.update_layout(
        title=dict(text="Total inbound students by countries, 2017 и 2022 years"))
pio.write_image(fig, "total_inbound_students_by_countries_2017_2022_en.png")
# fig.show()


![Alt Text](total_inbound_students_by_countries_2017_2022_en.png)

Draw a Treemap chart to see how much the market is fragmented 

In [20]:
tree_df = inbound_no_zero[['2022']].reset_index()
tree_df['2022'] = tree_df['2022']/1000
fig = px.treemap(tree_df,
                  path=[px.Constant("all"), 'to'], 
                  values='2022',
                  title= 'Въездные потоки студентов по странам, 2022 г'
  )

fig.update_traces(root_color="lightgrey",
                textinfo="label+value",
                texttemplate="%{label}<br>%{value:.0f}K"
                )

fig.update_layout(margin = dict(t=50, l=25, r=25, b=25),
width=1000, height=1000)
fig.write_image("Inbound_students_treemap.svg", format="svg")
# fig.show()

![Alt Text](Inbound_students_treemap.svg)

Calculatuing and comparing shares of countries in world higher education market

In [21]:
#we need shares calculated for each year
inbound_no_zero['share_2017']=round(inbound_no_zero['2017']/inbound_no_zero['2017'].sum()*100,1)
inbound_no_zero['share_2022']=round(inbound_no_zero['2022']/inbound_no_zero['2022'].sum()*100,1)
inbound_no_zero.head()

Unnamed: 0_level_0,2017,2022,"2022/2017,%",group,share_2017,share_2022
to,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
USA,984170.0,1035161.0,5,Больше 100 тыс.,19.4,16.6
United Kingdom,435389.0,674440.0,55,Больше 100 тыс.,8.6,10.8
Germany,241497.0,374226.0,55,Больше 100 тыс.,4.8,6.0
Australia,375140.0,369923.0,-1,Больше 100 тыс.,7.4,5.9
Canada,205816.0,336504.0,63,Больше 100 тыс.,4.1,5.4


Draw the bar chart with shares to show their change year-to-year 

In [23]:
n_cols = 2 
color_2017 = '#3486eb'
color_2022 = '#173e6e'
distinct_color = '#DC5F00'
unique_groups = ['Больше 100 тыс.', '50-100 тыс.']

filtered_df = inbound_no_zero[inbound_no_zero['group'].isin(unique_groups)].sort_values(by='2022',ascending=False).head(20)
# row_index = i // n_cols + 1
# col_index = i % n_cols + 1
fig = go.Figure()
trace1 = go.Bar(x=filtered_df.index,
    y=filtered_df['share_2017'],
    name=f'2017',
    text = [f"{int(value)}%" if value > 10 or value == int(value) 
                            else f"{value:.1f}%"
                            for value in filtered_df['share_2017']
                            ],
    textposition='outside',
    textfont=dict(size=10),
    marker_color=color_2017,
    marker_line=dict(width=6, color=[distinct_color if country == 'Russia' else color_2017 for country in filtered_df.index]) 
    )
trace2 = go.Bar(x=filtered_df.index,
    y=filtered_df['share_2022'],
    name=f'2022',
    text = [f"{int(value)}%" if value > 10 or value == int(value) 
                            else f"{value:.1f}%" 
                            for value in filtered_df['share_2022']],
    textposition='outside',
    textfont=dict(size=10),
    marker_color=color_2022,
    marker_line=dict(width=3, color=[distinct_color if country == 'Russia' else color_2022 for country in filtered_df.index]) 
    )

fig.add_trace(trace1)
fig.add_trace(trace2)

original_annotations = list(fig.layout.annotations if 'annotations' in fig.layout else [])
custom_annotation = dict(
    x=0.01,
    y=-0.17,
    text="@Conspect Labs, Sources: UNESCO, OpenDoorsData, Nuffic, IIE, Statista, MoE of Kazakhstan, Iran, Egypt, Thailand",
    showarrow=False,
    xref="paper",
    yref="paper",
    font=dict(size=10, color="grey"),
    align="center"
)    
   
fig.update_layout(
        title=dict(text="Доли рынка международного образования, 20 стран-лидеров, 2017 и 2022 гг", 
                   font=dict(size=14,color='black'), 
                   yref='paper'),
        barmode='group',  
        height=600,
        width=1400,
        plot_bgcolor='white',
        margin = dict(t=50, l=50, r=25, b=90),
        annotations=original_annotations + [custom_annotation]
)
fig.update_xaxes(tickangle=45,tickfont=dict(size=10,color='black'))
fig.update_yaxes(gridcolor='lightgrey')
fig.update_traces(constraintext='none')

pio.write_image(fig, "top_20_world_hihger_education_market_shares_2017_2022_ru.png")
fig.update_layout(
        title=dict(text="Top 20 world international higher education markets shares, 2017 and 2022 years", 
                   font=dict(size=14,color='black'), 
                   yref='paper'))
pio.write_image(fig, "top_20_world_hihger_education_market_shares_2017_2022_en.png")
# fig.show()

![Alt Text](top_20_world_hihger_education_market_shares_2017_2022_en.png)

<h4>Outbound by Countries</h4>

Exploring outbound internationally mobile tertiary students studying abroad, all countries.

In [24]:
outbound = pd.merge(countries.groupby(['from'])['2017'].sum().reset_index(),
                 countries.groupby(['from'])['2022'].sum().reset_index(),
                 on = 'from',
                 how='left')
outbound = outbound[~(outbound['2017'].isna()) & ~(outbound['2022'].isna())]
outbound = outbound[~(outbound['2017']==0) & ~(outbound['2022']==0)]
outbound['2022/2017,%'] = round(outbound['2022']/outbound['2017']*100,0).astype(int)-100
outbound['2022/2017,%'] = outbound['2022/2017,%'].apply(lambda x:'+'+str(x) if x>0 else str(x))
outbound = outbound[outbound['2022']>5000]
def group(x):
    if x<20000:
        group_name = '5-20 тыс.'
    elif 20000<=x<50000:
        group_name = '20-50 тыс.'
    elif 50000<=x<100000:
        group_name = '50-100 тыс.'
    elif 100000<=x<200000:
        group_name = '100-200 тыс.'
    else:
        group_name = 'Больше 200 тыс.'
    return group_name
outbound['group'] = outbound['2022'].apply(lambda x: group(x))
outbound.set_index('from', inplace=True)
outbound = outbound.sort_values(by='2022',ascending=False)
outbound.head(1)

Unnamed: 0_level_0,2017,2022,"2022/2017,%",group
from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
China,839754.0,960174.0,14,Больше 200 тыс.


In [25]:
color_2017 = '#eab676'
color_2022 = '#873e23'
distinct_color = 'black'
n_cols = 2  # Number of columns in subplot grid
unique_groups = ['Больше 200 тыс.','100-200 тыс.', '50-100 тыс.', '20-50 тыс.']
num_plots = len(unique_groups)
num_cols = 2 
num_rows = (num_plots + num_cols - 1) // num_cols
fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=unique_groups,vertical_spacing=0.07)
outbound = outbound.sort_values(by=['2022'],ascending=False)
for i, group in enumerate(unique_groups):
    filtered_df = outbound[outbound['group'] == group]
    row_index = i // n_cols + 1
    col_index = i % n_cols + 1
    
    trace1 = go.Bar(x=filtered_df.index,
        y=filtered_df['2017'],
        name=f'2017 - {group}',
        text=[f"{value/1000:.1f}k" if value < 1000 else f"{int(value/1000)}k" for value in filtered_df['2017']],  # Value labels in thousands
        textposition='auto',
        textfont=dict(size=12),
        marker_color=color_2017,
        marker_line=dict(width=3, color=[distinct_color if country == 'Uzbekistan' else color_2017 for country in filtered_df.index]) 
        )
    trace2 = go.Bar(x=filtered_df.index,
        y=filtered_df['2022'],
        name=f'2022 - {group}',
        text=[f"{value/1000:.1f}k {filtered_df['2022/2017,%'].values[j]}%" if value < 1000 else f"{int(value/1000)}k {filtered_df['2022/2017,%'].values[j]}%" for j, value in enumerate(filtered_df['2022'])],  # Value labels in thousands
        textposition='auto',
        textfont=dict(size=12),
        marker_color=color_2022,
        marker_line=dict(width=3, color=[distinct_color if country == 'Uzbekistan' else color_2022 for country in filtered_df.index]) 
        )
    
    fig.add_trace(trace1, row=row_index, col=col_index)
    fig.add_trace(trace2, row=row_index, col=col_index)

original_annotations = list(fig.layout.annotations if 'annotations' in fig.layout else [])
custom_annotation = dict(
    x=0.01,
    y=-0.15,
    text="@Conspect Labs, Sources: UNESCO, OpenDoorsData, Nuffic, IIE, Statista, MoE of Kazakhstan, Iran, Egypt, Thailand",
    showarrow=False,
    xref="paper",
    yref="paper",
    font=dict(size=12, color="grey"),
    align="center"
) 
  
fig.update_layout(
    title=dict(text="Исходящий поток иностранных студентов по группам стран с 2017 по 2022 г.", 
                   font=dict(size=20, color='black'),
                   yref='paper'),
    barmode='group',     height=1200,
    width=2000,
    plot_bgcolor='white',
    annotations=original_annotations + [custom_annotation],

)
for i in range(1, num_rows + 1):
    for j in range(1, num_cols + 1):
        fig.update_xaxes(tickangle=45, row=i, col=j)
        fig.update_yaxes(gridcolor='lightgrey', row=i, col=j)
        fig.update_xaxes(tickfont=dict(size=12, color='black'), row=i, col=j)
        fig.update_yaxes(tickfont=dict(size=12, color='black'), row=i, col=j)
fig.update_traces(constraintext='none')
fig.update_traces(textfont=dict(size=10),row=2,col=2)
pio.write_image(fig, "Outbound_students_by_countries_2017_2022_ru.png")
fig.update_layout(
    title=dict(text="Outbound students by country, 2017 and 2022 years",
               font=dict(size=20, color='black'),
                   yref='paper'))
pio.write_image(fig, "Outbound_students_by_countries_2017_2022_en.png")
# fig.show()

![Alt Text](Outbound_students_by_countries_2017_2022_en.png)

<h4>Student flows inside CIS</h4>

To see how students flows change over time, we use Sankey diagram. It demands different names for sources and targets, so let's make some tweaks.

In [27]:
countries2 = countries[~(countries['2017'].isna()) & ~(countries['2022'].isna())]
countries2 = countries2[~(countries2['2017']==0) & ~(countries2['2022']==0)]
countries2['from'] = countries2['from']+ ' Out'
countries2['to'] = countries2['to']+ ' In'
countries2.head(2)

year,from,to,2017,2022
2,Afghanistan Out,Argentina In,15.0,16.0
4,Afghanistan Out,Australia In,44.0,73.0


In [28]:
cis_c = ['Russia','Moldova','Belarus','Kazakhstan','Uzbekistan',
          'Kyrgyzstan','Tajikistan','Turkmenistan',
          'Armenia','Azerbaijan','Ukraine']
cis = [item + ' Out' for item in cis_c] + [item + ' In' for item in cis_c]
cis_countries_from = countries2[countries2['from'].isin(cis)]
cis_countries_from.head(2)

year,from,to,2017,2022
953,Armenia Out,Argentina In,31.0,65.0
955,Armenia Out,Austria In,60.0,66.0


Extract top 5 target countries for every source country in 2017 and 2022.

In [None]:
cis_countries_from_2022_top10 = cis_countries_from.groupby('from').apply(lambda x: x.nlargest(5, '2022')).reset_index(drop=True)
cis_countries_from_2022_top10.drop(columns='2017',inplace=True)
cis_countries_from_2017_top10 = cis_countries_from.groupby('from').apply(lambda x: x.nlargest(5, '2017')).reset_index(drop=True)
cis_countries_from_2017_top10.drop(columns='2022',inplace=True)
cis_countries_from_top = pd.concat([cis_countries_from_2017_top10,cis_countries_from_2022_top10])
cis_countries_from_top['to'].unique()







array(['Russia In', 'France In', 'Germany In', 'USA In',
       'United Kingdom In', 'Turkey In', 'Georgia In', 'Poland In',
       'Lithuania In', 'Czechia In', 'China In', 'Kyrgyzstan In',
       'Kazakhstan In', 'Saudi Arabia In', 'Romania In', 'Italy In',
       'Belarus In', 'Malaysia In', 'Republic of Korea In', 'Latvia In',
       'Bulgaria In', 'Uzbekistan In', 'Slovakia In'], dtype=object)

In [30]:
other_color = 'rgba(189, 195, 199, 0.8)'
country_colors = {
        'Armenia': 'rgba(31, 119, 180, 0.8)',  # Blue
        'Azerbaijan': 'rgba(255, 127, 14, 0.8)',  # Orange
        'Belarus': 'rgba(44, 162, 44, 0.8)',  # Green
        'Kazakhstan': 'rgba(214, 39, 40, 0.8)',  # Red
        'Kyrgyzstan': 'rgba(148, 103, 189, 0.8)',  # Purple
        'Moldova': 'rgba(140, 86, 75, 0.8)',  # Brown
        'Russia': 'rgba(227, 119, 194, 0.8)',  # Pink
        'Tajikistan': 'rgba(127, 127, 127, 0.8)',  # Gray
        'Turkmenistan': 'rgba(188, 189, 34, 0.8)',  # Yellow-green
        'Ukraine': 'rgba(23, 190, 239, 0.8)',  # Cyan
        'Uzbekistan': 'rgba(255, 187, 120, 0.8)',   # Light Orange (to make it distinct)
        **{
        country: f'rgba({i*20 %255}, {(i*40) %255}, {(i*60) %255},0.8)'
        for i,country in enumerate([
            "France", "Germany", "USA", "United Kingdom", "Turkey",
            "Georgia", "Poland", "Lithuania", "Canada","Romania", "Italy",
            "Czechia", "Republic of Korea", "Saudi Arabia", "Cyprus","Latvia",
            "Malaysia", "Bulgaria", "Slovakia", "China","New Zealand", "India", "Egypt",
            "Morocco", "Iraq", "Viet Nam", "Nigeria", "Mongolia", "Angola","Iran","Syrian Arab Republic",
            "Australia","Netherlands","Sweden","Ireland","Japan",'Hong Kong','Macao'
        ])
    }
}

In [34]:
#Draw Sankey for every year
c_a = ('Kazakhstan Out','Kyrgyzstan Out','Tajikistan Out','Turkmenistan Out','Uzbekistan Out')
e_c = ('Russia Out','Moldova Out','Belarus Out','Ukraine Out','Armenia Out','Azerbaijan Out')
regions = {c_a:'Central_Asia',
           e_c:'Europe_Caucasus'}

for key,value in regions.items():
    print(value)
    df = cis_countries_from_top[cis_countries_from_top['from'].isin(key)]
    unique_labels = list(set(df['from'].tolist() + df['to'].tolist()))
       
    # You can use color mapping for countries. Change i and coeffs to get distinct colors.
    # country_colors = {country: f'rgba({(i+100) * 20 % 255}, {((i+50) * 200 + 100) % 255}, {((i+200) * 380) % 255}, 1)'
    #                   for i, country in enumerate(set([label.split()[0] for label in unique_labels]))}
    # Function to get sorted unique labels based on total link values

    def get_sorted_labels(df, year):
        link_sums = pd.concat([
            df.groupby('from')[year].sum(),
            df.groupby('to')[year].sum()
        ]).groupby(level=0).sum()
        return link_sums.sort_values(ascending=False).index.tolist()

    # Function to build a Sankey trace for a specific year
    def build_sankey_trace(df, year,pad_size):
        sorted_labels = get_sorted_labels(df, year)
        unique_labels_sorted = [label for label in unique_labels if label in sorted_labels]
        
        # Map source and target to indices
        source_indices = [unique_labels_sorted.index(src) for src in df['from']]
        target_indices = [unique_labels_sorted.index(tgt) for tgt in df['to']]
        
        # Assign link colors based on source country
        link_colors = [country_colors[" ".join(src.split()[:-1])].replace('1)', '0.8)') for src in df['from']]
        
        return go.Sankey(
            node=dict(
                pad=pad_size,
                thickness=50,
                line=dict(color="black", width=0.5),
                label=[f"{label} ({int(df[df['from'] == label][year].sum() + df[df['to'] == label][year].sum())})"
                    for label in unique_labels_sorted],
                color=[country_colors[" ".join(label.split()[:-1])] for label in unique_labels_sorted]
            ),
            link=dict(
                source=source_indices,
                target=target_indices,
                value=df[year],
                color=link_colors,
                label=[f"{df.iloc[i]['from']} → {df.iloc[i]['to']}: {df.iloc[i][year]}"
                    for i in range(len(df))]
            )
        )

    def generate_annotations(df, unique_labels_sorted, year):
        annotations = []
        df=df[['from','to',year]].dropna()
        for i, row in df.iterrows():
            source_idx = unique_labels_sorted.index(row['from'])
            target_idx = unique_labels_sorted.index(row['to'])
            annotations.append(dict(
                # x=x_pos, y=y_pos,
                text=f"{int(row[year])}",  # Display value as an integer
                showarrow=False,
                font=dict(size=10, color="black"),
                align="center"
            ))
        return annotations

    fig = make_subplots(
        rows=1, cols=2, 
        specs=[[{'type': 'domain'}, {'type': 'domain'}]]  # Specify 'domain' for Sankey
    )
    fig.add_trace(build_sankey_trace(df, "2017",48), row=1, col=1)
    fig.add_trace(build_sankey_trace(df, "2022",18), row=1, col=2)
    fig.update_layout(
        title_text=f"Иностранные студенты из стран СНГ, {value}, Топ-5 направлений выезда для каждой страны",
        font=dict(size=12, color='black'),
        height=800,
        width=1600,
        plot_bgcolor='white',
        annotations=[
            dict(
                x=0.2,  # Center of first subplot
                y=1.05, 
                text="2017",
                showarrow=False,
                font=dict(size=16, color="black")
            ),
            dict(
                x=0.78,  # Center of second subplot
                y=1.05, 
                text="2022",
                showarrow=False,
                font=dict(size=16, color="black"),
            ),
            dict(
                x=0.002,  # Comments
                y=-0.12, 
                text="Число студентов показано для суммы входящих и исходящих потоков стран на диаграмме. <br>@Conspect Labs, Sources: UNESCO, OpenDoorsData, Nuffic, IIE, Statista, MoE of Kazakhstan, Iran, Egypt, Thailand",
                showarrow=False,
                font=dict(size=12, color="black")
            )
        ]
    )

    output_file = f"top_5_destinations_sankey_diagrams_{value}_ru.png"  # You can change this to .png, .pdf, or .jpeg
    pio.write_image(fig, output_file, format="png", width=1600, height=800)
    
    fig.update_layout(
        title_text=f"Outbound students from the CIS countries, {value}, Top 5 destinations for every country",
        font=dict(size=12, color='black'),
        height=800,
        width=1600,
        plot_bgcolor='white',
        annotations=[
            dict(
                x=0.2,  # Center of first subplot
                y=1.05, 
                text="2017",
                showarrow=False,
                font=dict(size=16, color="black")
            ),
            dict(
                x=0.78,  # Center of second subplot
                y=1.05, 
                text="2022",
                showarrow=False,
                font=dict(size=16, color="black"),
            ),
            dict(
                x=0.002,  # Comments
                y=-0.12, 
                text="Students number is the sum of the flows at the diagram. <br>@Conspect Labs, Sources: UNESCO, OpenDoorsData, Nuffic, IIE, Statista, MoE of Kazakhstan, Iran, Egypt, Thailand",
                showarrow=False,
                font=dict(size=12, color="black")
            )
        ]
    )
    output_file = f"top_5_destinations_sankey_diagrams_{value}_en.png"  # You can change this to .png, .pdf, or .jpeg
    pio.write_image(fig, output_file, format="png", width=1600, height=800)
    # Show the figure
    # fig.show()

Central_Asia
Europe_Caucasus


![Alt Text](top_5_destinations_sankey_diagrams_Central_Asia_en.png)

<h4>Top 20 source countries for Russia</h4>

In [35]:
#Extract all source countries for Russia
inbound_country='Russia In'
inbound_to_country = countries2[countries2['to']==inbound_country]
inbound_to_country.head(1)

year,from,to,2017,2022
91,Afghanistan Out,Russia In,188.0,1602.0


Extract top 20 source countries for Russia, separately for in 2017 and 2022, as the list of 20 leading countries could change.

In [36]:
inbound_to_country_from_2017_top20 = inbound_to_country.sort_values(by='2017', ascending=False).head(20)
inbound_to_country_from_2017_top20 = inbound_to_country_from_2017_top20[['from','to','2017']]
inbound_to_country_from_2022_top20 = inbound_to_country.sort_values(by='2022', ascending=False).head(20)
inbound_to_country_from_2022_top20 = inbound_to_country_from_2022_top20[['from','to','2022']]
inbound_to_country_from_2017_top20['share'] = round(inbound_to_country_from_2017_top20['2017']/inbound_to_country_from_2017_top20['2017'].sum()*100,1)
inbound_to_country_from_2022_top20['share'] = round(inbound_to_country_from_2022_top20['2022']/inbound_to_country_from_2022_top20['2022'].sum()*100,1)
for df in [inbound_to_country_from_2017_top20,inbound_to_country_from_2022_top20]:
    df['from'] = df['from'].apply(lambda x:x.replace(' Out',''))
    df['color'] = df['from'].map(country_colors)
    print(df.head(1))

year         from         to     2017  share                   color
11603  Kazakhstan  Russia In  65237.0   33.1  rgba(214, 39, 40, 0.8)
year         from         to     2022  share                   color
11603  Kazakhstan  Russia In  53935.0   18.1  rgba(214, 39, 40, 0.8)


Draw the diagram of the main source countries of inbound students in Russia.

In [37]:
inbound_to_country_from_2017_top20.sort_values(by='2017',ascending=True,inplace=True)
inbound_to_country_from_2022_top20.sort_values(by='2022',ascending=True,inplace=True)

# Create subplots
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=['2017', '2022'],
    row_heights =[1],
    
)
trace1 = go.Bar(
    y=inbound_to_country_from_2017_top20['from'],
    x=inbound_to_country_from_2017_top20['2017'],
    orientation='h',
    name=f'2017',
    # text=[f"{value/1000:.1f}k" if value < 10000 else f"{int(value/1000)}k" for value in inbound_to_country_from_2017_top20['2017']],  # Value labels in thousands
    text=[f"{value/1000:.1f}k ({inbound_to_country_from_2017_top20['share'].values[j]}%)" if value < 10000 else f"{int(value/1000)}k ({inbound_to_country_from_2017_top20['share'].values[j]}%)" for j,value in enumerate(inbound_to_country_from_2017_top20['2017'])],  
    textposition='outside',
    textfont=dict(size=10),
    marker_color=[color for color in inbound_to_country_from_2017_top20['color']],
    cliponaxis=False 
    )
trace2 = go.Bar(
    y=inbound_to_country_from_2022_top20['from'],
    x=inbound_to_country_from_2022_top20['2022'],
    orientation='h',
    name=f'2022',
    # text=[f"{value/1000:.1f}k" if value < 10000 else f"{int(value/1000)}k" for value in inbound_to_country_from_2022_top20['2022']],  # Value labels in thousands
    text=[f"{value/1000:.1f}k ({inbound_to_country_from_2022_top20['share'].values[j]}%)" if value < 10000 else f"{int(value/1000)}k ({inbound_to_country_from_2022_top20['share'].values[j]}%)" for j,value in enumerate(inbound_to_country_from_2022_top20['2022'])],  
    textposition='outside',
    textfont=dict(size=10),
    marker_color=[color for color in inbound_to_country_from_2022_top20['color']],
    cliponaxis=False 
    )
    
fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)


fig.update_layout(
    title='Топ-20 стран по количеству студентов в России в 2017 и 2022 г. и их доли в общем кол-ве иностранных студентов',
    # yaxis_title='Число студентов',
    barmode='group',  # Group bars together
    height=600,
    width=1300,
    showlegend=False,
    font=dict(size=12, color='black'),
    yaxis=dict(
            tickfont=dict(size=10)  # Adjust font size here
        ),
        yaxis2=dict(
            tickfont=dict(size=10)  # Adjust font size here
        ),
        xaxis=dict(
            tickfont=dict(size=10)  # Adjust font size here
        ),
        xaxis2=dict(
            tickfont=dict(size=10)  # Adjust font size here
        ),
    margin=dict(l=50, r=50, t=100, b=100),
    plot_bgcolor='white',
    annotations=[
            dict(
                x=0.2,  # Center of first subplot
                y=1.05, 
                text="2017",
                showarrow=False,
                font=dict(size=10, color="black")
            ),
            dict(
                x=0.78,  # Center of second subplot
                y=1.05, 
                text="2022",
                showarrow=False,
                font=dict(size=10, color="black"),
            ),
            dict(
                x=0.4,  # Center the annotation
                y=-0.18,  # Move it below the chart
                text="@Conspect Labs, Sources: UNESCO, OpenDoorsData, Nuffic, IIE, Statista, MoE of Kazakhstan, Iran, Egypt, Thailand",
                showarrow=False,
                xref="paper",  # Use paper coordinates
                yref="paper",
                font=dict(size=12, color="grey"),
                align="left"
        )
        ]
)

pio.write_image(fig, "top_20_countries_inbound_to_russia_2017_2022_ru.png")
    
fig.update_layout(
    title=f'Top 20 Countries by Number of Students in Russia in 2017 and 2022 and Their Shares in the Total Number of International Students'
)
    
pio.write_image(fig, "top_20_countries_inbound_to_russia_2017_2022_en.png")
# fig.show()

![Alt Text](top_20_countries_inbound_to_russia_2017_2022_en.png)

Let's see how concentrated the international higher education market is in Russia. Russian universities primarily depend on Central Asian countries and China — 60% of students come from Kazakhstan, Uzbekistan, Turkmenistan, Tajikistan, and China. Market diversification is occurring, but not quickly: in 2017, the top 10 donor countries to Russia accounted for 85% of student inflows, while in 2022, this figure dropped to 78%.

In [38]:
print(inbound_to_country_from_2017_top20['2017'].sum())

197050.0


In [39]:
inbound_to_country.sort_values(by='2017', ascending=False).head(5)['2017'].sum()/inbound_to_country['2017'].sum()

np.float64(0.6293490911499484)

In [40]:
inbound_to_country.sort_values(by='2022', ascending=False).head(5)['2022'].sum()/inbound_to_country['2022'].sum()

np.float64(0.5899888186650563)

In [41]:
inbound_to_country.sort_values(by='2017', ascending=False).head(10)['2017'].sum()/inbound_to_country['2017'].sum()

np.float64(0.8511728470993443)

In [42]:
inbound_to_country.sort_values(by='2022', ascending=False).head(10)['2022'].sum()/inbound_to_country['2022'].sum()

np.float64(0.7812135634974259)

<h4>Russia's share in outbound students flows by country, 2017 and 2022</h4>

Draw a chart to see how Russia's share on distinct markets changed over time.

In [44]:
results = []
for every in countries2['from'].unique():
    source = countries2[countries2['from']==every]
    
    source_sum_17 = source[source['from']==every]['2017'].sum()
    russia_in_17 = source[source['to']=='Russia In']['2017'].sum()
    share_17 = round(russia_in_17/source_sum_17*100,1)
    
    source_sum_22 = source[source['from']==every]['2022'].sum()
    russia_in_22 = source[source['to']=='Russia In']['2022'].sum()
    share_22 = round(russia_in_22/source_sum_22*100,1)
    results.append({
        'Country': every,
        'Total_2017': source_sum_17,
        'Russia_In_2017': russia_in_17,
        'Share_to_Russia_%_2017': share_17,
        'Total_2022': source_sum_22,
        'Russia_In_2022': russia_in_22,
        'Share_to_Russia_%_2022': share_22,
    })
results_df = pd.DataFrame(results)
results_df['total_change'] = round((results_df['Total_2022']/results_df['Total_2017']-1)*100,1)
results_df['Russia_in_change'] = round((results_df['Russia_In_2022']/results_df['Russia_In_2017']-1)*100,1)
results_df = results_df.reset_index()
results_df = results_df.drop(columns=['index']).sort_values(by='Share_to_Russia_%_2022',ascending=False)
results_df.head(2)

Unnamed: 0,Country,Total_2017,Russia_In_2017,Share_to_Russia_%_2017,Total_2022,Russia_In_2022,Share_to_Russia_%_2022,total_change,Russia_in_change
181,Tajikistan Out,19156.0,14204.0,74.1,31553.0,27086.0,85.8,64.7,90.7
96,Kazakhstan Out,95570.0,65237.0,68.3,92404.0,53935.0,58.4,-3.3,-17.3


Draw the diagram of how Russia's share in outbound students flows has changed since 2017.

In [41]:
color_2017 = '#b380e0'
color_2022 = '#5e3b7d'
fig = go.Figure()
results_df_1 = results_df
#select countries with more then 2% Russia's share and inbound flow more then 500
results_df_1 = results_df_1[results_df_1['Share_to_Russia_%_2022']>=2]
results_df_1 = results_df_1[results_df_1['Russia_In_2022']>=500]
trace1 = go.Bar(
    x=[country.replace(' Out', '') for country in results_df_1['Country']],
    y=results_df_1['Share_to_Russia_%_2017'],
    name='2017',
    text=[f"{int(value)}%" if value >= 10 or value == int(value) else f"{value:.1f}%"   for value in results_df_1['Share_to_Russia_%_2017']],  # Value labels in thousands
    textposition='outside',
    textfont=dict(size=9),
    marker_color=color_2017
    )
trace2 = go.Bar(
    x=[country.replace(' Out', '') for country in results_df_1['Country']],
    y=results_df_1['Share_to_Russia_%_2022'],
    name='2022',
    text = [f"{int(value)}%" if value >= 10 or value == int(value) else f"{value:.1f}%"  for value in results_df_1['Share_to_Russia_%_2022']],
    textposition='outside',
    textfont=dict(size=9),
    marker_color=color_2022
    )
   
fig.add_trace(trace1)
fig.add_trace(trace2)   

original_annotations = list(fig.layout.annotations if 'annotations' in fig.layout else [])
custom_annotation = dict(
    x=0.01,
    y=-0.17,
    text="@Conspect Labs, Sources: UNESCO, OpenDoorsData, Nuffic, IIE, Statista, MoE of Kazakhstan, Iran, Egypt, Thailand",
    showarrow=False,
    xref="paper",
    yref="paper",
    font=dict(size=10, color="grey"),
    align="center"
) 
fig.update_layout(
    title='Доля России в выездном потоке студентов по странам в 2017 и 2022 гг',
    font=dict(size=12, color='black'),
    barmode='group', 
    height=600,
    width=1600,
    yaxis=dict(title='', showgrid=True),
    plot_bgcolor='white',
    margin = dict(t=50, l=50, r=25, b=90),
        annotations=original_annotations + [custom_annotation]
)
fig.update_traces(constraintext='none')
pio.write_image(fig, "Russia_share_outbound_flows_by_country_ru.png")
fig.update_layout(
    title="Russia's Share in Student Outflow by Countries in 2017 and 2022"
)
pio.write_image(fig, "Russia_share_outbound_flows_by_country_en.png")
# fig.show()

![Alt Text](Russia_share_outbound_flows_by_country_en.png)

<h4>Where do students from major markets go?</h4>

The top global markets for international education are China (over 900,000 outbound students per year), India (over 700,000), Vietnam and Uzbekistan (over 140,000 each), Germany and France (over 120,000), Pakistan (115,000), Nigeria (107,000), Nepal, the USA, and Syria (over 90,000). Let's see, how the student flows distributed among these countries.

In [45]:
all_out = countries2.groupby(['from'])['2017'].sum().reset_index()

In [46]:
top_out_ru = {'Индия':'India Out','Китай':'China Out',
           'Узбекистан':'Uzbekistan Out',
           'Вьетнам':'Viet Nam Out',
           'Азербайджан':'Azerbaijan Out',
           'Беларусь':'Belarus Out',
           'Казахстан':'Kazakhstan Out',
           'Туркменистан':'Turkmenistan Out'}
        #    'Germany Out','France Out','Pakistan Out','Nigeria Out','Nepal Out','USA Out',' Syrian Arab Republic Out']

In [47]:
top_out_en = {'India':'India Out','China':'China Out',
           'Uzbekistan':'Uzbekistan Out', 'Viet Nam':'Viet Nam Out',
           'Azerbaijan':'Azerbaijan Out','Belarus':'Belarus Out',
           'Kazakhstan':'Kazakhstan Out','Turkmenistan':'Turkmenistan Out',
           'Germany':'Germany Out','France':'France Out',
           'Pakistan':'Pakistan Out','Nigeria':'Nigeria Out',
           'Nepal':'Nepal Out','USA':'USA Out',
           'Syria':'Syrian Arab Republic Out'}

Draw a chart for outbound students flow for every country for 2017 and 2022 years 

In [51]:
for country,otbound_country in top_out_en.items():
    outbound_from_country = countries2[countries2['from']==otbound_country]
    outbound_from_country_2017_top20 = outbound_from_country[['from','to','2017']]
    outbound_from_country_2017_top20['share_2017'] = round(outbound_from_country_2017_top20['2017']/outbound_from_country_2017_top20['2017'].sum()*100,1)
    outbound_from_country_2017_top20['to'] = outbound_from_country_2017_top20['to'].apply(lambda x:x.replace(' In',''))
    outbound_from_country_2017_top20 = outbound_from_country_2017_top20[['to','2017','share_2017']].set_index('to').reset_index()
    outbound_from_country_2017_top20['color'] = outbound_from_country_2017_top20['to'].map(country_colors)

    outbound_from_country_2022_top20 = outbound_from_country[['from','to','2022']]
    outbound_from_country_2022_top20['share_2022'] = round(outbound_from_country_2022_top20['2022']/outbound_from_country_2022_top20['2022'].sum()*100,1)
    outbound_from_country_2022_top20['to'] = outbound_from_country_2022_top20['to'].apply(lambda x:x.replace(' In',''))
    outbound_from_country_2022_top20 = outbound_from_country_2022_top20[['to','2022','share_2022']].set_index('to').reset_index()
    outbound_from_country_2022_top20['color'] = outbound_from_country_2022_top20['to'].map(country_colors)

    outbound_from_country_2017_top20 = outbound_from_country_2017_top20.sort_values(by='2017',ascending=False).head(10).sort_values(by='2017',ascending=True)
    outbound_from_country_2022_top20 = outbound_from_country_2022_top20.sort_values(by='2022',ascending=False).head(10).sort_values(by='2022',ascending=True)
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=['2017', '2022'],
        row_heights =[1],
        horizontal_spacing=0.12,
        column_widths = [0.4, 0.4],
    )
    trace1 = go.Bar(
        y=outbound_from_country_2017_top20['to'],
        x=outbound_from_country_2017_top20['2017'],
        orientation='h',
        name=f'2017',
        text=[f"{value/1000:.1f}k ({outbound_from_country_2017_top20['share_2017'].values[j]}%)" if value < 10000 else f"{int(value/1000)}k ({outbound_from_country_2017_top20['share_2017'].values[j]}%)" for j,value in enumerate(outbound_from_country_2017_top20['2017'])],
        textposition='outside',
        textfont=dict(size=10),
        marker_color=[color for color in outbound_from_country_2017_top20['color']],
        cliponaxis=False 
        )
    trace2 = go.Bar(
        y=outbound_from_country_2022_top20['to'],
        x=outbound_from_country_2022_top20['2022'],
        orientation='h',
        name=f'2022',
        text=[f"{value/1000:.1f}k ({outbound_from_country_2022_top20['share_2022'].values[j]}%)" if value < 10000 else f"{int(value/1000)}k ({outbound_from_country_2022_top20['share_2022'].values[j]}%)" for j,value in enumerate(outbound_from_country_2022_top20['2022'])],  
        textposition='outside',
        textfont=dict(size=10),
        marker_color=[color for color in outbound_from_country_2022_top20['color']],
        cliponaxis=False 
        )
        
    fig.add_trace(trace1, row=1, col=1)
    fig.add_trace(trace2, row=1, col=2)
            
    fig.update_layout(
        title=f'Top-10 Countries for Student Outflow, {country}, 2017 and 2022',
        font=dict(size=12, color='black'),
        barmode='group', 
        yaxis=dict(
            tickfont=dict(size=10)  # Adjust font size for every subplot
        ),
        yaxis2=dict(
            tickfont=dict(size=10) 
        ),
        xaxis=dict(
            tickfont=dict(size=10)
        ),
        xaxis2=dict(
            tickfont=dict(size=10) 
        ),
        height=400,
        width=1100,
        showlegend=False,
        margin=dict(l=20, r=60, t=100, b=100),
        plot_bgcolor='white',
        annotations=[
                dict(
                    x=0.2,  # Center of first subplot
                    y=1.05, 
                    text="2017",
                    showarrow=False,
                    font=dict(size=8, color="black"),
                ),
                dict(
                    x=0.78,  # Center of second subplot
                    y=1.05, 
                    text="2022",
                    showarrow=False,
                    font=dict(size=8, color="black"),
                ),
                dict(
                    x=0.005,  # Center the annotation
                    y=-0.25,  # Move it below the chart
                    text="@Conspect Labs, Sources: UNESCO, OpenDoorsData, Nuffic, IIE, Statista, MoE of Kazakhstan, Iran, Egypt, Thailand",
                    showarrow=False,
                    xref="paper",  # Use paper coordinates
                    yref="paper",
                    font=dict(size=10, color="grey"),
                    align="left"
            )
            ]
    )
pio.write_image(fig, f"Top-10_Countries_for_Student_Outflow_({country})_2017_2022.png")
    
    # Create a second version with Russian titles
fig.update_layout(
    title=f'Топ-10 стран по направлениям выезда студентов, {country}, 2017 и 2022 гг.'
)

pio.write_image(fig, f"Топ-10 стран по направлениям выезда студентов ({country}), 2017 и 2022 гг.png")
    

![Alt Text](Top-10_Countries_for_Student_Outflow_(China)_2017_2022.png)

![Alt Text](Top-10_Countries_for_Student_Outflow_(India)_2017_2022.png)

![Alt Text](Top-10_Countries_for_Student_Outflow_(Uzbekistan)_2017_2022.png)

![Alt Text](Top-10_Countries_for_Student_Outflow_(Viet_Nam)_2017_2022.png)