In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
warnings.filterwarnings("ignore")
plt.style.use('ggplot')
pd.set_option('display.max_columns', None)

In [None]:
!pip install jupyter-dash pyngrok -q

In [None]:
# from pyngrok import ngrok
# tunnel = ngrok.connect(8096)

from dash import Dash, html, dcc
from jupyter_dash import JupyterDash
from dash.dependencies import Output, Input
from dash.exceptions import PreventUpdate

In [None]:
# Read the ranking table from a CSV expected in the working directory, then preview the first rows
data = pd.read_csv('world_university_rank.csv')
data.head()

In [None]:
#Data manipulation / cleaning

In [None]:
# Column dtypes and non-null counts, to spot fields that need conversion
data.info()

In [None]:
# Number of missing values per column
data.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row
data.duplicated().sum()

In [None]:
# Keep only rows that have a value in every column
data = data.dropna()

In [None]:
# Normalise the doubled column header to a plain 'Location'
data = data.rename(columns={'locationLocation': 'Location'})

In [None]:
# Fix the misspelled header ('Studnet' -> 'Student')
data = data.rename(columns={'Number of Studnet': 'Number of Student'})

In [None]:
# Give the student/staff ratio column a consistently capitalised name
data = data.rename(columns={'Number of student per staffs': 'Number of Student per Staff'})

In [None]:
# Discard entries whose Rank is the literal 'Reporter' (they carry no rank position)
data = data.loc[data['Rank'] != 'Reporter'].copy()

In [None]:
# Strip the thousands separators and store the student totals as integers
data['Number of Student'] = (
    data['Number of Student']
    .str.replace(',', '')
    .astype('int')
)

In [None]:
# Store the student/staff ratio as a float column
data = data.astype({'Number of Student per Staff': 'float'})

In [None]:
# Make it explicit in the header that this column is a percentage
data = data.rename(columns={'International Student': 'International Student (percentage)'})

In [None]:
# Remove the '%' sign; entries left empty by that are treated as '0'
data['International Student (percentage)'] = (
    data['International Student (percentage)']
    .str.replace('%', '')
    .replace('', '0')
)

In [None]:
# Convert the percentage text to a fraction between 0 and 1
data['International Student (percentage)'] = (
    data['International Student (percentage)'].astype('float').div(100)
)

In [None]:
#Male and Female Student Counts

In [None]:
def female(data):
    """Return the text before the first ':' of every 'Female : male ratio' entry.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Female : male ratio' column of strings.

    Returns
    -------
    list of str
        One entry per row, in row order; surrounding whitespace is kept as-is.
    """
    return [ratio.split(':')[0] for ratio in data['Female : male ratio']]

# New column holding the part of each ratio before the colon (still text at this point)
data['Female'] = female(data)

In [None]:
def male(data):
    """Return the text after the first ':' of every 'Female : male ratio' entry.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Female : male ratio' column of strings.

    Returns
    -------
    list of str
        One entry per row, in row order; surrounding whitespace is kept as-is.

    Raises
    ------
    IndexError
        If an entry contains no ':'.
    """
    return [ratio.split(':')[1] for ratio in data['Female : male ratio']]

# New column holding the part of each ratio after the colon (still text at this point)
data['Male'] = male(data)

In [None]:
# Cast both ratio halves to integers once, then reuse them for the head counts
data['Female'] = data['Female'].astype(int)
data['Male'] = data['Male'].astype(int)

# Head count = total students * share (shares are whole percentages), rounded to the nearest unit
data['Female Cnt'] = (data['Number of Student'] * (data['Female'] / 100)).round()
data['Male Cnt'] = (data['Number of Student'] * (data['Male'] / 100)).round()

# The combined ratio text is no longer needed once both halves are stored separately
data.drop(columns='Female : male ratio', inplace=True)

In [None]:
# Old name -> new name for the two share columns
share_names = {'Female': 'Female (percentage)', 'Male': 'Male (percentage)'}

# Turn whole percentages into fractions, then rename the columns to say so
for share_col in share_names:
    data[share_col] = data[share_col].div(100)

data.rename(columns=share_names, inplace=True)

In [None]:
# The rounded head counts are still floats; store them as integers
data = data.astype({'Female Cnt': int, 'Male Cnt': int})

In [None]:
#International Student Cnt
# total students * international fraction, rounded and stored as an integer
data['International Student Cnt'] = (
    data['Number of Student']
    .mul(data['International Student (percentage)'])
    .round()
    .astype(int)
)

In [None]:
# Renumber rows 0..n-1 after the row removals above, discarding the old labels
data = data.reset_index(drop=True)

In [None]:
# Countries grouped by continent; flattened below into the country -> continent lookup.
# NOTE(review): the lookup holds the same pairs as before, but keys are now inserted
# continent by continent rather than in the original listing order.
_continent_members = {
    'Europe': [
        'United Kingdom', 'Switzerland', 'Germany', 'Belgium', 'France', 'Sweden',
        'Netherlands', 'Finland', 'Denmark', 'Austria', 'Norway', 'Italy',
        'Ireland', 'Spain', 'Luxembourg', 'Hungary', 'Estonia', 'Iceland',
        'Portugal', 'Poland', 'Greece', 'Cyprus', 'Ukraine', 'Romania',
        'Czech Republic', 'Northern Cyprus', 'Serbia', 'Latvia', 'Lithuania',
        'Slovenia', 'Malta', 'Slovakia', 'Georgia', 'Croatia', 'Bulgaria',
        'Montenegro',
    ],
    'North America': [
        'United States', 'Canada', 'Jamaica', 'Costa Rica', 'Mexico',
        'Puerto Rico', 'Cuba',
    ],
    'Asia': [
        'Singapore', 'Hong Kong', 'China', 'Japan', 'South Korea', 'Saudi Arabia',
        'Taiwan', 'Macao', 'Qatar', 'Israel', 'India', 'United Arab Emirates',
        'Brunei Darussalam', 'Lebanon', 'Philippines', 'Iran', 'Malaysia',
        'Turkey', 'Vietnam', 'Pakistan', 'Palestine', 'Jordan', 'Sri Lanka',
        'Iraq', 'Bangladesh', 'Oman', 'Thailand', 'Kuwait', 'Nepal',
        'Kazakhstan', 'Indonesia', 'Azerbaijan',
    ],
    'Oceania': ['Australia', 'New Zealand', 'Fiji'],
    'Africa': [
        'South Africa', 'Ghana', 'Egypt', 'Algeria', 'Nigeria', 'Tanzania',
        'Ethiopia', 'Zambia', 'Mozambique', 'Kenya', 'Namibia', 'Uganda',
        'Botswana', 'Morocco', 'Tunisia', 'Mauritius', 'Zimbabwe',
    ],
    'South America': [
        'Brazil', 'Chile', 'Colombia', 'Peru', 'Ecuador', 'Argentina',
        'Venezuela',
    ],
}

# Country name -> continent name, used to label each university's location
nations_to_continents = {
    nation: continent
    for continent, nations in _continent_members.items()
    for nation in nations
}

# Label every university with the continent of its location
data['Continent'] = data['Location'].map(nations_to_continents)

# Locations missing from the lookup become NaN and would silently drop out of the
# continent-level charts; report them (printed, because warnings are silenced globally)
unmapped_locations = sorted(data.loc[data['Continent'].isna(), 'Location'].unique())
if unmapped_locations:
    print(f'Locations without a continent mapping: {unmapped_locations}')

In [None]:
#General Visualization

In [None]:
#Histograms
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(id='dropdown',
                # every non-text column except 'Rank' can be plotted
                options=[x for x in data.columns if data[x].dtype!='O' and x!='Rank'],
                value='Number of Student'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(column):
    """Return a 30-bin histogram of the selected column of `data`.

    Parameters
    ----------
    column : str or None
        Column picked in the dropdown; None when the selection is cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected, so the previous figure stays on screen.
    """
    # Clearing the dropdown fires the callback with None; keep the last figure
    if column is None:
        raise PreventUpdate

    fig = px.histogram(
        data,
        x=column,
        nbins=30,
        title=f'{column}'
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8096)

In [None]:
#Top 20 Countries with most Top universities 

# Universities per location, most frequent first. The column names are pinned
# explicitly ('index' = country, 'Location' = count) because a bare
# value_counts().reset_index() names them differently on pandas >= 2.0
# ('Location'/'count'), which would break the column references below.
df = (
    data['Location']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Location')
)

px.pie(
    df[:20],
    names='index',
    values='Location',
    title='Top 20 Countries with most Top Universities' 
).update_traces(textposition='inside', textinfo='percent+label')

In [None]:
# Names of the ten most frequent locations. The country column is taken by position
# (first column) because its label depends on the pandas version ('index' vs 'Location').
df.iloc[:10, 0].unique()

In [None]:
#Boxplot for {column} in each Top 10 Countries with most Universities
app = JupyterDash(__name__)

# The ten locations with the most universities, derived from the data so the chart
# stays correct if the dataset changes (previously a hard-coded list)
top10_locations = data['Location'].value_counts().index[:10].tolist()

app.layout = html.Div([
    # this dropdown selects a column, not a country
    html.H3('Column Selection:'),
    dcc.Dropdown(id='dropdown',
                options=[x for x in data.columns if data[x].dtype != 'O' and x != 'Rank'],
                value='Number of Student'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(column):
    """Return per-country box plots of `column` for the ten most represented locations.

    Parameters
    ----------
    column : str or None
        Column picked in the dropdown; None when the selection is cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected, so the previous figure stays on screen.
    """
    # Clearing the dropdown fires the callback with None; keep the last figure
    if column is None:
        raise PreventUpdate

    dfm = data.loc[data['Location'].isin(top10_locations)]

    fig = px.box(
        dfm,
        y=column,
        x='Location',
        color='Location',
        title=f'{column} in each Top 10 Countries with Most Universities'
    ).update_layout(height=570)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9000)

In [None]:
# Top 15 universities with highest {columns}
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Selection:'),
    dcc.Dropdown(id='dropdown',
                options=[{'label':x.title(), 'value':x} for x in data.columns if data[x].dtype != 'O'],
                value='Number of Student'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(column):
    """Return a bar chart of the 15 universities with the largest value of `column`.

    Bars are coloured by 'International Student Cnt'.

    Parameters
    ----------
    column : str or None
        Column picked in the dropdown; None when the selection is cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected (sorting by None would raise), so the
        previous figure stays on screen.
    """
    if column is None:
        raise PreventUpdate

    df = data.sort_values(column, ascending=False).head(15)

    fig = px.bar(
        df,
        y=column,
        x='University name',
        text_auto=True,
        color='International Student Cnt',
        labels={'University name':'Universities'},
        title=f'Top 15 Universities with Highest {column}'
    ).update_layout(
        # centre the title horizontally
        title={
            'x':0.5,
            'y':0.88,
            'xanchor':'center'},
        height=600
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8091)

In [None]:
#Top 15 Countries with most Total/Avg {columns}
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Selection:'),
    dcc.Dropdown(id='dropdown',
                options=[{'label':x.title(), 'value':x} for x in data.columns if data[x].dtype == 'int64' and x!='Rank'],
                value='Number of Student'),
    html.Br(),
    dcc.RadioItems(id='items',
                  options=[{'label':x.title(), 'value':x} for x in ['sum', 'mean', 'median']],
                  value='sum'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'), Input('items', 'value'))
def plot(column, metric):
    """Return a bar chart of the 15 locations with the highest aggregated `column`.

    Parameters
    ----------
    column : str or None
        Integer column picked in the dropdown; None when the selection is cleared.
    metric : str or None
        Aggregation chosen in the radio items: 'sum', 'mean' or 'median'.

    Raises
    ------
    PreventUpdate
        When either input is missing, so the previous figure stays on screen.
    """
    if column is None or metric is None:
        raise PreventUpdate

    df = (
        data.groupby('Location', as_index=False)
        .agg({column: metric})
        .sort_values(column, ascending=False)
        .head(15)
    )

    fig = px.bar(
        df,
        y=column,
        x='Location',
        text_auto=True,
        color='Location',
        title=f'Top 15 Countries Highest {column} ({metric})'
    ).update_layout(
        # centre the title horizontally
        title={
            'x':0.5,
            'y':0.88,
            'xanchor':'center'},
        height=600
    ).update_traces(showlegend=False)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8092)

In [None]:
#choropleth map 
# Per-location totals. numeric_only=True keeps text columns (university names, rank
# labels, ...) out of the sum; without it pandas >= 2.0 concatenates those strings.
df = data.groupby('Location', as_index=False).sum(numeric_only=True)

# Keep 'Location' plus the integer columns. The explicit copy makes dfm an independent
# frame, so adding columns to it later does not touch df.
dfm = df.loc[:, [x for x in data.columns if data[x].dtype == 'int64' or x == 'Location']].copy()

In [None]:
# Location name -> three-letter country code, listed alphabetically by name.
# NOTE(review): 'Northern Cyprus' reuses 'CYP', the same code as 'Cyprus', so both
# rows end up on the same map region — confirm this is intended.
country_dict = {
    'Algeria': 'DZA', 'Argentina': 'ARG', 'Australia': 'AUS', 'Austria': 'AUT',
    'Azerbaijan': 'AZE', 'Bangladesh': 'BGD', 'Belgium': 'BEL', 'Botswana': 'BWA',
    'Brazil': 'BRA', 'Brunei Darussalam': 'BRN', 'Bulgaria': 'BGR', 'Canada': 'CAN',
    'Chile': 'CHL', 'China': 'CHN', 'Colombia': 'COL', 'Costa Rica': 'CRI',
    'Croatia': 'HRV', 'Cuba': 'CUB', 'Cyprus': 'CYP', 'Czech Republic': 'CZE',
    'Denmark': 'DNK', 'Ecuador': 'ECU', 'Egypt': 'EGY', 'Estonia': 'EST',
    'Ethiopia': 'ETH', 'Fiji': 'FJI', 'Finland': 'FIN', 'France': 'FRA',
    'Georgia': 'GEO', 'Germany': 'DEU', 'Ghana': 'GHA', 'Greece': 'GRC',
    'Hong Kong': 'HKG', 'Hungary': 'HUN', 'Iceland': 'ISL', 'India': 'IND',
    'Indonesia': 'IDN', 'Iran': 'IRN', 'Iraq': 'IRQ', 'Ireland': 'IRL',
    'Israel': 'ISR', 'Italy': 'ITA', 'Jamaica': 'JAM', 'Japan': 'JPN',
    'Jordan': 'JOR', 'Kazakhstan': 'KAZ', 'Kenya': 'KEN', 'Kuwait': 'KWT',
    'Latvia': 'LVA', 'Lebanon': 'LBN', 'Lithuania': 'LTU', 'Luxembourg': 'LUX',
    'Macao': 'MAC', 'Malaysia': 'MYS', 'Malta': 'MLT', 'Mauritius': 'MUS',
    'Mexico': 'MEX', 'Montenegro': 'MNE', 'Morocco': 'MAR', 'Mozambique': 'MOZ',
    'Namibia': 'NAM', 'Nepal': 'NPL', 'Netherlands': 'NLD', 'New Zealand': 'NZL',
    'Nigeria': 'NGA', 'Northern Cyprus': 'CYP', 'Norway': 'NOR', 'Oman': 'OMN',
    'Pakistan': 'PAK', 'Palestine': 'PSE', 'Peru': 'PER', 'Philippines': 'PHL',
    'Poland': 'POL', 'Portugal': 'PRT', 'Puerto Rico': 'PRI', 'Qatar': 'QAT',
    'Romania': 'ROU', 'Saudi Arabia': 'SAU', 'Serbia': 'SRB', 'Singapore': 'SGP',
    'Slovakia': 'SVK', 'Slovenia': 'SVN', 'South Africa': 'ZAF', 'South Korea': 'KOR',
    'Spain': 'ESP', 'Sri Lanka': 'LKA', 'Sweden': 'SWE', 'Switzerland': 'CHE',
    'Taiwan': 'TWN', 'Tanzania': 'TZA', 'Thailand': 'THA', 'Tunisia': 'TUN',
    'Turkey': 'TUR', 'Uganda': 'UGA', 'Ukraine': 'UKR', 'United Arab Emirates': 'ARE',
    'United Kingdom': 'GBR', 'United States': 'USA', 'Venezuela': 'VEN',
    'Vietnam': 'VNM', 'Zambia': 'ZMB', 'Zimbabwe': 'ZWE',
}

# Parallel name/code lists, in the same order as the mapping above
country_names = list(country_dict)
country_codes = list(country_dict.values())
# Three-letter code per location, used as the map key; locations absent from country_dict get NaN
dfm['Code'] = dfm['Location'].map(country_dict)

In [None]:
# Preview the per-location frame, including the new 'Code' column
dfm.head()

In [None]:
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Selection:'),
    dcc.Dropdown(id='dropdown',
                 # every aggregated column except the map keys and 'Rank'
                 options=[x for x in dfm.columns if x!='Location' and x!='Code' and x!= 'Rank'],
                 value='Number of Student'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(column):
    """Return a world choropleth of `dfm` coloured by the selected column.

    Parameters
    ----------
    column : str or None
        Column of `dfm` picked in the dropdown; None when the selection is cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected, so the previous map stays on screen.
    """
    # Clearing the dropdown fires the callback with None; keep the last figure
    if column is None:
        raise PreventUpdate

    fig = px.choropleth(
        dfm,
        locations="Code",
        color=column,
        hover_name="Location",
        color_continuous_scale=px.colors.sequential.Plasma,
        title=f'University Location {column} Map Distribution'
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8093)

In [None]:
#University analysis among each location
#Top 10 universities (by rank) with their value for the selected column

In [None]:
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Country Selection:'),
    dcc.Dropdown(id='dropdown1',
                options=data.Location.unique(),
                value='United States'),
    html.Br(),
    dcc.Dropdown(id='dropdown2',
                  options=[x for x in data.columns if data[x].dtype != 'O' and x != 'Rank'],
                  value='Number of Student'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown1', 'value'), Input('dropdown2', 'value'))
def plot(country, column):
    """Return a bar chart of `column` for the ten best-ranked universities in `country`.

    Parameters
    ----------
    country : str or None
        Location picked in the first dropdown; None when cleared.
    column : str or None
        Column picked in the second dropdown; None when cleared.

    Raises
    ------
    PreventUpdate
        When either dropdown is empty, so the previous figure stays on screen.
    """
    if country is None or column is None:
        raise PreventUpdate

    # Boolean mask instead of an f-string query: names containing quotes cannot break it
    subset = data[data['Location'] == country]

    # 'Rank' is a text column, so sorting it directly is lexicographic ('10' before '2').
    # Sort by the first number in each label instead; the stable sort keeps the original
    # row order among equal keys, and labels without a number go last.
    rank_order = pd.to_numeric(
        subset['Rank'].astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )
    df = (
        subset.assign(_rank_order=rank_order)
        .sort_values('_rank_order', kind='mergesort')
        .drop(columns='_rank_order')
        .head(10)
    )

    fig = px.bar(
        df,
        x=column,
        y='University name',
        text_auto=True,
        labels={'University name':'University Name'},
        color='University name',
        title=f'Top 10 Universities in {country}'
    ).update_layout(
        height=600)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8094)

In [None]:
#Top 10 Universities in {country} with Highest {column}
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Country Selection:'),
    dcc.Dropdown(id='dropdown1',
                options=data.Location.unique(),
                value='United States'),
    html.Br(),
    dcc.Dropdown(id='dropdown2',
                  options=[x for x in data.columns if data[x].dtype != 'O' and x != 'Rank'],
                  value='Number of Student'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown1', 'value'), Input('dropdown2', 'value'))
def plot(country, column):
    """Return a bar chart of the ten universities in `country` with the largest `column`.

    Parameters
    ----------
    country : str or None
        Location picked in the first dropdown; None when cleared.
    column : str or None
        Column picked in the second dropdown; None when cleared.

    Raises
    ------
    PreventUpdate
        When either dropdown is empty (sorting by None would raise), so the
        previous figure stays on screen.
    """
    if country is None or column is None:
        raise PreventUpdate

    # Boolean mask instead of an f-string query: names containing quotes cannot break it
    df = (
        data[data['Location'] == country]
        .sort_values(column, ascending=False)
        .head(10)
    )

    fig = px.bar(
        df,
        x=column,
        y='University name',
        text_auto=True,
        labels={'University name':'University Name'},
        color='University name',
        title=f'Top 10 Universities in {country} with Highest {column}'
    ).update_layout(
        height=600)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8095)

In [None]:
#How about on a continent level? 

In [None]:
# Universities per continent, most frequent first. The column names are pinned
# explicitly ('index' = continent, 'Continent' = count) because a bare
# value_counts().reset_index() names them differently on pandas >= 2.0
# ('Continent'/'count'), which would break the column references below.
df = (
    data['Continent']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Continent')
)

fig = px.pie(
    df,
    names='index',
    values='Continent',
    title='Continent Distribution with most Top Universities' 
).update_traces(textposition='inside', textinfo='percent+label')

fig.show()

In [None]:
#Top 10 Universities in each Continents

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3("Selection:"),
    dcc.Dropdown(id='dropdown1',
                 # drop NaN (locations without a continent mapping) so every option is a real name
                 options=data['Continent'].dropna().unique(),
                 value='North America'),
    html.Br(),
    dcc.Dropdown(id='dropdown2',
                  options=[x for x in data.columns if data[x].dtype != 'O' and x != 'Rank'],
                  value='Number of Student'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown1', 'value'), Input('dropdown2', 'value'))
def plot(location, column):
    """Return a bar chart of `column` for the ten best-ranked universities on a continent.

    Parameters
    ----------
    location : str or None
        Continent picked in the first dropdown; None when cleared.
    column : str or None
        Column picked in the second dropdown; None when cleared.

    Raises
    ------
    PreventUpdate
        When either dropdown is empty, so the previous figure stays on screen.
    """
    if location is None or column is None:
        raise PreventUpdate

    # Boolean mask instead of an f-string query: names containing quotes cannot break it
    subset = data[data['Continent'] == location]

    # 'Rank' is a text column, so sorting it directly is lexicographic ('10' before '2').
    # Sort by the first number in each label instead; the stable sort keeps the original
    # row order among equal keys, and labels without a number go last.
    rank_order = pd.to_numeric(
        subset['Rank'].astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )
    df = (
        subset.assign(_rank_order=rank_order)
        .sort_values('_rank_order', kind='mergesort')
        .drop(columns='_rank_order')
        .head(10)
    )

    fig = px.bar(
        df,
        y='University name',
        x=column,
        text_auto=True,
        color='University name',
        title=f'Top 10 Universities in {location}'
    ).update_layout(
        height=600)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8097)

In [None]:
#Top 10 Universities in {location} with Highest {column}
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3("Selection:"),
    dcc.Dropdown(id='dropdown1',
                 # drop NaN (locations without a continent mapping) so every option is a real name
                 options=data['Continent'].dropna().unique(),
                 value='North America'),
    html.Br(),
    dcc.Dropdown(id='dropdown2',
                  options=[x for x in data.columns if data[x].dtype != 'O' and x != 'Rank'],
                  value='Number of Student'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown1', 'value'), Input('dropdown2', 'value'))
def plot(location, column):
    """Return a bar chart of the ten universities on a continent with the largest `column`.

    Parameters
    ----------
    location : str or None
        Continent picked in the first dropdown; None when cleared.
    column : str or None
        Column picked in the second dropdown; None when cleared.

    Raises
    ------
    PreventUpdate
        When either dropdown is empty (sorting by None would raise), so the
        previous figure stays on screen.
    """
    if location is None or column is None:
        raise PreventUpdate

    # Boolean mask instead of an f-string query: names containing quotes cannot break it
    df = (
        data[data['Continent'] == location]
        .sort_values(column, ascending=False)
        .head(10)
    )

    fig = px.bar(
        df,
        y='University name',
        x=column,
        text_auto=True,
        color='University name',
        title=f'Top 10 Universities in {location} with Highest {column}'
    ).update_layout(
        height=600)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8098)

In [None]:
#{column} in Different Continents ({metric}) 
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Selection:'),
    dcc.Dropdown(id='dropdown',
                options=[{'label':x.title(), 'value':x} for x in data.columns if data[x].dtype == 'int64' and x!='Rank'],
                value='Number of Student'),
    html.Br(),
    dcc.RadioItems(id='items',
                  options=[{'label':x.title(), 'value':x} for x in ['sum', 'mean', 'median']],
                  value='sum'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'), Input('items', 'value'))
def plot(column, metric):
    """Return a bar chart of `column` aggregated per continent, largest first.

    Parameters
    ----------
    column : str or None
        Integer column picked in the dropdown; None when the selection is cleared.
    metric : str or None
        Aggregation chosen in the radio items: 'sum', 'mean' or 'median'.

    Raises
    ------
    PreventUpdate
        When either input is missing, so the previous figure stays on screen.
    """
    if column is None or metric is None:
        raise PreventUpdate

    df = (
        data.groupby('Continent', as_index=False)
        .agg({column: metric})
        .sort_values(column, ascending=False)
    )

    fig = px.bar(
        df,
        y=column,
        x='Continent',
        text_auto=True,
        color='Continent',
        title=f'{column} in Different Continents ({metric}) '
    ).update_layout(
        # centre the title horizontally
        title={
            'x':0.5,
            'y':0.88,
            'xanchor':'center'},
        height=600
    ).update_traces(showlegend=False)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8099)