In [9]:
from plotly.offline import init_notebook_mode, iplot
from sklearn import datasets, linear_model
import plotly.graph_objs as go
import pandas as pd
import numpy as np
import json

In [10]:
# Read the raw rankings CSV.
complete_data = pd.read_csv("timesData.csv")

# Strip the leading '=' from the world_rank column
# and drop every row that has no value in the international_students column.
complete_data['world_rank'] = complete_data['world_rank'].str.lstrip('=')
complete_data = complete_data.dropna(axis=0, subset=['international_students'])

# Keep only rows whose world_rank does not contain '-', so the column can be parsed as a number.
complete_data = complete_data[~complete_data.world_rank.str.contains("-")]


def _to_numeric_if_possible(column):
    """Return `column` parsed as numbers, or unchanged when any value fails to parse.

    Explicit replacement for the deprecated `pd.to_numeric(..., errors='ignore')`.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert every column that is fully numeric so calculations can be done on it.
complete_data = complete_data.apply(_to_numeric_if_possible)

# Remove the trailing percent sign; rstrip leaves a value without '%' intact.
complete_data['international_students'] = (
    complete_data['international_students'].str.rstrip('%').astype(float)
)

# NOTE(review): a missing international score ('-') is stored as 0, which lowers every
# mean computed over this column later on - consider using NaN instead.
complete_data.loc[complete_data['international'] == '-', 'international'] = 0
complete_data['international'] = complete_data['international'].astype(float)

# Remove the thousands separators from the num_students column.
complete_data['num_students'] = complete_data['num_students'].str.replace(',', '').astype(int)

# Show a preview rather than dumping the whole frame into the notebook output.
complete_data.head(10)

Unnamed: 0,world_rank,university_name,country,teaching,international,research,citations,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152,8.9,25.0,,2011
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243,6.9,27.0,33 : 67,2011
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074,9.0,33.0,37 : 63,2011
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596,7.8,22.0,42 : 58,2011
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,-,94.2,7929,8.4,27.0,45 : 55,2011
5,6,University of Cambridge,United Kingdom,90.5,77.7,94.1,94.0,57.0,91.2,18812,11.8,34.0,46 : 54,2011
6,6,University of Oxford,United Kingdom,88.2,77.2,93.9,95.1,73.5,91.2,19919,11.6,34.0,46 : 54,2011
7,8,"University of California, Berkeley",United States of America,84.2,39.6,99.3,97.8,-,91.1,36186,16.4,15.0,50 : 50,2011
8,9,Imperial College London,United Kingdom,89.2,90.0,94.5,88.3,92.9,90.6,15060,11.7,51.0,37 : 63,2011
9,10,Yale University,United States of America,92.1,59.2,89.7,91.5,-,89.5,11751,4.4,20.0,50 : 50,2011


In [11]:
def get_meantop200(year, column, country_divided = False):
    """Average `column` over the first 200 rows of `complete_data` for one year.

    Parameters
    ----------
    year : value matched against the 'year' column.
    column : name of the column to average.
    country_divided : when True, average per country instead of per rank group.

    Returns
    -------
    When `country_divided` is True: a dict mapping each country in the
    selection to the (unrounded) mean of `column` for that country.
    Otherwise: a NumPy array with four means, one for each consecutive block
    of 50 rows (rows 0-49, 50-99, 100-149, 150-199), rounded to one decimal.
    """
    selected = complete_data[complete_data['year'] == year][:200]

    if country_divided:
        return {
            country: selected.loc[selected['country'] == country, column].mean()
            for country in set(selected['country'])
        }

    values = selected[column]
    # Contiguous blocks of 50: the previous [51:100], [101:150], [151:200]
    # slices silently dropped rows 50, 100 and 150.
    means = [values.iloc[start:start + 50].mean() for start in range(0, 200, 50)]
    return np.around(means, 1)

In [12]:
# One entry per year (2011-2016); each entry holds the four rank-group means of that year.
means = [
    get_meantop200(year, 'international_students', country_divided=False)
    for year in range(2011, 2017)
]

# Average every rank group over all six years (column-wise).
# The previous loop summed means[i][j] over j, i.e. it averaged the four rank
# groups of the first four years, while the chart presents the values per rank group.
mean_grouped = np.mean(means, axis=0).tolist()

In [13]:
# Enable plotly's notebook rendering before the first figure is drawn.
init_notebook_mode(connected=True)

# Bar chart: one bar per rank group, labelled with its value rounded to two decimals.
group_labels = ['Rang 1-50', 'Rang 51-100', 'Rang 101-150', 'Rang 151-200']
group_values = [mean_grouped[0], mean_grouped[1], mean_grouped[2], mean_grouped[3]]

bar_outline = dict(color='rgb(8,48,107)', width=3.5)
bar_trace = go.Bar(
    x=group_labels,
    y=group_values,
    marker=dict(color='rgb(0,0,255)', line=bar_outline),
    text=np.around(group_values, 2),
    textposition='auto',
)
data = [bar_trace]

percentage_axis = go.layout.YAxis(
    tickmode='linear',
    ticksuffix="%",
    ticklen=6,
    tickwidth=3,
    title='Percentage internationale studenten',
)
group_axis = go.layout.XAxis(title='Groepen op ranglijst')

layout = go.Layout(
    title='Gemiddelde percentage internationale studenten op de top 200 universiteiten, per groep van 50, over 2011-2016',
    yaxis=percentage_axis,
    xaxis=group_axis,
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')

In [14]:
# Mean percentage of international students for every world rank 1..200, rounded to
# one decimal. It is computed here from complete_data so this cell does not depend on
# a variable that is only created in a later cell (which fails on Restart & Run All).
rank_means = (
    complete_data.groupby('world_rank')['international_students']
    .mean()
    .reindex(range(1, 201))  # ranks without any row become NaN
)
box_values = np.around(rank_means.values, 1)

layout = go.Layout(
    title = 'De spreiding van het percentage internationale studenten binnen de top 200, verdeeld in groepen van 50',
    yaxis = go.layout.YAxis(
        ticksuffix = "%",
        ticklen= 6,
        tickwidth= 3,
        title = 'Percentage internationale studenten'
    ),
)

# (label, first index) of each group of 50 ranks. Contiguous slices are used because
# the previous [51:100], [101:150], [151:200] slices dropped ranks 51, 101 and 151.
rank_groups = [
    ('Rang 1-50', 0),
    ('Rang 51-100', 50),
    ('Rang 101-150', 100),
    ('Rang 151-200', 150),
]

data = [
    go.Box(
        y=box_values[start:start + 50],
        boxpoints='all',
        jitter=0.3,
        pointpos=-1.8,
        name=label
    )
    for label, start in rank_groups
]

fig = go.Figure(data=data, layout=layout)
# Plot and embed in the notebook
iplot(fig, filename='boxplots')

In [15]:
def get_meanitem(column):
    """Return the mean of `column` for each world rank 1..200.

    For every rank the rows of `complete_data` with that `world_rank` are
    averaged (a rank without rows yields NaN). The result is a NumPy array
    of 200 values rounded to one decimal; index 0 corresponds to rank 1.
    """
    ranks = complete_data['world_rank']
    per_rank = [
        complete_data[ranks == rank][column].mean()
        for rank in range(1, 201)
    ]
    return np.around(per_rank, 1)


# Per-rank means (ranks 1-200) of the columns used by the plots.
int_students = get_meanitem(column='international_students')
int_score = get_meanitem(column='international')
total_score = get_meanitem(column='total_score')

In [16]:
# Fit a linear regression of the international-student percentage (y)
# on the international score (x), using the per-rank means of ranks 1-200.
regr = linear_model.LinearRegression()
reshaped_score = int_score[:200].reshape(-1, 1)
reshaped_students = int_students[:200].reshape(-1, 1)
regr.fit(reshaped_score, reshaped_students)

# One marker trace per group of 50 ranks. Contiguous slices are used because the
# previous [51:100], [101:150], [151:200] slices dropped one point per group.
rank_groups = [
    ('Ranked 1-50', 0),
    ('Ranked 51-100', 50),
    ('Ranked 101-150', 100),
    ('Ranked 151-200', 150),
]
trace1, trace2, trace3, trace4 = [
    go.Scatter(
        x=int_score[start:start + 50],
        y=int_students[start:start + 50],
        mode='markers',
        name=label
    )
    for label, start in rank_groups
]

# Draw the regression line from the fitted model. The previous hard-coded line paired
# 80 x values (20..99) with 95 y values (0.47 * 5..99), so it was misaligned and
# ignored the fit computed above.
line_x = np.arange(20, 100)
line_y = regr.predict(line_x.reshape(-1, 1)).ravel()
trace5 = go.Scatter(
    x=line_x,
    y=line_y,
    mode="lines",
    line=dict(color="purple", width=3),
    name="Regression"
)

layout = go.Layout(
    title = 'Verhouding tussen het percentage internationale studenten en internationale score bij de top 200 universiteiten',
    yaxis = go.layout.YAxis(
        tick0 = 5,
        dtick = 5,
        ticksuffix = "%",
        ticklen= 6,
        tickwidth= 3,
        range = [5,36],
        title = 'Percentage internationale studenten'
    ),
    xaxis = go.layout.XAxis(
        title = 'Internationale score',
        range = [30, 100]
    ),
)

data = [trace1, trace2, trace3, trace4, trace5]
fig = go.Figure(data=data, layout=layout)

# Plot and embed in the notebook
iplot(fig, filename='basic-scatter')




In [17]:
def groupby_mean(attribute1, attribute2 = 0, group = False, column = 0):
    """Group `complete_data` and return the per-group means plus a row count.

    Parameters
    ----------
    attribute1 : column name to group by.
    attribute2 : optional second column name to group by; 0 means "not given".
    group : when true, sort the result ascending by `column`.
    column : name of the column to sort by; only used when `group` is true.

    Returns
    -------
    DataFrame indexed by the grouping key(s), holding the mean of every
    numeric column and a 'count' column with the number of rows per group.
    """
    keys = [attribute1] if attribute2 == 0 else [attribute1, attribute2]
    grouped = complete_data.groupby(keys)

    # numeric_only=True: without it pandas 2.x raises a TypeError on the
    # string columns instead of silently dropping them as older versions did.
    country_stats = grouped.mean(numeric_only=True)
    country_stats['count'] = grouped.size()

    if group:
        country_stats = country_stats.sort_values(by=[column])

    return country_stats

In [18]:
# Per-country means, sorted ascending by the percentage of international students.
grouped = groupby_mean('country', 0, group = True, column = 'international_students')

bar_outline = dict(color='rgb(8,48,107)', width=3.5)
country_bars = go.Bar(
    x=grouped.index,
    y=grouped['international_students'],
    marker=dict(color='rgb(40, 180, 25)', line=bar_outline),
)
data = [country_bars]

# Both axis titles share one font definition; each axis receives its own copy.
axis_title_font = dict(family='Arial, sans-serif', size=20, color='black')

percentage_axis = go.layout.YAxis(
    tickmode='linear',
    ticksuffix="%",
    tick0=0,
    dtick=5,
    ticklen=6,
    tickwidth=3,
    automargin=True,
    title='Percentage internationale studenten',
    titlefont=dict(axis_title_font),
)
country_axis = dict(
    automargin=True,
    title='Land',
    titlefont=dict(axis_title_font),
    showticklabels=True,
    tickangle=45,
    tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
)

layout = go.Layout(
    title='Gemiddelde percentage internationale studenten per land over de jaren 2011-2016',
    yaxis=percentage_axis,
    xaxis=country_axis,
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')

In [19]:
# Means per (year, country) pair, left in group order (no sorting).
country_stats = groupby_mean(attribute1='year', attribute2='country', group=False)

def choro(raw, year, statistic, unit, zmax, rv):
    """Build one choropleth trace from a frame indexed by country name.

    Parameters
    ----------
    raw : frame whose index holds the locations and whose `statistic`
        column holds the values to colour by (cast to float).
    year : stored in the trace's `customdata` as a one-element list.
    statistic : name of the column in `raw` to plot.
    unit : title shown on the colour bar.
    zmax : upper bound of the colour range (the lower bound is fixed at 20).
    rv : passed through as `reversescale`.
    """
    country_border = go.choropleth.marker.Line(color='rgb(0,0,0)', width=.25)
    blue_to_red = [[0, 'rgb(50,50,255)'], [1, 'rgb(255,50,50)']]

    return go.Choropleth(
        locations=raw.index,
        locationmode='country names',
        z=raw[statistic].astype(float),
        zmin=20,
        zmax=zmax,
        colorscale=blue_to_red,
        autocolorscale=False,
        reversescale=rv,
        marker=go.choropleth.Marker(line=country_border),
        colorbar=go.choropleth.ColorBar(title=unit),
        customdata=[year],
    )

def worldmap(statistic, title, unit, zmax, rv):
    """Plot a world choropleth of `statistic` with a slider to switch between years.

    One trace is built per value in the first index level of `country_stats`
    (the years), and each slider step shows exactly one of them.

    Parameters
    ----------
    statistic : column of `country_stats` to plot.
    title : figure title.
    unit : colour bar title.
    zmax : upper bound of the colour range.
    rv : whether to reverse the colour scale.
    """
    years = list(country_stats.index.levels[0])
    data = [choro(country_stats.loc[year], year, statistic, unit, zmax, rv) for year in years]

    steps = []
    for i, (year, trace) in enumerate(zip(years, data)):
        # Only the first trace starts visible, so the initial map matches the slider's
        # initial position (active=0). Previously all traces were drawn on top of
        # each other until the slider was moved.
        trace.visible = (i == 0)

        visibility = [False] * len(data)
        visibility[i] = True
        steps.append(dict(method='restyle',
                          args=['visible', visibility],
                          label='Jaar {}'.format(year)))

    sliders = [dict(active=0,
                    pad={"t": 1},
                    steps=steps)]

    layout = go.Layout(
        title = go.layout.Title(
            text = title
        ),
        geo = go.layout.Geo(
            scope = 'world',
            projection = go.layout.geo.Projection(type = 'equirectangular'),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
        sliders = sliders
    )

    fig = go.Figure(data = data, layout = layout)
    iplot(fig)

# World map of the mean international score per country, with the colour range capped at 100.
worldmap(
    statistic='international',
    title='Gemiddelde internationale score per land',
    unit='Gemiddelde score',
    zmax=100,
    rv=False,
)

In [20]:
def filter_yearly(year, column, sort = True):
    """Return the rows of `country_stats` for `year`.

    When `sort` is True the rows are ordered ascending by `column`;
    otherwise they are returned in their existing order.
    """
    yearly = country_stats.loc[year]
    return yearly.sort_values(by=[column]) if sort == True else yearly

def dotplot_scatter(x_values, y_values, name, visible):
    """Return a plotly scatter-trace dict drawing blue markers of size 12.

    `x_values` and `y_values` are stored as given; `name` labels the trace
    and `visible` sets its initial visibility.
    """
    marker_style = dict(color="blue", size=12)
    return dict(
        x=x_values,
        y=y_values,
        marker=marker_style,
        mode="markers",
        name=name,
        type="scatter",
        visible=visible,
    )

def make_all_scatters(startyear, endyear, activeyear):
    """Build one dot-plot trace per year from `startyear` through `endyear` (inclusive).

    Each trace plots the 'international' column of that year's rows (x)
    against their index (y), sorted by 'international'. Only the trace of
    `activeyear` is initially visible.

    Returns
    -------
    list of trace dicts, in year order.
    """
    traces = []
    for year in range(startyear, endyear + 1):
        # Filter and sort once per year; the two former branches each did it twice
        # and differed only in the `visible` flag.
        yearly = filter_yearly(year, 'international')
        traces.append(dotplot_scatter(yearly['international'],
                                      yearly.index,
                                      name = str(year),
                                      visible = bool(year == activeyear)))
    return traces

In [21]:
# Years covered by the dot plot; the first year is the one shown initially.
first_year, last_year = 2011, 2016
plot_years = list(range(first_year, last_year + 1))

data = make_all_scatters(first_year, last_year, first_year)

axis_title_font = dict(family='Arial, sans-serif', size=20, color='black')

# The traces put the score on x and the country on y, so the axis titles follow
# that orientation (they were previously swapped).
layout = go.Layout(
    autosize=False,
    width=800,
    height=800,
    title = 'Gemiddelde internationale score voor elk land per jaar',
    yaxis = go.layout.YAxis(
        ticklen= 6,
        tickwidth= 3,
        automargin=True,
        title = 'Land',
        titlefont=dict(axis_title_font),
    ),
    xaxis=dict(
        automargin=True,
        title = 'Internationale score',
        titlefont=dict(axis_title_font),
        showticklabels=True,
        tickfont=dict(
            family='Old Standard TT, serif',
            size=14,
            color='black')
    )
)

# One button per year; pressing it shows that year's trace and hides all the others.
year_buttons = [
    dict(
        args=['visible', [year == shown_year for year in plot_years]],
        label=str(shown_year),
        method='restyle'
    )
    for shown_year in plot_years
]

updatemenus = [
    dict(
        buttons=year_buttons,
        direction = 'left',
        pad = {'r': 10, 't': 10},
        showactive = True,
        type = 'buttons',
        x = 0.1,
        xanchor = 'left',
        y = 1.08,
        yanchor = 'top',
        bgcolor = '#E2D3D0',
        bordercolor = '#FFFFFF',
        font = dict(size=18)
    ),
]

layout['updatemenus'] = updatemenus

fig = dict(data=data, layout=layout)
iplot(fig, filename='basic_dot-plot')
