In [1]:



























import pandas as pd

In [2]:
# Read the raw ranking dataset.
complete_data = pd.read_csv("timesData.csv")

# Strip the leading '=' from the world_rank column (presumably a tie marker)
# and drop every row that has no value in the international_students column.
complete_data['world_rank'] = complete_data['world_rank'].str.lstrip('=')
complete_data = complete_data.dropna(axis=0, subset=['international_students'])

# Keep only rows whose rank is a single number: ranks containing '-' cannot be
# parsed as an integer and are removed.
complete_data = complete_data[~complete_data['world_rank'].str.contains("-", regex=False)]


def _to_numeric_if_possible(column):
    """Return `column` parsed as numbers, or unchanged if any value is not numeric.

    Explicit replacement for the deprecated ``pd.to_numeric(errors='ignore')``.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert every column that is entirely numeric so calculations work on it.
complete_data = complete_data.apply(_to_numeric_if_possible)

# Percentages are stored as text such as "25%"; strip the sign and store floats.
# rstrip('%') only removes a trailing percent sign, so a value without one is
# not truncated the way a blind [:-1] slice would truncate it.
complete_data['international_students'] = (
    complete_data['international_students'].str.rstrip('%').astype(float)
)

# Remove the thousands separators from num_students and store integers.
complete_data['num_students'] = (
    complete_data['num_students'].str.replace(',', '', regex=False).astype(int)
)

# Rows of the year 2016 with a world rank of at most 200.
latest_year = complete_data[(complete_data['year'] == 2016) & (complete_data['world_rank'] <= 200)]

# Show a preview instead of dumping the whole frame into the notebook.
complete_data.head(10)

Unnamed: 0,world_rank,university_name,country,teaching,international,research,citations,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152,8.9,25.0,,2011
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243,6.9,27.0,33 : 67,2011
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074,9.0,33.0,37 : 63,2011
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596,7.8,22.0,42 : 58,2011
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,-,94.2,7929,8.4,27.0,45 : 55,2011
5,6,University of Cambridge,United Kingdom,90.5,77.7,94.1,94.0,57.0,91.2,18812,11.8,34.0,46 : 54,2011
6,6,University of Oxford,United Kingdom,88.2,77.2,93.9,95.1,73.5,91.2,19919,11.6,34.0,46 : 54,2011
7,8,"University of California, Berkeley",United States of America,84.2,39.6,99.3,97.8,-,91.1,36186,16.4,15.0,50 : 50,2011
8,9,Imperial College London,United Kingdom,89.2,90.0,94.5,88.3,92.9,90.6,15060,11.7,51.0,37 : 63,2011
9,10,Yale University,United States of America,92.1,59.2,89.7,91.5,-,89.5,11751,4.4,20.0,50 : 50,2011


In [3]:
# Import the required functions and modules
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# Start notebook mode
init_notebook_mode(connected=True)

# NumPy is used by the helper functions in later cells (np.around)
import numpy as np


# Line trace: total score against world rank for the rows in latest_year
trace1 = go.Scatter(
    mode='lines',
    x=latest_year['world_rank'],
    y=latest_year['total_score'],
)
data = [trace1]

# Figure title and axis labels
layout = {
    'title': 'The total score for the top 200 universities of the most recent given year (2016)',
    'xaxis': {'title': 'Position in the world ranking'},
    'yaxis': {'title': 'Total score', 'ticklen': 6, 'tickwidth': 3},
}

# Plain-dict figure specification rendered inline in the notebook
fig = {'data': data, 'layout': layout}
iplot(fig, filename='basic-line')

In [4]:
# x positions 1..200, one per place in the top 200.
# NOTE(review): N is fixed at 200 entries while latest_year may hold fewer rows
# (see the printed note below), so bars are placed by row order, not by rank.
N = list(range(1, 201))

# Single bar trace with the percentage of international students per university
data = [go.Bar(y=latest_year['international_students'], x=N)]

# Percent-formatted y axis with a tick every 5 percentage points
layout = go.Layout(
    title='The percentage of international students per university in the top 200 for the latest given year (2016)',
    xaxis={'title': 'University ranking #'},
    yaxis={
        'title': 'Percentage international students',
        'tickmode': 'linear',
        'tick0': 0,
        'dtick': 5,
        'ticksuffix': "%",
        'ticklen': 6,
        'tickwidth': 3,
    },
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')
print('Note: The graph does not include 2 universities because of missing data.')

Note: The graph does not include 2 universities because of missing data.


In [5]:
# For the graphs and pie charts below, the universities of each year are divided into 4 groups of 50.


def get_mean(year, data=None):
    """Mean percentage of international students per group of 50 rows for one year.

    The rows of the requested year are split by position into four consecutive
    groups: rows 1-50, 51-100, 101-150 and 151 onwards. Grouping is positional,
    so it relies on the rows of a year being ordered by rank.

    Parameters
    ----------
    year : int
        Year to select; compared against the ``year`` column.
    data : pandas.DataFrame, optional
        Frame with ``year`` and ``international_students`` columns. Defaults to
        the notebook-level ``complete_data``.

    Returns
    -------
    numpy.ndarray
        The four group means, rounded to one decimal, first group first.
    """
    if data is None:
        data = complete_data
    shares = data.loc[data['year'] == year, 'international_students']

    # Slice bounds are zero-based and end-exclusive, so neighbouring groups must
    # share their boundary (50, 100, 150). Starting the next group one position
    # later silently drops the rows at positions 50, 100 and 150.
    bounds = [(0, 50), (50, 100), (100, 150), (150, None)]
    means = [shares.iloc[start:stop].mean() for start, stop in bounds]
    return np.around(means, 1)

In [6]:
# Years 2011 through 2016, reused as the x axis of the yearly charts
years = list(range(2011, 2017))

# One array of four group means per year, in the same order as `years`
means = [get_mean(ranking_year) for ranking_year in range(2011, 2017)]

# One bar trace per group of 50; each trace takes that group's mean from every year
trace1, trace2, trace3, trace4 = [
    go.Bar(
        name=label,
        x=years,
        y=[yearly_means[group] for yearly_means in means],
    )
    for group, label in enumerate(
        ['Ranked 1-50', 'Ranked 51-100', 'Ranked 101-150', 'Ranked 151-200']
    )
]

data = [trace1, trace2, trace3, trace4]

# Grouped bars with a percent-formatted y axis fixed to the 0-30 range
layout = go.Layout(
    barmode='group',
    title='The top 200 universities, divided in groups of 50, compared based on the percentage of international students',
    xaxis={'title': 'Year'},
    yaxis={
        'title': 'Percentage international students',
        'tickmode': 'linear',
        'tick0': 0,
        'dtick': 5,
        'ticklen': 6,
        'tickwidth': 3,
        'range': [0, 30],
        'ticksuffix': "%",
    },
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')



In [7]:
def average_intstudent(year, data=None):
    """Estimate the number of international students per group of 50 rows.

    For each positional group of the requested year (rows 1-50, 51-100, 101-150
    and 151 onwards) the total of ``num_students`` is multiplied by the group's
    mean ``international_students`` percentage. The mean is rounded to one
    decimal first, matching what ``get_mean`` returns. It is computed here on
    exactly the same rows as the student total so both always cover one group.

    Parameters
    ----------
    year : int
        Year to select; compared against the ``year`` column.
    data : pandas.DataFrame, optional
        Frame with ``year``, ``num_students`` and ``international_students``
        columns. Defaults to the notebook-level ``complete_data``.

    Returns
    -------
    list
        Four estimates, first group first.
    """
    if data is None:
        data = complete_data
    rows = data[data['year'] == year]

    # Zero-based, end-exclusive bounds that share their boundaries; bounds such
    # as [51:100] would skip the rows at positions 50, 100 and 150.
    bounds = [(0, 50), (50, 100), (100, 150), (150, None)]

    averages = []
    for start, stop in bounds:
        group = rows.iloc[start:stop]
        mean_share = np.around(group['international_students'].mean(), 1)
        averages.append(group['num_students'].sum() * (mean_share / 100))
    return averages

In [8]:
# One list of four group estimates per year, 2011 through 2016
averages = [average_intstudent(ranking_year) for ranking_year in range(2011, 2017)]

# One stacked line trace per group of 50; all traces share stack group 'one'
trace0, trace1, trace2, trace3 = [
    {
        'name': label,
        'x': years,
        'y': [yearly_estimates[group] for yearly_estimates in averages],
        'hoverinfo': 'x+y',
        'mode': 'lines',
        'stackgroup': 'one',
    }
    for group, label in enumerate(
        ['Ranked 1-50', 'Ranked 51-100', 'Ranked 101-150', 'Ranked 151-200']
    )
]
data = [trace0, trace1, trace2, trace3]

# y axis fixed to the 0 - 1,000,000 range
layout = go.Layout(
    title='The top 200 universities, divided in groups of 50, compared based on the amount of international students',
    xaxis={'title': 'Year'},
    yaxis={
        'title': 'International students',
        'range': [0, 1000000],
        'ticklen': 6,
        'tickwidth': 3,
    },
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked-area-plot-hover', validate=False)


In [9]:
def get_cumulative_intscore(year, data=None):
    """Sum the international-student percentages per group of 50 rows for one year.

    The rows of the requested year are split by position into four consecutive
    groups: rows 1-50, 51-100, 101-150 and 151 onwards.

    Parameters
    ----------
    year : int
        Year to select; compared against the ``year`` column.
    data : pandas.DataFrame, optional
        Frame with ``year`` and ``international_students`` columns. Defaults to
        the notebook-level ``complete_data``.

    Returns
    -------
    numpy.ndarray
        The four group sums, rounded to whole numbers, first group first.
    """
    if data is None:
        data = complete_data
    shares = data.loc[data['year'] == year, 'international_students']

    # Zero-based, end-exclusive bounds that share their boundaries; bounds such
    # as [51:100] would leave the rows at positions 50, 100 and 150 out of the sums.
    bounds = [(0, 50), (50, 100), (100, 150), (150, None)]
    scores = [shares.iloc[start:stop].sum() for start, stop in bounds]
    return np.around(scores)

# One array of four group scores per year, 2011 through 2016
intscores = [get_cumulative_intscore(ranking_year) for ranking_year in range(2011, 2017)]

In [10]:
x = years
# Yearly total: the four group scores of each year added together
y = [sum(yearly_scores) for yearly_scores in intscores]

# Single bar trace that prints each total on its bar
data = [
    go.Bar(
        x=x,
        y=y,
        text=y,
        textposition='auto',
        opacity=0.6,
        marker={
            'color': 'rgb(158,202,225)',
            'line': {'color': 'rgb(8,48,107)', 'width': 1.5},
        },
    )
]

# y axis fixed to the 0-4000 range
layout = go.Layout(
    title='Cumulative international score for each year',
    xaxis={'title': 'Year'},
    yaxis={
        'title': 'Cumulative international score',
        'range': [0, 4000],
        'ticklen': 6,
        'tickwidth': 3,
    },
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='bar-direct-labels')

In [11]:
# Six donut charts, one per entry of intscores, laid out on a 2 x 3 grid.
# Chart number k sits in grid column k % 3 and grid row k // 3; the year label
# in the middle of each donut is placed with the matching x / y coordinates.
fig = {
    "data": [
        {
            "type": "pie",
            "hole": .7,
            "values": [intscores[chart][group] for group in range(4)],
            "labels": [
                "Ranked 1-50",
                "Ranked 51-100",
                "Ranked 101-150",
                "Ranked 151-200"
            ],
            "domain": {"column": chart % 3, "row": chart // 3},
            # NOTE(review): the first chart has an empty trace name while the
            # other five are named - kept as-is; confirm whether intentional.
            "name": "" if chart == 0 else "International Score",
            "textposition": "outside",
            "hoverinfo": "label+percent+name",
        }
        for chart in range(6)
    ],

    "layout": {
        "title": "Percentage of the total international score per year per group of 50 universities",
        "grid": {"rows": 2, "columns": 3},
        "height": 700,
        "width": 900,
        "legend": {"x": 1, "y": .5},
        "annotations": [
            {
                "font": {"size": 20},
                "showarrow": False,
                "text": str(2011 + chart),
                "x": (0.1135, 0.5, 0.887)[chart % 3],
                "y": (0.797, 0.215)[chart // 3],
            }
            for chart in range(6)
        ],
    },
}
iplot(fig, filename='donut')