In [8]:
import pandas as pd

In [9]:
# Read the ranking dataset from CSV.
complete_data = pd.read_csv("timesData.csv")

# Strip the leading '=' character from the world_rank column and drop every
# row that has no value in the international_students column. Rows without
# num_students are dropped as well, because that column is cast to int below
# and a NaN would make the cast fail.
complete_data['world_rank'] = complete_data['world_rank'].str.lstrip('=')
complete_data = complete_data.dropna(axis=0, subset=['international_students', 'num_students'])

# Keep only rows whose rank does not contain a '-', so that world_rank can be
# converted to a number and compared further down.
complete_data = complete_data[~complete_data['world_rank'].str.contains("-")]


def _to_numeric_if_possible(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged on failure.

    Same result as the deprecated ``pd.to_numeric(column, errors='ignore')``:
    a column is converted only when every value in it can be parsed.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert every column that can be converted to a numeric dtype so the values
# can be used in calculations.
complete_data = complete_data.apply(_to_numeric_if_possible)

# Remove the percent sign so the percentages can be used as floats.
# rstrip('%') only removes a trailing '%', whereas slicing off the last
# character would also eat a digit from a value without a percent sign.
complete_data['international_students'] = (
    complete_data['international_students'].str.rstrip('%').astype(float)
)

# Remove the thousands separators (commas) from the num_students column.
complete_data['num_students'] = (
    complete_data['num_students'].str.replace(',', '', regex=False).astype(int)
)

# Select the entries for the year 2016 with a world rank of at most 200.
latest_year = complete_data[(complete_data['year'] == 2016) & (complete_data['world_rank'] <= 200)]

complete_data.head()

Unnamed: 0,world_rank,university_name,country,teaching,international,research,citations,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152,8.9,25.0,,2011
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243,6.9,27.0,33 : 67,2011
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074,9.0,33.0,37 : 63,2011
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596,7.8,22.0,42 : 58,2011
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,-,94.2,7929,8.4,27.0,45 : 55,2011


In [10]:
# Import the plotting functions and modules that are needed.
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# numpy is imported here as well, although nothing in this cell uses it.
import numpy as np

# Start plotly's notebook mode.
init_notebook_mode(connected=True)

# Line trace: total score against world rank for the 2016 selection.
trace1 = go.Scatter(
    mode='lines',
    x=latest_year['world_rank'],
    y=latest_year['total_score'],
)
data = [trace1]

# Figure title and axis titles.
layout = {
    'title': 'The total score for the top 200 of the most recent year (2016)',
    'xaxis': {'title': 'Position in the world ranking'},
    'yaxis': {'title': 'Total score'},
}

fig = {'data': data, 'layout': layout}
iplot(fig, filename='basic-line')

In [11]:
# One x position (1, 2, 3, ...) per plotted university. The positions are
# derived from the number of rows in latest_year instead of a hard-coded
# 1..200, so x and y always have the same length even when the filtered frame
# does not contain exactly 200 rows.
N = list(range(1, len(latest_year) + 1))

# Bar height is the percentage of international students of each university.
data = [go.Bar(
            x=N,
            y=latest_year['international_students']
    )]


layout = go.Layout(
    title = 'The percentage of international students per university in the top 200 for the latest year (2016)',
    yaxis = go.layout.YAxis(
        # A tick every 5 percentage points, starting at 0, labelled with '%'.
        tickmode = 'linear',
        tick0 = 0,
        dtick = 5,
        ticksuffix = "%",
        title = 'Percentage international students'
    ),
    xaxis = go.layout.XAxis(
        title = 'University ranking #'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')

In [12]:
# For the later graphs every year is divided into 4 groups of 50 universities
# (positions 1-50, 51-100, 101-150 and 151 onwards).
# First a separate dataframe is made for every year, then the mean percentage
# of international students of every group is collected in one list per year.


def international_means_per_group(year_frame, group_size=50, group_count=4):
    """Return the mean ``international_students`` value of each group of rows.

    The rows of ``year_frame`` are split by position into ``group_count``
    consecutive groups of ``group_size`` rows: 0-49, 50-99, 100-149, ...
    The last group is open-ended, so any rows beyond
    ``group_size * group_count`` are counted in it rather than dropped.

    Positional ``.iloc`` slices with shared boundaries are used so that no row
    between two groups is skipped.

    NOTE(review): the grouping is by row position, so it assumes the rows of
    each year are ordered by world rank -- confirm against the CSV.

    Returns a list with ``group_count`` means (NaN for an empty group).
    """
    percentages = year_frame['international_students']
    means = []
    for group in range(group_count):
        start = group * group_size
        stop = None if group == group_count - 1 else start + group_size
        means.append(percentages.iloc[start:stop].mean())
    return means


# One dataframe per year; 2016 reuses the selection made earlier.
year1 = complete_data[complete_data['year'] == 2011]
year2 = complete_data[complete_data['year'] == 2012]
year3 = complete_data[complete_data['year'] == 2013]
year4 = complete_data[complete_data['year'] == 2014]
year5 = complete_data[complete_data['year'] == 2015]
year6 = latest_year

# Mean percentage of international students per group, one list per year.
year1_means = international_means_per_group(year1)  # 2011
year2_means = international_means_per_group(year2)  # 2012
year3_means = international_means_per_group(year3)  # 2013
year4_means = international_means_per_group(year4)  # 2014
year5_means = international_means_per_group(year5)  # 2015
year6_means = international_means_per_group(year6)  # 2016
              


In [13]:
# The years on the x-axis, kept in one list for readability.
years = list(range(2011, 2017))

# Per-year group means in chronological order, and the legend label of each group.
yearly_means = [year1_means, year2_means, year3_means, year4_means, year5_means, year6_means]
group_names = ['Ranked 1-50', 'Ranked 51-100', 'Ranked 101-150', 'Ranked 151-200']

# One bar trace per rank group; each trace holds that group's mean for every year.
trace1, trace2, trace3, trace4 = [
    go.Bar(
        x=years,
        y=[means[position] for means in yearly_means],
        name=label,
    )
    for position, label in enumerate(group_names)
]
data = [trace1, trace2, trace3, trace4]

# Percentage axis: fixed 0-30 range with a labelled tick every 5 points.
percentage_axis = go.layout.YAxis(
    tickmode='linear',
    tick0=0,
    dtick=5,
    ticklen=6,
    tickwidth=3,
    range=[0, 30],
    ticksuffix="%",
    title='Percentage international students',
)
year_axis = go.layout.XAxis(title='Year')

layout = go.Layout(
    barmode='group',
    title='Comparison between 4 equally divided groups for every 50 rankings on the amount of international students',
    yaxis=percentage_axis,
    xaxis=year_axis,
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')



In [14]:
# Calculates, per group of 50 universities in the rankings, the sum of all
# students and multiplies that value by the average percentage of
# international students for that group.
# The values are then put into a list for a specific year.


def estimated_international_students(year_frame, group_size=50, group_count=4):
    """Return the estimated number of international students per group of rows.

    The rows of ``year_frame`` are split by position into ``group_count``
    consecutive groups of ``group_size`` rows (0-49, 50-99, 100-149, ...);
    the last group is open-ended. For every group the estimate is

        sum(num_students) * mean(international_students) / 100

    The sum and the mean are taken over exactly the same rows, and the slices
    share their boundaries so that no row between two groups is skipped.

    NOTE(review): multiplying the total by the mean percentage is an
    approximation; it differs from summing num_students * percentage per
    university when university sizes vary within a group.

    Returns a list with ``group_count`` values (NaN for an empty group).
    """
    estimates = []
    for group in range(group_count):
        start = group * group_size
        stop = None if group == group_count - 1 else start + group_size
        rows = year_frame.iloc[start:stop]
        mean_fraction = rows['international_students'].mean() / 100
        estimates.append(rows['num_students'].sum() * mean_fraction)
    return estimates


raw_Y1 = estimated_international_students(year1)  # 2011
raw_Y2 = estimated_international_students(year2)  # 2012
raw_Y3 = estimated_international_students(year3)  # 2013
raw_Y4 = estimated_international_students(year4)  # 2014
raw_Y5 = estimated_international_students(year5)  # 2015
raw_Y6 = estimated_international_students(year6)  # 2016

In [15]:

# Per-year estimates in chronological order, and the legend label of each group.
yearly_counts = [raw_Y1, raw_Y2, raw_Y3, raw_Y4, raw_Y5, raw_Y6]
area_names = ['Ranked 1-50', 'Ranked 51-100', 'Ranked 101-150', 'Ranked 151-200']

# One line trace per rank group; all traces share stackgroup 'one'.
trace0, trace1, trace2, trace3 = [
    {
        'x': years,
        'y': [counts[position] for counts in yearly_counts],
        'hoverinfo': 'x+y',
        'mode': 'lines',
        'stackgroup': 'one',
        'name': label,
    }
    for position, label in enumerate(area_names)
]
data = [trace0, trace1, trace2, trace3]

# Axis configuration: fixed 0 - 1,000,000 range for the student counts.
count_axis = go.layout.YAxis(
    range=[0, 1000000],
    title='International students',
)
time_axis = go.layout.XAxis(title='Year')

layout = go.Layout(
    title='The amount of international students in numbers per group of 50 universities',
    yaxis=count_axis,
    xaxis=time_axis,
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked-area-plot-hover', validate=False)
