In [2]:
import pandas as pd

In [3]:
# Load the raw ranking data from the CSV file. The untouched frame is kept
# under its own name so the cleaning steps below can be re-run safely.
raw_data = pd.read_csv("timesData.csv")


def _to_numeric_if_possible(column):
    """Return ``column`` converted to a numeric dtype, or unchanged if parsing fails.

    Stands in for ``pd.to_numeric(column, errors='ignore')``, which pandas has
    deprecated: a column is converted only when every value parses.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Strip the leading '=' character from the world_rank column (presumably a
# tie marker) and remove all rows with NaN in the international_students
# column. `.str.lstrip` leaves missing ranks as NaN instead of raising.
complete_data = raw_data.assign(world_rank=raw_data['world_rank'].str.lstrip('='))
complete_data = complete_data.dropna(axis=0, subset=['international_students'])

# Drop every row whose rank still contains a '-' (these cannot be parsed as a
# single number), then convert each column that is fully numeric so the values
# can be used in calculations.
complete_data = complete_data[
    ~complete_data['world_rank'].str.contains('-', regex=False, na=False)
]
complete_data = complete_data.apply(_to_numeric_if_possible)

# Remove the trailing percent sign from international_students and store the
# value as a float. `rstrip('%')` only removes a '%', so a value without one
# does not lose its last digit.
complete_data['international_students'] = (
    complete_data['international_students'].str.rstrip('%').astype(float)
)

# Keep only the 2016 rows whose rank is at most 200.
latest_year = complete_data[(complete_data['year'] == 2016) & (complete_data['world_rank'] <= 200)]

# Show a preview rather than the entire frame.
latest_year.head(10)

Unnamed: 0,world_rank,university_name,country,teaching,international,research,citations,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year
1803,1,California Institute of Technology,United States of America,95.6,64.0,97.6,99.8,97.8,95.2,2243,6.9,27.0,33 : 67,2016
1804,2,University of Oxford,United Kingdom,86.5,94.4,98.9,98.8,73.1,94.2,19919,11.6,34.0,46 : 54,2016
1805,3,Stanford University,United States of America,92.5,76.3,96.2,99.9,63.3,93.9,15596,7.8,22.0,42 : 58,2016
1806,4,University of Cambridge,United Kingdom,88.2,91.5,96.7,97.0,55.0,92.8,18812,11.8,34.0,46 : 54,2016
1807,5,Massachusetts Institute of Technology,United States of America,89.4,84.0,88.6,99.7,95.4,92.0,11074,9.0,33.0,37 : 63,2016
1808,6,Harvard University,United States of America,83.6,77.2,99.0,99.8,45.2,91.6,20152,8.9,25.0,,2016
1809,7,Princeton University,United States of America,85.1,78.5,91.9,99.3,52.1,90.1,7929,8.4,27.0,45 : 55,2016
1810,8,Imperial College London,United Kingdom,83.3,96.0,88.5,96.7,53.7,89.1,15060,11.7,51.0,37 : 63,2016
1811,9,ETH Zurich – Swiss Federal Institute of Techno...,Switzerland,77.0,97.9,95.0,91.1,80.0,88.3,18178,14.7,37.0,31 : 69,2016
1812,10,University of Chicago,United States of America,85.7,65.0,88.9,99.2,36.6,87.9,14221,6.9,21.0,42 : 58,2016


In [4]:
# Import the required functions and modules
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# Switch plotly to notebook mode so figures render inline
init_notebook_mode(connected=True)

# numpy is imported here, although nothing in this cell uses it
import numpy as np


# Line trace: total score (y) against world rank (x) for the 2016 selection
trace1 = go.Scatter(
    mode='lines',
    x=latest_year['world_rank'],
    y=latest_year['total_score'],
)

data = [trace1]

# Figure title and axis titles
layout = {
    'title': 'The total score for the top 200 of the most recent year (2016)',
    'xaxis': {'title': 'Position in the world ranking'},
    'yaxis': {'title': 'Total score'},
}

fig = {'data': data, 'layout': layout}
iplot(fig, filename='basic-line')

In [5]:
# One bar position per row that is actually present in latest_year. The row
# count is not guaranteed to be exactly 200 (rows with missing data were
# dropped earlier), so the positions are derived from the frame itself instead
# of being hard-coded to 1..200.
N = list(range(1, len(latest_year) + 1))

# Bar trace: percentage of international students for each university, in the
# row order of latest_year.
data = [go.Bar(
            x=N,
            y=latest_year['international_students']
    )]


# The y axis ticks every 5 units and appends '%' to each tick label.
layout = go.Layout(
    title = 'The percentage of international students per university in the top 200 for the latest year (2016)',
    yaxis = go.layout.YAxis(
        tickmode = 'linear',
        tick0 = 0,
        dtick = 5,
        ticksuffix = "%",
        title = 'Percentage international students'
    ),
    xaxis = go.layout.XAxis(
        title = 'University ranking #'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')

In [6]:
# For the later charts, every year is divided into four bands of 50 ranks and
# the mean percentage of international students is computed per band.
#
# Rows are selected by their world_rank value, not by their position in the
# frame. Positional slices such as [51:100] skip the rows at positions 50, 100
# and 150, and they drift away from the real rank as soon as rows have been
# dropped from the data.
RANK_BANDS = [(1, 50), (51, 100), (101, 150), (151, 200)]


def band_means(year_frame):
    """Return the mean of ``international_students`` for each rank band.

    Parameters
    ----------
    year_frame : pandas.DataFrame
        Rows to summarise. Must contain a numeric ``world_rank`` column and a
        numeric ``international_students`` column.

    Returns
    -------
    list
        One mean per entry of ``RANK_BANDS``, in the same order. Both ends of
        a band are inclusive. A band without any rows yields NaN.
    """
    ranks = year_frame['world_rank']
    return [
        year_frame.loc[ranks.between(low, high), 'international_students'].mean()
        for low, high in RANK_BANDS
    ]


#2011
year1 = complete_data[complete_data['year'] == 2011]
year1_means = band_means(year1)

#2012
year2 = complete_data[complete_data['year'] == 2012]
year2_means = band_means(year2)

#2013
year3 = complete_data[complete_data['year'] == 2013]
year3_means = band_means(year3)

#2014
year4 = complete_data[complete_data['year'] == 2014]
year4_means = band_means(year4)

#2015
year5 = complete_data[complete_data['year'] == 2015]
year5_means = band_means(year5)

#2016 (already filtered on year and rank in an earlier cell)
year6 = latest_year
year6_means = band_means(year6)
              


In [7]:
# The years shown on the x axis: 2011 up to and including 2016
years = list(range(2011, 2017))

# Per-year lists of group means, in the same order as `years`
means_per_year = [year1_means, year2_means, year3_means, year4_means, year5_means, year6_means]

# Legend labels; the position of a label is the index of its group inside
# each per-year list
group_labels = ['Ranked 1-50', 'Ranked 51-100', 'Ranked 101-150', 'Ranked 151-200']

# One bar trace per group, holding that group's mean for every year
trace1, trace2, trace3, trace4 = [
    go.Bar(
        x=years,
        y=[year_means[group] for year_means in means_per_year],
        name=label,
    )
    for group, label in enumerate(group_labels)
]

data = [trace1, trace2, trace3, trace4]

# Settings for the percentage axis: a tick every 5 units, fixed 0-30 range
percentage_axis = dict(
    tickmode='linear',
    tick0=0,
    dtick=5,
    ticklen=6,
    tickwidth=3,
    range=[0, 30],
    ticksuffix="%",
    title='Percentage international students',
)

layout = go.Layout(
    barmode='group',
    title='Comparison between 4 equally divided groups for every 50 rankings on the amount of international students',
    yaxis=go.layout.YAxis(**percentage_axis),
    xaxis=go.layout.XAxis(title='Year'),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')



In [22]:
# NOTE(review): `plotly.plotly` was moved out of plotly into the separate
# chart-studio package as of plotly 4, so this import fails on those versions.
# `py` is not used in this cell - confirm whether the import can be dropped.
import plotly.plotly as py
import plotly.graph_objs as go

# Per-year lists of group means, in the same order as `years`. These replace
# the four-element placeholder values that were plotted against six years.
stacked_means_per_year = [
    year1_means, year2_means, year3_means, year4_means, year5_means, year6_means
]


def _stacked_area_trace(group, label):
    """Build the stacked-area trace for one rank group.

    ``group`` is the index of the group inside each per-year list of means and
    ``label`` is the legend name. The trace holds one y value per entry of
    ``years``.
    """
    return dict(
        x=years,
        y=[year_means[group] for year_means in stacked_means_per_year],
        hoverinfo='x+y',
        mode='lines',
        stackgroup='one',
        name=label
    )


trace0 = _stacked_area_trace(0, 'Ranked 1-50')
trace1 = _stacked_area_trace(1, 'Ranked 51-100')
trace2 = _stacked_area_trace(2, 'Ranked 101-150')
trace3 = _stacked_area_trace(3, 'Ranked 151-200')
data = [trace0, trace1, trace2, trace3]

fig = dict(data=data)
iplot(fig, filename='stacked-area-plot-hover', validate=False)