In [115]:
import pandas as pd
from plotly.offline import init_notebook_mode, iplot

In [167]:
# Read the raw CSV dataset.
complete_data = pd.read_csv("timesData.csv")

# Strip the leading '=' characters from the world_rank column (presumably a
# marker for tied positions -- confirm against the data) and drop every row
# that has no value in the international_students column.
# `.str.lstrip` leaves a missing rank as NaN instead of raising like a
# Python-level `x.lstrip(...)` call on a float would.
complete_data['world_rank'] = complete_data['world_rank'].str.lstrip('=')
complete_data = complete_data.dropna(axis=0, subset=['international_students'])

# Drop rows whose rank still contains a '-' (presumably banded ranks that
# cannot be read as a single number -- confirm against the data).
# `na=False` keeps the boolean mask free of NaN so it is always a valid indexer.
has_dash = complete_data['world_rank'].str.contains("-", regex=False, na=False)
complete_data = complete_data[~has_dash]


def _numeric_if_possible(column):
    """Return `column` converted by `pd.to_numeric`, or unchanged if it cannot be parsed.

    This reproduces what `pd.to_numeric(column, errors='ignore')` did, without
    relying on the deprecated `errors='ignore'` option.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert every column that is fully numeric to a numeric dtype so calculations
# and comparisons (such as the rank filter below) work; other columns stay as-is.
complete_data = complete_data.apply(_numeric_if_possible)

# Keep only the entries for the year 2016 that are ranked in the top 200.
is_2016 = complete_data['year'] == 2016
is_top_200 = complete_data['world_rank'] <= 200
latest_year = complete_data[is_2016 & is_top_200]

latest_year

Unnamed: 0,world_rank,university_name,country,teaching,international,research,citations,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year
1803,1,California Institute of Technology,United States of America,95.6,64.0,97.6,99.8,97.8,95.2,2243,6.9,27%,33 : 67,2016
1804,2,University of Oxford,United Kingdom,86.5,94.4,98.9,98.8,73.1,94.2,19919,11.6,34%,46 : 54,2016
1805,3,Stanford University,United States of America,92.5,76.3,96.2,99.9,63.3,93.9,15596,7.8,22%,42 : 58,2016
1806,4,University of Cambridge,United Kingdom,88.2,91.5,96.7,97.0,55.0,92.8,18812,11.8,34%,46 : 54,2016
1807,5,Massachusetts Institute of Technology,United States of America,89.4,84.0,88.6,99.7,95.4,92.0,11074,9.0,33%,37 : 63,2016
1808,6,Harvard University,United States of America,83.6,77.2,99.0,99.8,45.2,91.6,20152,8.9,25%,,2016
1809,7,Princeton University,United States of America,85.1,78.5,91.9,99.3,52.1,90.1,7929,8.4,27%,45 : 55,2016
1810,8,Imperial College London,United Kingdom,83.3,96.0,88.5,96.7,53.7,89.1,15060,11.7,51%,37 : 63,2016
1811,9,ETH Zurich – Swiss Federal Institute of Techno...,Switzerland,77.0,97.9,95.0,91.1,80.0,88.3,18178,14.7,37%,31 : 69,2016
1812,10,University of Chicago,United States of America,85.7,65.0,88.9,99.2,36.6,87.9,14221,6.9,21%,42 : 58,2016


In [153]:
# NOTE(review): `py` and `np` are not used anywhere in this cell; they are kept
# in case other cells rely on them -- confirm and remove if not needed.
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

# Enable plotly's offline notebook rendering before the first iplot call.
init_notebook_mode(connected=True)

# Line trace: total score (y) against world rank (x) for the filtered 2016 data.
trace1 = go.Scatter(
    x=latest_year['world_rank'],
    y=latest_year['total_score'],
    mode='lines',
)

data = [trace1]

# Chart title and axis labels.
layout = dict(
    title='The total score for the top 200 of the most recent year (2016)',
    xaxis=dict(title='Position in the world ranking'),
    yaxis=dict(title='Total score'),
)

fig = dict(data=data, layout=layout)
iplot(fig, filename='basic-line')

In [154]:
# One bar per row of latest_year, numbered 1..len(latest_year) in row order
# (presumably rank order -- confirm the frame is sorted by world_rank).
# The positions are derived from the frame's actual length so x and y always
# have the same number of entries, even if rows were dropped during cleaning.
N = list(range(1, len(latest_year) + 1))

# international_students can hold strings such as "27%"; strip the suffix and
# convert to numbers so the y axis is numeric and the linear tick settings and
# '%' tick suffix below apply. `astype(str)` keeps this working if the column
# has already been converted to numbers elsewhere.
international_pct = pd.to_numeric(
    latest_year['international_students'].astype(str).str.rstrip('%'),
    errors='coerce',
)

data = [go.Bar(
            x=N,
            y=international_pct
    )]


layout = go.Layout(
    title = 'Share of international students for the top 200 (2016)',
    xaxis = go.layout.XAxis(
        title = 'Position in the world ranking'
    ),
    yaxis = go.layout.YAxis(
        title = 'International students',
        tickmode = 'linear',
        tick0 = 0,
        dtick = 5,
        ticksuffix = "%"
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')

In [155]:
# Grouped bar chart: one trace per block of 50 ranks, one bar group per year.
# TODO(review): the y values below are hard-coded and are not computed from
# latest_year / complete_data -- presumably placeholders; replace them with
# values derived from the dataset.
years = ['2011', '2012', '2013', '2014', '2015', '2016']

# (legend label, value per year) for each rank group.
rank_groups = [
    ('Ranked 1-50',    [20, 14, 23, 4, 7, 29]),
    ('Ranked 51-100',  [20, 14, 23, 4, 7, 18]),
    ('Ranked 101-150', [20, 14, 23, 4, 7, 12]),
    ('Ranked 151-200', [20, 14, 23, 4, 7, 6]),
]

data = [go.Bar(x=years, y=values, name=label) for label, values in rank_groups]

# Keep the individual trace names available for any code that refers to them.
trace1, trace2, trace3, trace4 = data

layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')