In [2]:
import pandas as pd
import numpy as np
from plotly.tools import FigureFactory as FF
import json

In [3]:
def _to_numeric_if_possible(column):
    """Return `column` converted with pd.to_numeric, or unchanged when it cannot be parsed.

    Explicit replacement for the deprecated `pd.to_numeric(..., errors='ignore')`.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Read in the CSV dataset
complete_data = pd.read_csv("timesData.csv")

# Strip the leading '=' characters from the world_rank column
# and remove all rows with NaN in the international_students column
complete_data['world_rank'] = complete_data['world_rank'].str.lstrip('=')
complete_data = complete_data.dropna(axis=0, subset=['international_students'])

# Drop the rows whose world_rank still contains a '-', then convert every column
# that parses cleanly to a numeric dtype so calculations can be made with the values.
# This must happen before the per-column clean-up below: columns that still contain
# '%', ',' or '-' fail to parse and therefore stay strings for the .str calls.
complete_data = complete_data[~complete_data.world_rank.str.contains("-")]
complete_data = complete_data.apply(_to_numeric_if_possible)

# Remove the percent sign so the share of international students is a float
complete_data['international_students'] = (
    complete_data['international_students'].str.rstrip('%').astype(float)
)

# NOTE(review): a missing international score ('-') is replaced by 0, which pulls
# every mean computed from this column downwards - confirm this is intended
# (NaN would be skipped by .mean() instead).
complete_data.loc[complete_data['international'] == '-', 'international'] = 0
complete_data['international'] = complete_data['international'].astype(float)

# Remove the thousands separators from the num_students column
complete_data['num_students'] = (
    complete_data['num_students'].str.replace(',', '', regex=False).astype(int)
)

# Show only the first rows instead of dumping the whole frame
complete_data.head(10)

Unnamed: 0,world_rank,university_name,country,teaching,international,research,citations,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152,8.9,25.0,,2011
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243,6.9,27.0,33 : 67,2011
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074,9.0,33.0,37 : 63,2011
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596,7.8,22.0,42 : 58,2011
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,-,94.2,7929,8.4,27.0,45 : 55,2011
5,6,University of Cambridge,United Kingdom,90.5,77.7,94.1,94.0,57.0,91.2,18812,11.8,34.0,46 : 54,2011
6,6,University of Oxford,United Kingdom,88.2,77.2,93.9,95.1,73.5,91.2,19919,11.6,34.0,46 : 54,2011
7,8,"University of California, Berkeley",United States of America,84.2,39.6,99.3,97.8,-,91.1,36186,16.4,15.0,50 : 50,2011
8,9,Imperial College London,United Kingdom,89.2,90.0,94.5,88.3,92.9,90.6,15060,11.7,51.0,37 : 63,2011
9,10,Yale University,United States of America,92.1,59.2,89.7,91.5,-,89.5,11751,4.4,20.0,50 : 50,2011


In [4]:
def get_meantop200(year, column, country_divided = False, data = None):
    """Average `column` over the first 200 rows of one year.

    The frame is filtered on `year` and the first 200 remaining rows are kept
    (positional selection; assumes the rows are ordered by rank within a
    year - TODO confirm against the CSV).

    Parameters
    ----------
    year : value compared against the 'year' column.
    column : name of the column to average.
    country_divided : when true, average per country instead of per rank group.
    data : optional DataFrame to use; defaults to the notebook-level
        `complete_data` so existing calls keep working.

    Returns
    -------
    dict mapping country -> mean (not rounded) when `country_divided` is true,
    otherwise a numpy array with the means of rows 1-50, 51-100, 101-150 and
    151-200, rounded to one decimal.
    """
    if data is None:
        data = complete_data

    selected = data[data['year'] == year].iloc[:200]

    if country_divided:
        return {
            country: selected.loc[selected['country'] == country, column].mean()
            for country in selected['country'].unique()
        }

    # Consecutive, non-overlapping groups of 50 rows. The earlier slices
    # (51:100, 101:150, 151:200) silently dropped rows 50, 100 and 150.
    values = selected[column]
    means = [values.iloc[start:start + 50].mean() for start in range(0, 200, 50)]
    return np.around(means, 1)


#print(get_meantop200(2011, 'international_students', country_divided=True))
#test = get_meantop200(2011, 'international_students', country_divided=True)
#test.values()

In [5]:
# One entry per year (2011-2016); each entry holds the four rank-group means
# returned by get_meantop200 for that year.
means = [
    get_meantop200(year, 'international_students', country_divided=False)
    for year in range(2011, 2017)
]

# Average every rank group over all years (axis 0 = years), giving one value per
# group of 50. The earlier loop indexed means[i][j] with the roles swapped, so it
# averaged the four groups *within* each of the first four years instead.
mean_grouped = np.mean(means, axis=0).tolist()

In [6]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# Start plotly's notebook mode before the first figure is drawn
init_notebook_mode(connected=True)

# Bar chart: one bar per group of 50 ranking positions, using the values in mean_grouped
data = [go.Bar(
            x= ['Ranked 1-50', 'Ranked 51-100', 'Ranked 101-150', 'Ranked 151-200'],
            y= [mean_grouped[0], mean_grouped[1], mean_grouped[2], mean_grouped[3]],
            marker=dict(
                color='rgb(40, 180, 25)',
                line=dict(
                    color='rgb(8,48,107)',
                    width=3.5),
            ),
    )]


# The title and x-axis label used to be placeholder text; they now describe the chart.
layout = go.Layout(
    title = 'Average percentage of international students per group of 50 universities (all years)',
    yaxis = go.layout.YAxis(
        tickmode = 'linear',
        ticksuffix = "%",
        ticklen= 6,
        tickwidth= 3,
        title = 'Percentage international students'
    ),
    xaxis = go.layout.XAxis(
        title = 'World ranking group'
    ),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')

In [7]:
def get_meanitem(column):
    """Return the mean of `column` for each world rank from 1 to 200.

    For every rank the rows of `complete_data` whose 'world_rank' equals that
    rank are selected (no filtering on year) and `column` is averaged; a rank
    without any matching row yields NaN. The result is a numpy array of length
    200, rounded to one decimal, where position 0 corresponds to rank 1.
    """
    ranks = complete_data['world_rank']
    per_rank = [
        complete_data.loc[ranks == rank, column].mean()
        for rank in range(1, 201)
    ]
    return np.around(per_rank, 1)


# Per-rank averages (ranks 1-200, over every row of complete_data regardless of year);
# position k in each array belongs to world rank k + 1.
int_students = get_meanitem('international_students') 
int_score = get_meanitem('international')
total_score = get_meanitem('total_score')

In [8]:
# Half-open position ranges into the per-rank arrays: position k holds rank k + 1,
# so (50, 100) covers ranks 51-100. The earlier slices started at 51, 101 and 151
# and therefore left ranks 51, 101 and 151 out of the plot.
rank_groups = [(0, 50), (50, 100), (100, 150), (150, 200)]

# One marker trace per group of 50 ranks
data = [
    go.Scatter(
        x = int_score[start:stop],
        y = int_students[start:stop],
        mode = 'markers',
        name = 'Ranked {}-{}'.format(start + 1, stop)
    )
    for start, stop in rank_groups
]

layout = go.Layout(
    title = 'International score vs. percentage of international students (ranks 1-200)',
    yaxis = go.layout.YAxis(
        tick0 = 5,
        dtick = 5,
        ticksuffix = "%",
        ticklen= 6,
        tickwidth= 3,
        range = [5,36],
        title = 'Percentage international students'
    ),
    xaxis = go.layout.XAxis(
        title = 'International score',
        range = [30, 100]
    ),
)

fig = go.Figure(data=data, layout=layout)


# Plot and embed in ipython notebook!
iplot(fig, filename='basic-scatter')

In [9]:
# Half-open position ranges into the per-rank arrays: position k holds rank k + 1,
# so (50, 100) covers ranks 51-100. The earlier slices started at 51, 101 and 151
# and therefore left ranks 51, 101 and 151 out of the plot.
rank_groups = [(0, 50), (50, 100), (100, 150), (150, 200)]

# One marker trace per group of 50 ranks
data = [
    go.Scatter(
        x = total_score[start:stop],
        y = int_students[start:stop],
        mode = 'markers',
        name = 'Ranked {}-{}'.format(start + 1, stop)
    )
    for start, stop in rank_groups
]

layout = go.Layout(
    title = 'Total score vs. percentage of international students (ranks 1-200)',
    yaxis = go.layout.YAxis(
        tick0 = 5,
        dtick = 5,
        ticksuffix = "%",
        ticklen= 6,
        tickwidth= 3,
        range = [5,36],
        title = 'Percentage international students'
    ),
    xaxis = go.layout.XAxis(
        title = 'Total Score',
        range = [30, 100]
    ),
)

fig = go.Figure(data=data, layout=layout)


# Plot and embed in ipython notebook!
iplot(fig, filename='basic-scatter')

In [10]:
# Per (year, country): mean of every numeric column plus the number of rows.
# numeric_only=True is required because complete_data still holds text columns
# (e.g. university_name); without it recent pandas versions raise a TypeError.
by_year_country = complete_data.groupby(['year', 'country'])
country_stats = by_year_country.mean(numeric_only=True)
counts = by_year_country.size()
country_stats['count'] = counts

In [19]:
# Preview the first rows with the notebook's rich display instead of printing the whole frame
country_stats.head(20)

                               world_rank   teaching  international  \
year country                                                          
2011 Australia                  90.571429  49.657143      84.428571   
     Austria                   191.000000  42.750000      81.350000   
     Belgium                   121.500000  55.250000      27.150000   
     Canada                    106.888889  52.422222      38.622222   
     China                     105.333333  60.300000      36.783333   
     Denmark                   155.333333  42.800000      47.733333   
     Egypt                     147.000000  29.500000      19.300000   
     Finland                   102.000000  49.000000      24.200000   
     France                     80.250000  56.925000      47.775000   
     Germany                   145.214286  49.107143      52.450000   
     Hong Kong                  80.500000  47.775000      85.875000   
     Japan                      91.400000  70.300000      20.360000   
     N

In [11]:
# NOTE(review): this cell draws the same bar chart as the earlier 'basic-bar' cell;
# consider keeping only one of the two.
# Bar chart: one bar per group of 50 ranking positions, using the values in mean_grouped
data = [go.Bar(
            x= ['Ranked 1-50', 'Ranked 51-100', 'Ranked 101-150', 'Ranked 151-200'],
            y= [mean_grouped[0], mean_grouped[1], mean_grouped[2], mean_grouped[3]],
            marker=dict(
                color='rgb(40, 180, 25)',
                line=dict(
                    color='rgb(8,48,107)',
                    width=3.5),
            ),
    )]


# The title and x-axis label used to be placeholder text; they now describe the chart.
layout = go.Layout(
    title = 'Average percentage of international students per group of 50 universities (all years)',
    yaxis = go.layout.YAxis(
        tickmode = 'linear',
        ticksuffix = "%",
        ticklen= 6,
        tickwidth= 3,
        title = 'Percentage international students'
    ),
    xaxis = go.layout.XAxis(
        title = 'World ranking group'
    ),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')

In [136]:
def choro(raw, year, statistic, unit, zmax, rv):
    """Build one choropleth trace from a country-indexed frame.

    Parameters
    ----------
    raw : frame whose index holds the country names and which has a
        `statistic` column.
    year : stored as the single element of the trace's `customdata`.
    statistic : column of `raw` to plot; cast to float.
    unit : title of the colorbar.
    zmax : upper bound of the color scale (the lower bound is fixed at 20).
    rv : passed on as `reversescale`.

    Returns
    -------
    go.Choropleth
    """
    blue_to_red = [[0, 'rgb(50,50,255)'], [1, 'rgb(255,50,50)']]
    thin_black_border = go.choropleth.marker.Line(
        color = 'rgb(0,0,0)',
        width = .25
    )
    values = raw[statistic].astype(float)

    return go.Choropleth(
        # locations are matched by country name, taken from the index
        locations = raw.index,
        locationmode = 'country names',
        z = values,
        zmin = 20,
        zmax = zmax,
        colorscale = blue_to_red,
        autocolorscale = False,
        reversescale = rv,
        marker = go.choropleth.Marker(line = thin_black_border),
        colorbar = go.choropleth.ColorBar(title = unit),
        customdata = [year]
    )

def worldmap(statistic, title, unit, zmax, rv):
    """Plot a world choropleth of `statistic` with a slider to switch between years.

    One choropleth trace is built per year found in the first index level of
    `country_stats`; the slider toggles which trace is visible.

    Parameters
    ----------
    statistic : column of `country_stats` to plot.
    title : figure title.
    unit : colorbar title.
    zmax : upper bound of the color scale.
    rv : whether to reverse the color scale.
    """
    years = list(country_stats.index.levels[0])
    data = [choro(country_stats.loc[year], year, statistic, unit, zmax, rv) for year in years]

    # Start with only the first year shown so the map matches the slider's
    # initial position; previously every trace was visible and drawn on top of
    # the others until the slider was first moved.
    for position, trace in enumerate(data):
        trace.visible = (position == 0)

    # One slider step per year: it shows that year's trace and hides the rest.
    steps = []
    for position, year in enumerate(years):
        visibility = [other == position for other in range(len(data))]
        steps.append(dict(method='restyle',
                          args=['visible', visibility],
                          label='Year {}'.format(year)))

    sliders = [dict(active=0,
                    pad={"t": 1},
                    steps=steps)]

    layout = go.Layout(
        title = go.layout.Title(
            text = title
        ),
        geo = go.layout.Geo(
            scope = 'world',
            projection = go.layout.geo.Projection(type = 'equirectangular'),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
        sliders = sliders
    )

    fig = go.Figure(data = data, layout = layout)
    iplot(fig)

# for stat in country_stats.columns:
#     worldmap(stat)


# Map the 'international' column per country, with the color scale capped at 100 and not reversed
worldmap('international', 'Average international score for each country', 'Average score', 100, False)

In [104]:
def filter_yearly(year, column, sort = True):
    """Return the rows of `country_stats` for one year.

    `country_stats` is looked up with `.loc[year]`. When `sort` equals True
    the rows are ordered ascending by `column`; otherwise they are returned
    in their existing order.
    """
    yearly = country_stats.loc[year]
    return yearly.sort_values(by=[column]) if sort == True else yearly

In [106]:
# Dot plot of the average international score per country for 2011,
# ordered by score (filter_yearly sorts ascending on the given column).
stats_2011 = filter_yearly(2011, 'international')

trace1 = {"x": stats_2011['international'],
          "y": stats_2011.index,
          "marker": {"color": "blue", "size": 12},
          "mode": "markers",
          "name": "2011",
          "type": "scatter"
}

data = [trace1]
# The title and axis labels were left over from an unrelated chart
# (earnings / salary / school); they now describe the plotted data.
layout = {"title": "Average international score per country (2011)",
          "xaxis": {"title": "Average international score", },
          "yaxis": {"title": "Country"}}

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic_dot-plot')

In [168]:
def Scatter(x_values, y_values, marker, name, visible):
    """Return a plain dict describing a markers-only scatter trace.

    The arguments are stored unchanged under the keys "x", "y", "marker",
    "name" and "visible"; "mode" is always "markers" and "type" is always
    "scatter".
    """
    return dict(
        x=x_values,
        y=y_values,
        marker=marker,
        mode="markers",
        name=name,
        type="scatter",
        visible=visible,
    )

In [171]:
from plotly.tools import FigureFactory as FF
import json

# Dot plot of the average international score per country with a dropdown to pick the year.
# One trace is built for every year in country_stats, so the number of traces always
# matches the length of the visibility masks used by the dropdown buttons (the earlier
# version had two hand-written traces but six-entry masks).
years = list(country_stats.index.levels[0])

data = []
for position, year in enumerate(years):
    yearly = filter_yearly(year, 'international')
    data.append(Scatter(yearly['international'],
                        yearly.index,
                        marker = {"color": "blue", "size": 12},
                        name = str(year),
                        visible = (position == 0),  # only the first year is shown initially
                       ))

# The title and axis labels were left over from an unrelated chart
# (earnings / salary / school); they now describe the plotted data.
layout = {"title": "Average international score per country",
          "xaxis": {"title": "Average international score", },
          "yaxis": {"title": "Country"}}

# One button per year: it shows that year's trace and hides all others.
buttons = [
    dict(
        args=['visible', [other == position for other in range(len(years))]],
        label=str(year),
        method='restyle'
    )
    for position, year in enumerate(years)
]

updatemenus = [
    dict(
        buttons = buttons,
        direction = 'down',
        pad = {'r': 10, 't': 10},
        showactive = True,
        x = 0.1,
        xanchor = 'left',
        y = 1.1,
        yanchor = 'top'
    ),
]


layout['updatemenus'] = updatemenus

fig = dict(data=data, layout=layout)
iplot(fig, filename='basic_dot-plot')