In [2]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np

## Question 1

In [3]:
# Root URL of the topuniversities website; links to university detail pages are appended to it.
TOP_UNIV_BASE = "https://www.topuniversities.com"
# Page of the 2018 world university ranking.
TOP_UNIV_WEBSITE = "https://www.topuniversities.com/university-rankings/world-university-rankings/2018"
# File containing the general information about each university on the topuniversities website.
TOP_UNIV_DATA_INDICATOR = "https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051_indicators.txt"

In [4]:
# Download the file containing information about the universities.
data_indicator = requests.get(TOP_UNIV_DATA_INDICATOR)

# The response body is JSON text; decode it into Python objects.
indicator_data = json.loads(data_indicator.text)

In [5]:
# Utility function to parse integers
def parse_int(string):
    """Convert a number written with comma thousands separators (e.g. "11,067") to an int."""
    return int(string.replace(',', ''))

def _read_count(details_soup, css_class):
    """Read one integer counter from a parsed university detail page.

    Looks up the ``div`` whose class attribute is ``css_class`` and parses the
    text of the ``div`` with class ``number`` nested inside it.

    Returns the parsed int, or ``np.nan`` when the element is missing or its
    text cannot be parsed as an integer.
    """
    try:
        container = details_soup.find("div", {"class": css_class})
        return parse_int(container.find("div", {"class": "number"}).text)
    except (AttributeError, ValueError):
        # AttributeError: find() returned None (element absent from the page).
        # ValueError: int() rejected the text found in the element.
        return np.nan


# For each university entry in the indicator file, we get the details of the university.
def get_university_info_from_indicator(raw_university):
    """Collect the requested information about one university.

    Parameters
    ----------
    raw_university : dict
        One entry of the indicator file. The keys read here are ``"uni"``
        (an HTML fragment whose first ``<a>`` holds the name and the link to
        the detail page), ``"overall_rank"``, ``"region"`` and ``"location"``.

    Returns
    -------
    list
        ``[name, rank, country, region, inter_staff, total_staff,
        inter_students, total_students]``. Each of the four counters is
        ``np.nan`` when it cannot be read from the detail page; a missing
        counter no longer discards the counters that are available.
    """
    overview_soup = BeautifulSoup(raw_university["uni"], 'html.parser')
    link = overview_soup.a

    name = link.text
    rank = raw_university["overall_rank"]
    region = raw_university["region"]
    country = raw_university["location"]

    # Fetch the detail page; strip leading slashes from the href so the URL
    # does not end up with a doubled slash when the href is root-relative.
    details_request = requests.get(TOP_UNIV_BASE + "/" + link.attrs['href'].lstrip("/"))
    details_soup = BeautifulSoup(details_request.text, 'html.parser')

    # Each counter is parsed independently so that one missing figure does not
    # blank out the other three.
    total_staff = _read_count(details_soup, "total faculty")
    inter_staff = _read_count(details_soup, "inter faculty")
    total_students = _read_count(details_soup, "total student")
    inter_students = _read_count(details_soup, "total inter")

    return [name, rank, country, region, inter_staff, total_staff, inter_students, total_students]


In [6]:
# Column order matches the list returned by get_university_info_from_indicator.
TOP_UNIV_COLUMNS = ["name", "rank", "country", "region", "inter_staff", "total_staff", "inter_students", "total_students"]

# Visit the first 200 entries of the indicator file, one after the other.
univ_list = []
for i, raw_university in enumerate(indicator_data["data"][:200]):
    # The carriage return rewrites the same output line, giving a simple progress counter.
    print(i, end="\r")
    univ_list.append(get_university_info_from_indicator(raw_university))

top_univ_df = pd.DataFrame(univ_list, columns=TOP_UNIV_COLUMNS)

199

In [7]:
# Bare last expression: the notebook renders the table as this cell's output.
top_univ_df

Unnamed: 0,name,rank,country,region,inter_staff,total_staff,inter_students,total_students
0,Massachusetts Institute of Technology (MIT),1,United States,North America,1679.0,2982.0,3717.0,11067.0
1,Stanford University,2,United States,North America,2042.0,4285.0,3611.0,15878.0
2,Harvard University,3,United States,North America,1311.0,4350.0,5266.0,22429.0
3,California Institute of Technology (Caltech),4,United States,North America,350.0,953.0,647.0,2255.0
4,University of Cambridge,5,United Kingdom,Europe,2278.0,5490.0,6699.0,18770.0
5,University of Oxford,6,United Kingdom,Europe,2964.0,6750.0,7353.0,19720.0
6,UCL (University College London),7,United Kingdom,Europe,2554.0,6345.0,14854.0,31080.0
7,Imperial College London,8,United Kingdom,Europe,2071.0,3930.0,8746.0,16090.0
8,University of Chicago,9,United States,North America,635.0,2449.0,3379.0,13557.0
9,ETH Zurich - Swiss Federal Institute of Techno...,10,Switzerland,Europe,1886.0,2477.0,7563.0,19815.0


In [8]:
# Save the table to disk. With to_csv's default settings the row index is written as the first column.
top_univ_df.to_csv("top_univ_df.csv")

In [9]:
# Reload the saved table, rebinding top_univ_df. usecols=range(1, 9) keeps columns 1 to 8
# and skips column 0, which holds the row index written by to_csv.
top_univ_df = pd.read_csv("top_univ_df.csv", index_col=False, usecols=range(1, 9))

In [10]:
def print_best_university_with_ratio(df):
    """Display the single row of ``df`` that comes first when sorting by "ratio" in descending order."""
    display(df.sort_values("ratio", ascending=False).head(1))
    
def print_best_university_with_ratio_grouped(df_grouped):
    """Display, for every group, the row(s) holding that group's highest "ratio".

    Parameters
    ----------
    df_grouped : pandas DataFrameGroupBy
        Grouped frame whose groups contain a "ratio" column.

    The displayed frame is indexed by the group key only. Rows tied for the
    maximum within a group are all shown.
    """
    best_per_group = df_grouped.apply(
        lambda group: group[group["ratio"] == group["ratio"].max()]
    )
    # apply() stacks the original row labels as a second index level. Drop that
    # level directly instead of moving it to a "level_1" column and removing it
    # with drop("level_1", 1): recent pandas versions reject the positional axis
    # argument, and the column is only called "level_1" when the index is unnamed.
    display(best_per_group.reset_index(level=1, drop=True))

def append_column_to_dataframe(dataframe, column, column_name):
    """Return a copy of ``dataframe`` with ``column`` stored under ``column_name``.

    The frame passed in is left untouched; the assignment is done on the copy.
    """
    extended = dataframe.copy()
    extended.loc[:, column_name] = column
    return extended

# Ratio (a): staff divided by the sum of staff and students.
members_to_studends_ratio = top_univ_df["total_staff"] / (top_univ_df["total_staff"] + top_univ_df["total_students"])
top_univ_df_with_ratio_a = append_column_to_dataframe(top_univ_df, members_to_studends_ratio, "ratio")

# Ratio (b): international students divided by all students.
inter_students_ratio = top_univ_df["inter_students"] / top_univ_df["total_students"]
top_univ_df_with_ratio_b = append_column_to_dataframe(top_univ_df, inter_students_ratio, "ratio")

# Overall winners.
print("The best university in term of ratio between faculty members and students is")
print_best_university_with_ratio(top_univ_df_with_ratio_a)
print("\n\nThe best university in term of  ratio of international students is")
print_best_university_with_ratio(top_univ_df_with_ratio_b)

# Winners inside each country, then inside each region.
for group_key in ("country", "region"):
    print("\n\nThe best university in term of ratio between faculty members and students per " + group_key + " are")
    print_best_university_with_ratio_grouped(top_univ_df_with_ratio_a.groupby(group_key))
    print("\n\nThe best university in term of  ratio of international students per " + group_key + " are")
    print_best_university_with_ratio_grouped(top_univ_df_with_ratio_b.groupby(group_key))

The best university in term of ratio between faculty members and students is


Unnamed: 0,name,rank,country,region,inter_staff,total_staff,inter_students,total_students,ratio
3,California Institute of Technology (Caltech),4,United States,North America,350.0,953.0,647.0,2255.0,0.29707




The best university in term of  ratio of international students is


Unnamed: 0,name,rank,country,region,inter_staff,total_staff,inter_students,total_students,ratio
34,London School of Economics and Political Scien...,35,United Kingdom,Europe,687.0,1088.0,6748.0,9760.0,0.691393




The best university in term of ratio between faculty members and students per country are


Unnamed: 0_level_0,name,rank,country,region,inter_staff,total_staff,inter_students,total_students,ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Argentina,Universidad de Buenos Aires (UBA),75,Argentina,Latin America,3165.0,16421.0,27109.0,122301.0,0.118373
Australia,The Australian National University,20,Australia,Oceania,927.0,1600.0,5551.0,14442.0,0.099738
Austria,University of Vienna,154,Austria,Europe,1400.0,3411.0,14468.0,45967.0,0.069079
Belgium,Vrije Universiteit Brussel (VUB),182,Belgium,Europe,515.0,1792.0,1853.0,9284.0,0.161791
Brazil,Universidade de São Paulo,121,Brazil,Latin America,279.0,5582.0,2086.0,65711.0,0.078297
Canada,McMaster University,140,Canada,North America,1170.0,3231.0,3548.0,23702.0,0.119964
Chile,Pontificia Universidad Católica de Chile (UC),137,Chile,Latin America,198.0,2260.0,991.0,27003.0,0.077231
China,Tsinghua University,25,China,Asia,932.0,5506.0,4072.0,36300.0,0.131704
Denmark,Technical University of Denmark,116,Denmark,Europe,966.0,2117.0,2098.0,8878.0,0.192542
Finland,University of Helsinki,102,Finland,Europe,535.0,2645.0,1234.0,22419.0,0.10553




The best university in term of  ratio of international students per country are


Unnamed: 0_level_0,name,rank,country,region,inter_staff,total_staff,inter_students,total_students,ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Argentina,Universidad de Buenos Aires (UBA),75,Argentina,Latin America,3165.0,16421.0,27109.0,122301.0,0.221658
Australia,The University of Melbourne,41,Australia,Oceania,1477.0,3311.0,18030.0,42182.0,0.427434
Austria,University of Vienna,154,Austria,Europe,1400.0,3411.0,14468.0,45967.0,0.314748
Belgium,Vrije Universiteit Brussel (VUB),182,Belgium,Europe,515.0,1792.0,1853.0,9284.0,0.199591
Brazil,Universidade Estadual de Campinas (Unicamp),182,Brazil,Latin America,109.0,1968.0,966.0,26572.0,0.036354
Canada,McGill University,32,Canada,North America,1220.0,3646.0,9540.0,28837.0,0.330825
Chile,Pontificia Universidad Católica de Chile (UC),137,Chile,Latin America,198.0,2260.0,991.0,27003.0,0.0367
China,Peking University,38,China,Asia,1038.0,5185.0,7090.0,42136.0,0.168265
Denmark,Technical University of Denmark,116,Denmark,Europe,966.0,2117.0,2098.0,8878.0,0.236314
Finland,Aalto University,137,Finland,Europe,370.0,1257.0,1831.0,12147.0,0.150737




The best university in term of ratio between faculty members and students per region are


Unnamed: 0_level_0,name,rank,country,region,inter_staff,total_staff,inter_students,total_students,ratio
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Africa,University of Cape Town,191,South Africa,Africa,379.0,1733.0,3325.0,19593.0,0.081262
Asia,Pohang University of Science And Technology (P...,71,South Korea,Asia,113.0,664.0,126.0,3117.0,0.175615
Europe,University of Oxford,6,United Kingdom,Europe,2964.0,6750.0,7353.0,19720.0,0.255006
Latin America,Instituto Tecnológico y de Estudios Superiores...,199,Mexico,Latin America,821.0,1822.0,1412.0,13376.0,0.119884
North America,California Institute of Technology (Caltech),4,United States,North America,350.0,953.0,647.0,2255.0,0.29707
Oceania,The Australian National University,20,Australia,Oceania,927.0,1600.0,5551.0,14442.0,0.099738




The best university in term of  ratio of international students per region are


Unnamed: 0_level_0,name,rank,country,region,inter_staff,total_staff,inter_students,total_students,ratio
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Africa,University of Cape Town,191,South Africa,Africa,379.0,1733.0,3325.0,19593.0,0.169703
Asia,The University of Hong Kong,26,Hong Kong,Asia,2085.0,3012.0,8230.0,20214.0,0.407144
Europe,London School of Economics and Political Scien...,35,United Kingdom,Europe,687.0,1088.0,6748.0,9760.0,0.691393
Latin America,Universidad de Buenos Aires (UBA),75,Argentina,Latin America,3165.0,16421.0,27109.0,122301.0,0.221658
North America,Carnegie Mellon University,47,United States,North America,425.0,1342.0,6385.0,13356.0,0.478062
Oceania,The University of Melbourne,41,Australia,Oceania,1477.0,3311.0,18030.0,42182.0,0.427434


## Question 2

In [13]:
# Location of the JSON file holding the Times Higher Education 2018 ranking data.
TIMES_HIGHER_DATA = "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"

# Download the file, then decode its JSON body.
th_response = requests.get(TIMES_HIGHER_DATA)
th_data = json.loads(th_response.text)

In [54]:
# Map every country of top_univ_df to a one-element list holding its most frequent region.
map_country_region = {
    country: [regions.value_counts().index[0]]
    for country, regions in top_univ_df.groupby('country')['region']
}

# Add missing values
map_country_region['Luxembourg'] = ['Europe']
map_country_region

{'Argentina': ['Latin America'],
 'Australia': ['Oceania'],
 'Austria': ['Europe'],
 'Belgium': ['Europe'],
 'Brazil': ['Latin America'],
 'Canada': ['North America'],
 'Chile': ['Latin America'],
 'China': ['Asia'],
 'Denmark': ['Europe'],
 'Finland': ['Europe'],
 'France': ['Europe'],
 'Germany': ['Europe'],
 'Hong Kong': ['Asia'],
 'India': ['Asia'],
 'Ireland': ['Europe'],
 'Israel': ['Asia'],
 'Italy': ['Europe'],
 'Japan': ['Asia'],
 'Luxembourg': ['Europe'],
 'Malaysia': ['Asia'],
 'Mexico': ['Latin America'],
 'Netherlands': ['Europe'],
 'New Zealand': ['Oceania'],
 'Norway': ['Europe'],
 'Russia': ['Europe'],
 'Russian Federation': ['Europe'],
 'Saudi Arabia': ['Asia'],
 'Singapore': ['Asia'],
 'South Africa': ['Africa'],
 'South Korea': ['Asia'],
 'Spain': ['Europe'],
 'Sweden': ['Europe'],
 'Switzerland': ['Europe'],
 'Taiwan': ['Asia'],
 'United Kingdom': ['Europe'],
 'United States': ['North America']}

In [57]:
def parse_percentage(string):
    """Turn a percentage string such as "38%" into the corresponding fraction (0.38)."""
    digits = string.strip('%')
    return float(digits) / 100

def th_get_university_info(raw_university):
    """Build the row of relevant fields for one raw university entry.

    Returns ``[name, rank, country, region, total_students,
    inter_students_ratio, student_faculty_ratio]``.
    """
    # Countries renamed before the region lookup, so that the lookup uses the 'Russia' key.
    country_aliases = {'Russian Federation': 'Russia'}

    name = raw_university["aliases"]
    # Remove '=' characters from both ends of the rank string.
    rank = raw_university["rank"].strip('=')
    location = raw_university["location"]
    country = country_aliases.get(location, location)

    # Each value of the mapping is a one-element list holding the region.
    region = map_country_region[country][0]

    return [
        name,
        rank,
        country,
        region,
        parse_int(raw_university["stats_number_students"]),
        parse_percentage(raw_university["stats_pc_intl_students"]),
        float(raw_university["stats_student_staff_ratio"]),
    ]

In [58]:
# Column order matches the list returned by th_get_university_info.
TIMES_HIGHER_COLUMNS = ["name", "rank", "country", "region", "total_students", "inter_students_ratio", "student_faculty_ratio"]

# Extract data for the first 200 universities
th_univ_list = []
for i, raw_university in enumerate(th_data["data"][:200]):
    # The carriage return rewrites the same output line, giving a simple progress counter.
    print(i, end="\r")
    th_univ_list.append(th_get_university_info(raw_university))

times_higher_df = pd.DataFrame(th_univ_list, columns=TIMES_HIGHER_COLUMNS)
times_higher_df.to_csv("times_higher_df.csv")
times_higher_df

0123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199

Unnamed: 0,name,rank,country,region,total_students,inter_students_ratio,student_faculty_ratio
0,University of Oxford,1,United Kingdom,Europe,20409,0.38,11.2
1,University of Cambridge,2,United Kingdom,Europe,18389,0.35,10.9
2,California Institute of Technology caltech,3,United States,North America,2209,0.27,6.5
3,Stanford University,3,United States,North America,15845,0.22,7.5
4,Massachusetts Institute of Technology,5,United States,North America,11177,0.34,8.7
5,Harvard University,6,United States,North America,20326,0.26,8.9
6,Princeton University,7,United States,North America,7955,0.24,8.3
7,Imperial College London,8,United Kingdom,Europe,15857,0.55,11.4
8,University of Chicago,9,United States,North America,13525,0.25,6.2
9,ETH Zurich – Swiss Federal Institute of Techno...,10,Switzerland,Europe,19233,0.38,14.6
