In [1]:
# Import libraries
import pandas as pd
import seaborn
import json
%matplotlib inline
import requests
from bs4 import BeautifulSoup

# Scraping

#### We start by using Postman to inspect the GET requests sent by both websites, in order to find the URLs that load the rankings on each site. We then request those URLs directly, parse the responses as JSON, and extract two dictionaries that will be used to build the dataframes.

In [2]:
# URLs of the JSON documents that back the two ranking pages.
THE_RANKING_URL = 'https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'
QS_RANKING_URL = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1508593854921'

r_THE = requests.get(THE_RANKING_URL)
r_QS = requests.get(QS_RANKING_URL)

# Fail fast with an explicit HTTP error (4xx/5xx) instead of an opaque
# JSON decoding error when one of the downloads did not succeed.
r_THE.raise_for_status()
r_QS.raise_for_status()

data_THE = r_THE.json()  # ranking data for timeshighereducation.com
data_QS = r_QS.json()  # ranking data for topuniversities.com

##### In each JSON document, only the `data` entry is of interest, so that is the only part we keep

In [3]:
# Show the top-level keys of each JSON payload (THE first, then QS).
for payload in (data_THE, data_QS):
    print(list(payload.keys()))

['subjects', 'locations', 'pillars', 'data']
['data']


In [4]:
# Build one DataFrame per ranking from the 'data' entry of its JSON payload.
df_rank_THE, df_rank_QS = (
    pd.DataFrame(payload['data']) for payload in (data_THE, data_QS)
)

##### We create the two dataframes and drop all the universities ranked outside the top 200 (rank greater than 200)

In [5]:
def _rank_to_int(label, range_dash):
    """Remove '=', `range_dash` and '+' from a rank label and return the rest as an int."""
    for symbol in ('=', range_dash, '+'):
        label = label.replace(symbol, '')
    return int(label)


# Rank labels carry special characters that have to go before the cast to int.
# Removing the dash glues the two ends of a range together (presumably labels
# such as '201–250' -> 201250 — TODO confirm format), which is far above 200,
# so those rows are removed by the filter below.

# THE: the dash stripped here is an en dash.
df_rank_THE['rank'] = df_rank_THE['rank'].map(lambda label: _rank_to_int(label, '–'))
the_beyond_200 = df_rank_THE.index[df_rank_THE['rank'] > 200]
df_rank_THE = df_rank_THE.drop(the_beyond_200)

# QS: the dash stripped here is a plain hyphen.
df_rank_QS['rank_display'] = df_rank_QS['rank_display'].map(lambda label: _rank_to_int(label, '-'))
qs_beyond_200 = df_rank_QS.index[df_rank_QS['rank_display'] > 200]
df_rank_QS = df_rank_QS.drop(qs_beyond_200)

##### Then, we select the required features for both dataframes

In [6]:
# Columns kept from the THE ranking; 'name' becomes the (alphabetically sorted) index.
THE_features = [
    'name',
    'location',
    'rank',
    'stats_number_students',
    'stats_pc_intl_students',
    'stats_student_staff_ratio',
]
df_rank_THE = (
    df_rank_THE[THE_features]
    .set_index('name')
    .sort_index(ascending=True)
)


In [None]:
QS_features = ['country', 'region', 'rank_display']

# Index the QS ranking by university title, sorted alphabetically.
qs_by_title = df_rank_QS.set_index('title').sort_index(ascending=True)

# The ranking payload is not exhaustive: keep each university's url so that
# further information can be looked up later.
urls = qs_by_title['url']
df_rank_QS = qs_by_title[QS_features]

# Question 1

##### For the first website, we need to scrape more data from the detail page of each university. Postman was very helpful for finding the tags that contain the corresponding information.

In [None]:
link = 'https://www.topuniversities.com/'
university = {'title': [],
              'faculty members total': [],
              'faculty members international': [],
              'students total': [],
              'students international': []}


def _number_tags(soup, css_class):
    """Return the 'number' <div> tags nested in the first <div> matching `css_class`.

    An empty list is returned when no <div> with that class exists on the page.
    """
    wrapper = soup.find('div', class_=css_class)
    return [] if wrapper is None else wrapper.find_all('div', class_='number')


def _count_at(tags, position):
    """Parse the text of `tags[position]` as an int after removing commas.

    Returns None when `tags` has no element at `position`, so a missing figure
    is recorded as a missing value instead of raising IndexError.
    """
    if position >= len(tags):
        return None
    return int(tags[position].text.replace(',', ''))


# `urls` is indexed by university title, so items() yields (title, url value).
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for title, path in urls.items():
    soup = BeautifulSoup(requests.get(link + path).text, 'html.parser')

    # The faculty box may hold two numbers (total, then international); the two
    # student boxes are read for their first number only.
    faculty = _number_tags(soup, 'faculty-main wrapper col-md-4 item active')
    students = _number_tags(soup, 'students-main wrapper col-md-4 item')
    int_students = _number_tags(soup, 'int-students-main wrapper col-md-4 item')

    # Append exactly one value to every list per university so all columns
    # stay aligned, using None for anything the page does not provide.
    university['title'].append(title)
    university['faculty members total'].append(_count_at(faculty, 0))
    university['faculty members international'].append(_count_at(faculty, 1))
    university['students total'].append(_count_at(students, 0))
    university['students international'].append(_count_at(int_students, 0))
        

##### We join the original dataframe with the additional information collected and rearrange the headers in an intuitive way

In [None]:
# Turn the scraped lists into a frame indexed by university title ...
df_rank_QS_supp = pd.DataFrame.from_dict(university).set_index('title')
# ... and attach its columns to the ranking frame through the shared index.
df_rank_QS = df_rank_QS.join(df_rank_QS_supp)

In [None]:
# Pair every flat column name with its two-level (group, field) label.
# The relabelling is done by NAME rather than by position: the order of the
# scraped columns depends on how pandas orders dict keys (insertion order on
# current versions, alphabetical on old ones), so assigning labels positionally
# can swap the 'total' and 'international' figures.
QS_column_labels = [
    ('country', ('', 'country')),
    ('region', ('', 'region')),
    ('rank_display', ('', 'rank')),
    ('faculty members international', ('faculty members', 'international')),
    ('faculty members total', ('faculty members', 'total')),
    ('students international', ('students', 'international')),
    ('students total', ('students', 'total')),
]

# Put the columns in the intended display order first, then apply the labels.
df_rank_QS = df_rank_QS[[flat_name for flat_name, _ in QS_column_labels]]
col = [label for _, label in QS_column_labels]
df_rank_QS.columns = pd.MultiIndex.from_tuples(col)
df_rank_QS.index.name = 'name'

## University

In [None]:
# Work on an independent copy limited to the faculty and student counts.
QS_ratio_university = df_rank_QS[['faculty members', 'students']].copy()

# Both ratios share the total number of students as denominator.
qs_students_total = QS_ratio_university[('students', 'total')]
QS_ratio_university[('ratio', 'faculty/students')] = (
    QS_ratio_university[('faculty members', 'total')] / qs_students_total
)
QS_ratio_university[('ratio', 'international/total')] = (
    QS_ratio_university[('students', 'international')] / qs_students_total
)

### Best faculty/students ratio

In [None]:
# Five universities with the highest faculty/students ratio.
QS_ratio_university.sort_values(by=('ratio', 'faculty/students'), ascending=False).head(n=5)

### Best international ratio

In [None]:
# Five universities with the highest share of international students.
QS_ratio_university.sort_values(by=('ratio', 'international/total'), ascending=False).head(n=5)

## Country

In [None]:
# Sum the counts per country, then keep only the faculty and student groups.
qs_country_sums = df_rank_QS.copy().groupby([('', 'country')]).sum()
QS_ratio_country = qs_country_sums[['faculty members', 'students']]
QS_ratio_country.index.name = 'Country'

# Ratios are recomputed from the summed counts.
QS_ratio_country[('ratio', 'faculty/students')] = (
    QS_ratio_country[('faculty members', 'total')] / QS_ratio_country[('students', 'total')]
)
QS_ratio_country[('ratio', 'international/total')] = (
    QS_ratio_country[('students', 'international')] / QS_ratio_country[('students', 'total')]
)

### Best faculty/student ratio

In [None]:
# Five countries with the highest faculty/students ratio.
QS_ratio_country.sort_values(by=('ratio', 'faculty/students'), ascending=False).head(n=5)

### Best international ratio

In [None]:
# Five countries with the highest share of international students.
QS_ratio_country.sort_values(by=('ratio', 'international/total'), ascending=False).head(n=5)

## Region

In [None]:
# Sum the counts per region, then keep only the faculty and student groups.
qs_region_sums = df_rank_QS.copy().groupby([('', 'region')]).sum()
QS_ratio_region = qs_region_sums[['faculty members', 'students']]
QS_ratio_region.index.name = 'Region'

# Ratios are recomputed from the summed counts.
QS_ratio_region[('ratio', 'faculty/students')] = (
    QS_ratio_region[('faculty members', 'total')] / QS_ratio_region[('students', 'total')]
)
QS_ratio_region[('ratio', 'international/total')] = (
    QS_ratio_region[('students', 'international')] / QS_ratio_region[('students', 'total')]
)

### Best faculty/student ratio

In [None]:
# Five regions with the highest faculty/students ratio.
QS_ratio_region.sort_values(by=('ratio', 'faculty/students'), ascending=False).head(n=5)

### Best international ratio

In [None]:
# Five regions with the highest share of international students.
QS_ratio_region.sort_values(by=('ratio', 'international/total'), ascending=False).head(n=5)

# Question 2

In [None]:
# Normalise the three THE statistics, which arrive as strings:
#   - student count: commas removed (the value stays a string at this point)
#   - international share: '%' removed, then scaled to a fraction
#   - student/staff ratio: inverted, giving a staff-per-student ratio
df_rank_THE = df_rank_THE.assign(
    stats_number_students=df_rank_THE['stats_number_students'].map(
        lambda text: text.replace(',', '')),
    stats_pc_intl_students=df_rank_THE['stats_pc_intl_students'].map(
        lambda text: float(text.replace('%', '')) / 100),
    stats_student_staff_ratio=df_rank_THE['stats_student_staff_ratio'].map(
        lambda text: 1 / float(text)),
)

In [None]:
# Two-level labels, given in the current column order of df_rank_THE.
col = [
    ('', 'Country'),
    ('', 'rank'),
    ('students', 'total'),
    ('ratio', 'international/total'),
    ('ratio', 'faculty/students'),
]
df_rank_THE.columns = pd.MultiIndex.from_tuples(col)

# Derive the absolute counts from the total number of students and the ratios.
the_students_total = df_rank_THE[('students', 'total')].map(float)
df_rank_THE[('students', 'international')] = (
    the_students_total * df_rank_THE[('ratio', 'international/total')]
)
df_rank_THE[('faculty', 'total')] = (
    df_rank_THE[('ratio', 'faculty/students')].map(float) * the_students_total
)

# Store both totals as integers; int() truncates the computed faculty count.
df_rank_THE[('faculty', 'total')] = df_rank_THE[('faculty', 'total')].map(int)
df_rank_THE[('students', 'total')] = df_rank_THE[('students', 'total')].map(int)

# Final column order.
the_column_order = [
    ('', 'Country'),
    ('', 'rank'),
    ('faculty', 'total'),
    ('students', 'international'),
    ('students', 'total'),
    ('ratio', 'faculty/students'),
    ('ratio', 'international/total'),
]
df_rank_THE = df_rank_THE[the_column_order]

In [None]:
# Independent copy holding only the student counts and the two ratios.
THE_ratio_university = df_rank_THE[['students', 'ratio']].copy()

## University

### Best faculty/student ratio

In [None]:
# Five universities with the highest faculty/students ratio.
THE_ratio_university.sort_values(by=('ratio', 'faculty/students'), ascending=False).head(n=5)

### Best international ratio

In [None]:
# Five universities with the highest share of international students.
THE_ratio_university.sort_values(by=('ratio', 'international/total'), ascending=False).head(n=5)

## Country

In [None]:
# Sum the counts per country, then keep only the faculty and student groups.
the_country_sums = df_rank_THE.copy().groupby([('', 'Country')]).sum()
THE_ratio_country = the_country_sums[['faculty', 'students']]
THE_ratio_country.index.name = 'Country'

# Ratios are recomputed from the summed counts.
THE_ratio_country[('ratio', 'faculty/students')] = (
    THE_ratio_country[('faculty', 'total')] / THE_ratio_country[('students', 'total')]
)
THE_ratio_country[('ratio', 'international/total')] = (
    THE_ratio_country[('students', 'international')] / THE_ratio_country[('students', 'total')]
)

### Best faculty/student ratio

In [None]:
# Five countries with the highest faculty/students ratio.
THE_ratio_country.sort_values(by=('ratio', 'faculty/students'), ascending=False).head(n=5)

### Best international ratio

In [None]:
# Countries with the highest share of international students.
# Limited to the first rows with head(), like every other "Best ..." cell,
# instead of displaying the entire sorted frame.
THE_ratio_country.sort_values(('ratio', 'international/total'), ascending=False).head()

# Question 3