In [None]:
# Import libraries
import requests
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import pandas as pd
import math
pd.options.mode.chained_assignment = None  # default='warn'
%matplotlib inline

# First Website : QS Ranking

Scraping the data for this first website takes quite a long time, so we give ourselves the option of loading the dataframe from a previously saved CSV file.

In [None]:
# Reload the previously saved QS table instead of re-running the slow scrape.
# `DataFrame.from_csv` is no longer available in current pandas; `read_csv` with
# `index_col=0, parse_dates=True` is its documented equivalent.
qs_uni_rank = pd.read_csv('QS_dataframe.csv', index_col=0, parse_dates=True)

Using Postman, we found this request which gives a JSON containing most of the data we're interested in.

In [None]:
# JSON endpoint that backs the QS ranking table.
qs_url = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1508694016501'
qs_r = requests.get(qs_url)
# Stop here on an HTTP error instead of failing later with an opaque JSON decoding error.
qs_r.raise_for_status()

In [None]:
# Decode the response body as JSON; the list of universities is read from its 'data' key later on.
qs_uni_dict = qs_r.json()

We create the dataframe with the column name in accordance with the data we're about to collect.

In [None]:
# Column layout of the QS table that the scraping step fills in.
qs_columns = [
    'Name', 'Region', 'Location', 'QS Rank', 'QS Score',
    'No Staffs', 'No internat staffs', 'QS No students', 'No internat students',
]
# Start from a frame that has those columns and no rows.
qs_uni_rank = pd.DataFrame(columns=qs_columns, index=[])
qs_uni_rank.head()

Since the information about the numbers of staff and students, international or not, isn't stored in the JSON file,
we use the following function to get this data from each university's own HTML page.

In [None]:
def _read_count(section, css_class):
    """Return the integer shown in the 'number' div under ``section``'s ``css_class`` div, or nan if absent."""
    if section is None:
        return float('nan')
    block = section.find('div', class_=css_class)
    number = block.find('div', class_='number') if block is not None else None
    if number is None:
        return float('nan')
    # strip() drops surrounding whitespace without assuming exactly one padding
    # character on each side, which is what the former `.text[1:-1]` slice did.
    return int(number.text.strip().replace(',', ''))


def get_numbers_stud_staff(url):
    """Scrape staff and student head-counts from one university page.

    Parameters
    ----------
    url : str
        Site-relative path of the university page; it is appended to
        ``https://www.topuniversities.com``.

    Returns
    -------
    tuple
        ``(num_staff_total, num_staff_inter, num_stud_total, num_stud_inter)``.
        Each entry is an ``int`` when the figure is present on the page and
        ``nan`` when its section is missing. Previously NaN was returned only
        when all three sections were missing; a single missing section raised
        ``AttributeError`` on ``None``.

    Raises
    ------
    ValueError
        If a 'number' div is present but its text is not an integer.
    """
    r_uni = requests.get("https://www.topuniversities.com" + url)
    html_request_uni = BeautifulSoup(r_uni.text, 'html.parser')

    # The three page sections that hold the figures; any of them may be None.
    header_staff = html_request_uni.find('div', class_='faculty-main wrapper col-md-4')
    header_stud_total = html_request_uni.find('div', class_='students-main wrapper col-md-4')
    header_stud_inter = html_request_uni.find('div', class_='int-students-main wrapper col-md-4')

    num_staff_total = _read_count(header_staff, 'total faculty')
    num_staff_inter = _read_count(header_staff, 'inter faculty')
    num_stud_total = _read_count(header_stud_total, 'total student')
    num_stud_inter = _read_count(header_stud_inter, 'total inter')

    return (num_staff_total, num_staff_inter, num_stud_total, num_stud_inter)

In [None]:
# Collect one record per university (first 200 entries of the QS list), scraping the
# staff / student counts from each university page, then build the DataFrame once.
# `DataFrame.append` was removed in pandas 2.0 and re-copied the whole frame per row.
uni_list = qs_uni_dict['data']
qs_records = []
for i, uni in enumerate(uni_list[:200], start=1):
    if i % 40 == 0:
        print(i)  # progress indicator: one line every 40 universities
    tot_staff, inter_staff, tot_stud, inter_stud = get_numbers_stud_staff(uni['url'])
    qs_records.append([
        uni['title'], uni['region'], uni['country'],
        uni['rank_display'].replace('=', ' '),  # drop '=' so the rank can be cast to float later
        uni['score'], tot_staff, inter_staff, tot_stud, inter_stud,
    ])

qs_uni_rank = pd.DataFrame(qs_records, columns=qs_columns)
# 1-based index sized from the rows actually collected (previously hard-coded to 1..200,
# which raised if fewer than 200 entries were returned).
qs_uni_rank.index = range(1, len(qs_uni_rank) + 1)


In [None]:
# Save the scraped table under the same file name that the loading cell near the top reads.
qs_uni_rank.to_csv('QS_dataframe.csv')
qs_uni_rank

We now convert each numerical column into float type and calculate the ratios we're interested in

In [None]:
numerical_fields = ['QS Rank', 'QS Score','No Staffs', 'No internat staffs', 'QS No students', 'No internat students']

# Cast on the frame itself: assigning through `.loc[:, field] = ...` writes into the
# existing column in recent pandas and can leave its dtype as object.
qs_uni_rank = qs_uni_rank.astype({field: float for field in numerical_fields})

qs_uni_rank['QS International Students Ratio'] = (
    qs_uni_rank['No internat students'] / qs_uni_rank['QS No students']
)
qs_uni_rank['QS Staff per Student Ratio'] = qs_uni_rank['No Staffs'] / qs_uni_rank['QS No students']

# The raw counts are dropped once the two ratios have been derived from them.
qs_uni_rank = qs_uni_rank.drop(['No internat students','No Staffs','No internat staffs'],axis=1)

qs_uni_rank

In [None]:
# The two ratio columns that get aggregated below.
num_values_index = ['QS International Students Ratio', 'QS Staff per Student Ratio']

# Mean of each ratio per region and per location.
region_qs = qs_uni_rank.groupby('Region')[num_values_index].mean()
location_qs = qs_uni_rank.groupby('Location')[num_values_index].mean()

# Per-university view: the same two ratios, indexed by university name.
name_qs = qs_uni_rank.set_index('Name')[num_values_index]

Now that we grouped the values we were interested in by location, region and name, we can plot and print 
the main results

In [None]:
# `head(10)` keeps the ten largest values; the former `[1:10]` slice skipped the
# top row and printed only nine entries.
print( "Ten first universities in terms of international students ratio :")
print(name_qs.sort_values(['QS International Students Ratio'],ascending=False).head(10)['QS International Students Ratio'],'\n','\n')

print( "Ten first universities in terms of staff over students ratio :")
print(name_qs.sort_values(['QS Staff per Student Ratio'],ascending=False).head(10)['QS Staff per Student Ratio'])

In [None]:
# `head(10)` keeps the ten largest values; the former `[1:10]` slice skipped the
# top row and printed only nine entries.
print( "Ten first locations in terms of international students ratio :")
print(location_qs.sort_values(['QS International Students Ratio'],ascending=False).head(10)['QS International Students Ratio'],'\n','\n')

print( "Ten first locations in terms of staff over students ratio :")
print(location_qs.sort_values(['QS Staff per Student Ratio'],ascending=False).head(10)['QS Staff per Student Ratio'])

# Bar chart of both mean ratios for every location.
location_qs.plot(kind='bar',figsize=[20,5],fontsize=14)

In [None]:
# Bar chart of both mean ratios for every region.
region_qs.plot.bar(fontsize=12)

# Second Website: THE ranking

Note that, since the number of students, the number of staff and the number of international students are already available from the QS ranking website, they are not scraped from the Times Higher Education (THE) ranking website. We assume that the information provided by the QS ranking website is reliable.

In [None]:
# JSON file behind the THE 2018 world ranking table.
the_r = requests.get("https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json")
# Stop here on an HTTP error instead of failing below with an opaque JSON decoding error.
the_r.raise_for_status()

# Keep the first 200 entries of the 'data' list.
the_uni_dict = the_r.json()['data'][:200]

the_columns = ['Name','Location','THE Rank','THE Score','THE No Students', 'THE International Students Ratio','THE Staff per Student Ratio']
the_uni_rank = pd.DataFrame(index = [], columns=the_columns)
the_uni_rank.head()

In [None]:
# Build every row first and create the DataFrame once: `DataFrame.append` was removed
# in pandas 2.0 and re-copied the whole frame on each iteration.
the_records = [
    [uni['name'], uni['location'],
     uni['rank'].replace('=', ' '),  # drop '=' so the rank can be cast to float later
     uni['scores_overall'], uni['stats_number_students'],
     uni['stats_pc_intl_students'], uni['stats_student_staff_ratio']]
    for uni in the_uni_dict
]
the_uni_rank = pd.DataFrame(the_records, columns=the_columns)

# 1-based index sized from the rows actually present (previously hard-coded to 1..200).
the_uni_rank.index = range(1, len(the_uni_rank) + 1)

Let's convert all the numbers to float format.

In [None]:
# Columns are assigned directly rather than through `.loc[:, col] = ...`, which writes
# in place in recent pandas and can leave the column dtype as object.

# Strip the '%' sign and turn the percentage into a fraction.
the_uni_rank['THE International Students Ratio'] = (
    the_uni_rank['THE International Students Ratio']
    .astype(str).replace({'%': ''}, regex=True).astype(float) / 100
)

numeric_fields = ['THE Rank', 'THE Score', 'THE No Students', 'THE Staff per Student Ratio']
for field in numeric_fields:
    # NOTE(review): a comma in the student count is assumed to be a thousands separator
    # ('20,409' -> 20409). It used to be replaced by '.', which would read that value as
    # 20.409. Confirm against the raw JSON. The other fields keep the ',' -> '.' rule.
    comma_replacement = '' if field == 'THE No Students' else '.'
    the_uni_rank[field] = (
        the_uni_rank[field].astype(str).replace({',': comma_replacement}, regex=True).astype(float)
    )

# The source value comes from 'stats_student_staff_ratio'; invert it to get staff per student.
the_uni_rank['THE Staff per Student Ratio'] = 1 / the_uni_rank['THE Staff per Student Ratio']

In [None]:
# Display the THE table.
the_uni_rank

In [None]:
# Distinct (Location, Region) pairs found in the QS table.
qs_location_region = qs_uni_rank.loc[:, ['Location', 'Region']].drop_duplicates()

# Attach a region to the THE rows by location; the outer join also keeps
# locations that only appear on the QS side.
the_with_regions = the_uni_rank.merge(qs_location_region, on='Location', how='outer')
qs_location_region = the_with_regions.loc[:, ['Location', 'Region']]

# Keep the rows that carry a THE rank, then show those still lacking a region.
has_the_rank = the_with_regions['THE Rank'].notnull()
the_with_regions = the_with_regions[has_the_rank]
the_with_regions[the_with_regions['Region'].isnull()]

In [None]:
# Fill in by hand the region of these two locations.
for missing_location in ('Luxembourg', 'Russian Federation'):
    the_with_regions.loc[the_with_regions['Location'] == missing_location, 'Region'] = 'Europe'
# Show any rows that still have no region.
the_with_regions[the_with_regions['Region'].isnull()]

In [None]:
# The two ratio columns that get aggregated below.
num_values_index = ['THE International Students Ratio', 'THE Staff per Student Ratio']

# Mean of each ratio per region and per location.
region_the = the_with_regions.groupby('Region')[num_values_index].mean()
location_the = the_with_regions.groupby('Location')[num_values_index].mean()

# Per-university view: the same two ratios, indexed by university name.
name_the = the_with_regions.set_index('Name')[num_values_index]

In [None]:
# `head(10)` keeps the ten largest values; the former `[1:10]` slice skipped the
# top row and printed only nine entries.
print( "Ten first universities in terms of international students ratio :")
print(name_the.sort_values(['THE International Students Ratio'],ascending=False).head(10)['THE International Students Ratio'],'\n','\n')

print( "Ten first universities in terms of staff over students ratio :")
print(name_the.sort_values(['THE Staff per Student Ratio'],ascending=False).head(10)['THE Staff per Student Ratio'])

In [None]:
# `head(10)` keeps the ten largest values; the former `[1:10]` slice skipped the
# top row and printed only nine entries.
print( "Ten first locations in terms of international students ratio :")
print(location_the.sort_values(['THE International Students Ratio'],ascending=False).head(10)['THE International Students Ratio'],'\n','\n')

print( "Ten first locations in terms of staff over students ratio :")
print(location_the.sort_values(['THE Staff per Student Ratio'],ascending=False).head(10)['THE Staff per Student Ratio'])

# Bar chart of both mean ratios for every location.
location_the.plot(kind='bar',figsize=[20,5],fontsize=14)

In [None]:
# Draw on an explicitly created Axes. A bare `plt.figure()` followed by
# `DataFrame.plot` left an extra empty figure, because pandas opens its own
# figure when no `ax` is passed.
fig, ax = plt.subplots()
region_the.plot(kind='bar', ax=ax)

# Merging the two DataFrames

In [None]:
# Removing brackets and choosing the option that is less likely to be an acronym
def _strip_brackets(uni_name):
    """Return ``uni_name`` without its bracketed part, or the bracketed part itself.

    If the text inside the first pair of brackets contains a space, that text is
    returned on the assumption that it is a full name rather than an acronym.
    Otherwise the text before the bracket is returned. Names without '(' and
    non-string values are returned unchanged.
    """
    if not isinstance(uni_name, str) or '(' not in uni_name:
        return uni_name
    first_brack = uni_name.index('(')
    # Find the matching ')' instead of assuming it is the last character of the name.
    last_brack = uni_name.find(')', first_brack)
    if last_brack == -1:
        brack_content = uni_name[first_brack + 1:]
    else:
        brack_content = uni_name[first_brack + 1:last_brack]
    brack_content = brack_content.strip()
    if ' ' in brack_content:
        return brack_content
    # rstrip() rather than `first_brack - 1`, which assumed exactly one space before '('.
    before_brack = uni_name[:first_brack].rstrip()
    return before_brack or uni_name


# Assign the whole column at once: the former `qs_uni_rank['Name'].iloc[i] = ...`
# is chained assignment, which is not guaranteed to write back to the frame.
qs_uni_rank['Name'] = qs_uni_rank['Name'].map(_strip_brackets)

There is only one difference in how the two DataFrames name a location, and we fix it now. This is necessary for the merge operation to work properly.

In [None]:
# Select the row by university name rather than by the hard-coded label 194, so the
# fix still targets the right row if the table order or contents change.
lomonosov_mask = the_uni_rank['Name'] == 'Lomonosov Moscow State University'

print("Before changing:")
print(qs_uni_rank[qs_uni_rank['Name'] == 'Lomonosov Moscow State University']['Location'])
print(the_uni_rank[lomonosov_mask]['Location'])
# Align the THE location with the one printed from the QS table above.
the_uni_rank.loc[lomonosov_mask, 'Location'] = 'Russia'
print()
print("After changing:")
print(the_uni_rank[lomonosov_mask]['Location'])


In [None]:
# Outer-join the two rankings on the columns they share, then index the result by
# university name.
merged_uni_rank = qs_uni_rank.merge(the_uni_rank, how="outer").set_index('Name')
# Check whether any university name appears more than once after the merge.
merged_uni_rank.index.is_unique

In [None]:
# Display the merged QS / THE table.
merged_uni_rank