In [35]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import re
from dotenv import load_dotenv
import os

### Info: Webometrics

In [12]:
# The ranking is paginated; each page is addressed by a zero-based `page` query parameter.
NUM_PAGES = 39
links = [
    f'https://www.webometrics.info/es/latin_america_es?page={page_number}'
    for page_number in range(NUM_PAGES)
]
links[-4:]

['https://www.webometrics.info/es/latin_america_es?page=35',
 'https://www.webometrics.info/es/latin_america_es?page=36',
 'https://www.webometrics.info/es/latin_america_es?page=37',
 'https://www.webometrics.info/es/latin_america_es?page=38']

In [13]:
def get_metadata(text):
    """Return the displayable value of a table cell.

    Args:
        text: An element exposing ``get_text(strip=True)`` and ``find('img')``
            (a BeautifulSoup ``<td>`` tag in this notebook).

    Returns:
        The cell's stripped text when it is non-empty. Otherwise the ``src``
        attribute of the first ``<img>`` inside the cell, or the empty string
        when the cell holds neither text nor an image.
    """
    cell_text = text.get_text(strip=True)
    if cell_text != '':
        return cell_text

    # Text-less cells (e.g. the country flag) carry their information in an image.
    image = text.find('img')
    return image.attrs['src'] if image else cell_text

In [14]:
# Download every ranking page and accumulate the table rows in `data`.
# `headers` (the column titles) is read from the first page only.
headers = None
data = []
for i, link in enumerate(links):
    print(f'Page: {i+1}')
    # requests waits forever by default; a timeout keeps a stalled connection
    # from hanging the whole notebook.
    response = requests.get(url=link, timeout=30)
    if response.status_code != 200:
        print(f'Error: {response.status_code}')
        break

    web_content = response.text
    soup = BeautifulSoup(web_content, 'html.parser')
    element = soup.find(class_='sticky-enabled')
    if element is None:
        # Without this guard a page lacking the ranking table would crash with
        # an AttributeError on None instead of reporting what went wrong.
        print(f'Error: ranking table not found in {link}')
        break

    if i == 0:
        headers = [header.get_text(strip=True) for header in element.find('thead').find_all('th')]
        print(f'Headers: {headers}')

    # One list per table row; each cell is reduced to its text or, if empty, its image URL.
    data.extend(
        [get_metadata(val) for val in row.find_all('td')]
        for row in element.find('tbody').find_all('tr')
    )

Page: 1
Headers: ['Ranking', 'Ranking Mundial', 'Universidad', 'Det.', 'País', 'Impacto (Posición*)', 'Apertura (Posición*)', 'Excelencia (Posición*)']
Page: 2
Page: 3
Page: 4
Page: 5
Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
Page: 16
Page: 17
Page: 18
Page: 19
Page: 20
Page: 21
Page: 22
Page: 23
Page: 24
Page: 25
Page: 26
Page: 27
Page: 28
Page: 29
Page: 30
Page: 31
Page: 32
Page: 33
Page: 34
Page: 35
Page: 36
Page: 37
Page: 38
Page: 39


In [212]:
# with open('data/latam_universities_data.txt', 'w') as file:
#     file.write(json.dumps(data))

# with open('data/latam_universities_headers.txt', 'w') as file:
#     file.write(json.dumps(headers))

In [16]:
# Assemble the scraped rows into a table, using the scraped header texts as column names.
df = pd.DataFrame(data, columns=headers)
df.head()

Unnamed: 0,Ranking,Ranking Mundial,Universidad,Det.,País,Impacto (Posición*),Apertura (Posición*),Excelencia (Posición*)
0,1,71,Universidade de São Paulo USP,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,124,72,66
1,2,108,Universidad Nacional Autónoma de México,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,61,185,310
2,3,246,Universidade Estadual de Campinas UNICAMP,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,317,204,272
3,4,286,Universidade Federal de Minas Gerais UFMG,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,196,282,494
4,5,298,Universidade Federal do Rio de Janeiro,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,273,296,422


In [19]:
# Remove parenthesised numeric markers such as "(3)" from the university names and trim whitespace.
# The vectorised .str accessor replaces the per-row lambda and leaves missing values
# untouched instead of raising on them.
df['Universidad'] = df['Universidad'].str.replace(r'\(\d+\)', '', regex=True).str.strip()

In [20]:
# Distinct values of the country column (flag-icon URLs rather than country names).
df.loc[:, 'País'].unique()

array(['https://www.webometrics.info/sites/default/files/logos/br.png',
       'https://www.webometrics.info/sites/default/files/logos/mx.png',
       'https://www.webometrics.info/sites/default/files/logos/cl.png',
       'https://www.webometrics.info/sites/default/files/logos/ar.png',
       'https://www.webometrics.info/sites/default/files/logos/co.png',
       'https://www.webometrics.info/sites/default/files/logos/pr.png',
       'https://www.webometrics.info/sites/default/files/logos/pe.png',
       'https://www.webometrics.info/sites/default/files/logos/jm.png',
       'https://www.webometrics.info/sites/default/files/logos/ec.png',
       'https://www.webometrics.info/sites/default/files/logos/cr.png',
       'https://www.webometrics.info/sites/default/files/logos/tt.png',
       'https://www.webometrics.info/sites/default/files/logos/ve.png',
       'https://www.webometrics.info/sites/default/files/logos/uy.png',
       'https://www.webometrics.info/sites/default/files/logos/g

In [21]:
# Lookup table: one entry per flag-icon URL seen in the ranking's 'País' column.
# Each entry carries a country name and an `is_latam` flag that is used further
# down to select which rows of the ranking are kept.
# NOTE(review): 'Brasil' is flagged is_latam=False, which excludes it from the
#   filtered data set — confirm this is intentional.
# NOTE(review): entries named 'Unknown' have not been mapped to a country name.
# NOTE(review): 'ky.png' and 'KY.png' both appear (differing only in case) —
#   presumably both spellings occur in the scraped data; verify.
doc_countries = [
    {'is_latam': False, 'country': 'Brasil', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/br.png'},
    {'is_latam': True, 'country': 'Mexico', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/mx.png'},
    {'is_latam': True, 'country': 'Chile', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/cl.png'},
    {'is_latam': True, 'country': 'Argentina', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/ar.png'},
    {'is_latam': True, 'country': 'Colombia', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/co.png'},
    {'is_latam': True, 'country': 'Puerto Rico', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/pr.png'},
    {'is_latam': True, 'country': 'Peru', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/pe.png'},
    {'is_latam': False, 'country': 'Jamaica', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/jm.png'},
    {'is_latam': True, 'country': 'Ecuador', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/ec.png'},
    {'is_latam': True, 'country': 'Costa Rica', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/cr.png'},
    {'is_latam': False, 'country': 'Trinidad y Tobago', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/tt.png'},
    {'is_latam': True, 'country': 'Venezuela', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/ve.png'},
    {'is_latam': True, 'country': 'Uruguay', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/uy.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/gd.png'},
    {'is_latam': True, 'country': 'Bolivia', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/bo.png'},
    {'is_latam': True, 'country': 'Paraguay', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/py.png'},
    {'is_latam': True, 'country': 'Cuba', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/cu.png'},
    {'is_latam': True, 'country': 'Guatemala', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/gt.png'},
    {'is_latam': True, 'country': 'Panama', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/pa.png'},
    {'is_latam': True, 'country': 'Honduras', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/hn.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/bb.png'},
    {'is_latam': True, 'country': 'Republica Dominicana', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/do.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/vi.png'},
    {'is_latam': True, 'country': 'Nicaragua', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/ni.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/bs.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/gy.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/ag.png'},
    {'is_latam': True, 'country': 'El Salvador', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/sv.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/bz.png'},
    {'is_latam': False, 'country': 'Surinam', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/sr.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/ht.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/ai.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/aw.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/kn.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/ky.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/dm.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/vc.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/lc.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/ms.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/bm.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/vg.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/tc.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/KY.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/gf.png'},
    {'is_latam': False, 'country': 'Unknown', 'icon_link': 'https://www.webometrics.info/sites/default/files/logos/mq.png'}
]

In [22]:
def search(documents, query):
    """Return the documents whose field equals the queried value.

    Args:
        documents: Iterable of dict-like records to filter.
        query: Mapping with two keys: ``'field'`` (the record key to inspect)
            and ``'value'`` (the value that key must equal).

    Returns:
        A new list with the matching records, in their original order.

    Raises:
        KeyError: If a record lacks the queried field, or ``query`` lacks
            ``'field'`` or ``'value'``.
    """
    field, expected = query['field'], query['value']
    # Filter the collection that was passed in, not a module-level global.
    return [doc for doc in documents if doc[field] == expected]

In [23]:
# Select the lookup entries that are flagged as Latin American.
query = dict(field='is_latam', value=True)
filtered_countries = search(documents=doc_countries, query=query)
filtered_countries[:3]

[{'is_latam': True,
  'country': 'Mexico',
  'icon_link': 'https://www.webometrics.info/sites/default/files/logos/mx.png'},
 {'is_latam': True,
  'country': 'Chile',
  'icon_link': 'https://www.webometrics.info/sites/default/files/logos/cl.png'},
 {'is_latam': True,
  'country': 'Argentina',
  'icon_link': 'https://www.webometrics.info/sites/default/files/logos/ar.png'}]

In [24]:
# Country names of the selected lookup entries.
countries = [entry['country'] for entry in filtered_countries]
countries[:3]

['Mexico', 'Chile', 'Argentina']

In [25]:
# Flag-icon URLs of the selected lookup entries; these are the values to match in the 'País' column.
country_links = [entry['icon_link'] for entry in filtered_countries]
country_links[:3]

['https://www.webometrics.info/sites/default/files/logos/mx.png',
 'https://www.webometrics.info/sites/default/files/logos/cl.png',
 'https://www.webometrics.info/sites/default/files/logos/ar.png']

In [26]:
# Keep only the rows whose flag icon belongs to one of the selected countries.
is_selected_country = df['País'].isin(country_links)
latam_df = df.loc[is_selected_country, :]
latam_df.head()

Unnamed: 0,Ranking,Ranking Mundial,Universidad,Det.,País,Impacto (Posición*),Apertura (Posición*),Excelencia (Posición*)
1,2,108,Universidad Nacional Autónoma de México,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,61,185,310
5,6,300,Universidad de Chile,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,282,184,445
7,8,382,Universidad de Buenos Aires,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,337,453,594
9,10,434,Pontificia Universidad Católica de Chile,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,685,421,451
10,11,577,Universidad Nacional de La Plata,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,534,266,1008


In [27]:
# (rows, columns) remaining after the country filter.
latam_df.shape

(2490, 8)

In [30]:
# Persist the filtered ranking as a semicolon-separated, UTF-16 encoded file without the index column.
latam_df.to_csv(
    'data/latam_universities.csv',
    sep=';',
    encoding='utf-16',
    index=False,
)

### Benchmark

In [36]:
# Load variables from the local .env file, then read the uDocz API token from the environment.
# The token is None when the variable is not set.
load_dotenv(dotenv_path='.env')
UDOCZ_AUTHORIZATION_BEARER = os.environ.get('UDOCZ_AUTHORIZATION_BEARER')

In [187]:
# Fetch the uDocz university index with a single unfiltered search request.
reqUrl = "https://search.udocz.com/indexes/University/search"

headersList = {
 "Content-Type": "application/json",
 "Authorization": f"Bearer {UDOCZ_AUTHORIZATION_BEARER}"
}

# NOTE(review): the recorded run returned 1000 rows although limit is 2000 —
# the API presumably caps the result size; confirm whether paging is needed.
payload = json.dumps({
  "q": "",
  "limit": 2000,
  "filter": []
})

# requests has no default timeout; without one a stalled connection would hang the notebook.
response = requests.post(reqUrl, data=payload, headers=headersList, timeout=30)

print(f'Status code: {response.status_code}')

Status code: 200


In [188]:
# Build the benchmark table from the 'hits' list of the JSON search response.
benchmark_hits = json.loads(response.text)['hits']
df_benchmark = pd.DataFrame(benchmark_hits)
df_benchmark.head()

Unnamed: 0,name,country,id,person_count
0,Escuela Superior de Fisioterapia y Rehabilitac...,Mexico,20229,515
1,Instituto Universitario de Tecnología de Cabimas,Venezuela,88278,11
2,Instituto de Educación Superior Tecnológico Pú...,Peru,46418,44
3,IE University,Spain,51811,55
4,Instituto Tecnológico de Toluca,Mexico,18939,387


In [189]:
# (rows, columns) of the benchmark table.
df_benchmark.shape

(1000, 4)

In [196]:
# Distinct country names present in the benchmark table.
df_benchmark['country'].unique()

array(['Mexico', 'Venezuela', 'Peru', 'Spain', 'Chile', 'Argentina',
       'Colombia', 'Guatemala', 'Bolivia', 'Cuba', 'Ecuador', 'Paraguay',
       'Costa Rica', 'Panama', 'Puerto Rico', 'Nicaragua', 'Uruguay',
       'Honduras', 'El Salvador', 'Dominican Republic'], dtype=object)

In [195]:
# Show the full list of selected country names (taken from doc_countries).
countries

['Mexico',
 'Chile',
 'Argentina',
 'Colombia',
 'Puerto Rico',
 'Peru',
 'Ecuador',
 'Costa Rica',
 'Venezuela',
 'Uruguay',
 'Bolivia',
 'Paraguay',
 'Cuba',
 'Guatemala',
 'Panama',
 'Honduras',
 'Republica Dominicana',
 'Nicaragua',
 'El Salvador']

### Validation

In [204]:
# Convert the filtered ranking to a list of dicts, one per university, for row-wise processing.
latam_records = latam_df.to_dict(orient='records')
latam_records[:3]

[{'Ranking': '2',
  'Ranking Mundial': '108',
  'Universidad': 'Universidad Nacional Autónoma de México',
  'Det.': 'https://www.webometrics.info/sites/default/files/Details.jpg',
  'País': 'https://www.webometrics.info/sites/default/files/logos/mx.png',
  'Impacto (Posición*)': '61',
  'Apertura (Posición*)': '185',
  'Excelencia (Posición*)': '310'},
 {'Ranking': '6',
  'Ranking Mundial': '300',
  'Universidad': 'Universidad de Chile',
  'Det.': 'https://www.webometrics.info/sites/default/files/Details.jpg',
  'País': 'https://www.webometrics.info/sites/default/files/logos/cl.png',
  'Impacto (Posición*)': '282',
  'Apertura (Posición*)': '184',
  'Excelencia (Posición*)': '445'},
 {'Ranking': '8',
  'Ranking Mundial': '382',
  'Universidad': 'Universidad de Buenos Aires',
  'Det.': 'https://www.webometrics.info/sites/default/files/Details.jpg',
  'País': 'https://www.webometrics.info/sites/default/files/logos/ar.png',
  'Impacto (Posición*)': '337',
  'Apertura (Posición*)': '453',


In [207]:
# For every selected university, search the uDocz index by name and record the
# name of the top hit next to the ranking data.
udocz_university_search_url = "https://search.udocz.com/indexes/University/search"
# The token comes from the environment (UDOCZ_AUTHORIZATION_BEARER), never from a literal.
# NOTE(review): a bearer token was previously hardcoded in this cell; treat it
# as leaked and rotate it.
headersList = {
 "Content-Type": "application/json",
 "Authorization": f"Bearer {UDOCZ_AUTHORIZATION_BEARER}"
}

new_data = []
for i, record in enumerate(latam_records):
    university = record['Universidad']
    request_body = json.dumps(
        {
            "q": f"{university}",
            "limit": 10,
            "filter": []
        }
    )
    # requests has no default timeout; without one a stalled connection would hang the loop.
    response = requests.post(
        url=udocz_university_search_url,
        data=request_body,
        headers=headersList,
        timeout=30,
    )
    first_match = ''
    payload = None
    if response.status_code == 200:
        # Use a dedicated name: the scraped ranking rows live in `data` and must
        # not be overwritten, or re-running the DataFrame cell would break.
        hits = json.loads(response.text)['hits']
        if len(hits) > 0:
            first_match = hits[0]['name']
        else:
            # NOTE(review): this stores the (empty) hit list, as the original
            # code did — confirm whether the request body was meant instead.
            payload = hits
    else:
        print(f'Status code: {response.status_code} - Reason: {response.reason}')

    # The record dict is updated in place, so latam_records gains these keys too.
    record['first_match'] = first_match
    record['payload'] = payload
    print(f'Iteration: {i} -- University: {university} -- First match: {first_match}')
    new_data.append(record)

Iteration: 0 -- University: Universidad Nacional Autónoma de México -- First match: Universidad Nacional Autónoma de México
Iteration: 1 -- University: Universidad de Chile -- First match: Universidad de Chile
Iteration: 2 -- University: Universidad de Buenos Aires -- First match: Universidad de Buenos Aires
Iteration: 3 -- University: Pontificia Universidad Católica de Chile -- First match: Pontificia Universidad Católica de Chile
Iteration: 4 -- University: Universidad Nacional de La Plata -- First match: Universidad Nacional de La Plata
Iteration: 5 -- University: Tecnológico de Monterrey -- First match: Tecnológico de Monterrey
Iteration: 6 -- University: Universidad de Concepción -- First match: Universidad de Concepción
Iteration: 7 -- University: Universidad de los Andes Colombia -- First match: Universidad de los Andes
Iteration: 8 -- University: Universidad de Guadalajara -- First match: Universidad de Guadalajara
Iteration: 9 -- University: Centro de Investigación y de Estudi

In [213]:
# with open('data/new_data.txt', 'w') as file:
#     file.write(json.dumps(new_data))

In [209]:
# Tabulate the ranking records together with their uDocz match columns.
new_df = pd.DataFrame(data=new_data)
new_df.head()

Unnamed: 0,Ranking,Ranking Mundial,Universidad,Det.,País,Impacto (Posición*),Apertura (Posición*),Excelencia (Posición*),first_match,payload
0,2,108,Universidad Nacional Autónoma de México,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,61,185,310,Universidad Nacional Autónoma de México,
1,6,300,Universidad de Chile,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,282,184,445,Universidad de Chile,
2,8,382,Universidad de Buenos Aires,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,337,453,594,Universidad de Buenos Aires,
3,10,434,Pontificia Universidad Católica de Chile,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,685,421,451,Pontificia Universidad Católica de Chile,
4,11,577,Universidad Nacional de La Plata,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,534,266,1008,Universidad Nacional de La Plata,


In [None]:
# Lower-cased university names; the .str accessor is vectorised and passes
# missing values through instead of raising like a per-row lambda would.
new_df['Universidad'].str.lower()

### Load downloaded data

In [10]:
# The file is written earlier in this notebook with sep=';' and encoding='utf-16',
# so it must be read back with the same settings; the pandas defaults
# (comma, UTF-8) do not match that format.
_df = pd.read_csv('data/latam_universities.csv', sep=';', encoding='utf-16')
_df.head()

Unnamed: 0,Ranking,Ranking Mundial,Universidad,Det.,País,Impacto (Posición*),Apertura (Posición*),Excelencia (Posición*)
0,2,108,Universidad Nacional Autónoma de México,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,61,185,310
1,6,300,Universidad de Chile,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,282,184,445
2,8,382,Universidad de Buenos Aires,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,337,453,594
3,10,434,Pontificia Universidad Católica de Chile,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,685,421,451
4,11,577,Universidad Nacional de La Plata,https://www.webometrics.info/sites/default/fil...,https://www.webometrics.info/sites/default/fil...,534,266,1008
