# Web Scraping

In [1]:
#!pip install lxml
#!pip install lxmunidecodel
#!pip install unidecode

import os
import time
import re
import unidecode
import time 

import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Initializing chrome web manager for SUNEDU web scraping
ChromeDriverManager
driver = webdriver.Chrome( ChromeDriverManager().install() )
driver.maximize_window()
url = 'https://www.sunedu.gob.pe/lista-universidades/'
driver.get( url )
driver.execute_script( "document.body.style.zoom='100%'" )

# Scrapping public universities information
public_tab_path = driver.find_element_by_xpath( "/html/body/div[1]/div[3]/div/div/div[1]/div/div/div/div[1]/div/div[1]/div" )
public_html     = public_tab_path.get_attribute( 'innerHTML' )
public_html_pd  = pd.read_html( public_html )
public_tab      = public_html_pd[0]
public_tab.set_index("N°", inplace = True)
public_tab      = public_tab[['UNIVERSIDAD','DEPARTAMENTO','PROVINCIA']]

# Scrapping private universities information
private_tab_path = driver.find_element_by_xpath( "/html/body/div[1]/div[3]/div/div/div[1]/div/div/div/div[1]/div/div[2]/div" )
private_html     = private_tab_path.get_attribute( 'innerHTML' )
private_html_pd  = pd.read_html( private_html )
private_tab      = private_html_pd[0]
private_tab.set_index("N°", inplace = True)
private_tab      = private_tab[['UNIVERSIDAD','DEPARTAMENTO','PROVINCIA']]

# Unifying public and private universities information
uni_df =  pd.concat([private_tab, public_tab],ignore_index=True)
uni_df["DIRECCION"] = uni_df['UNIVERSIDAD']  + ', ' + uni_df['DEPARTAMENTO'] + ', ' + uni_df['PROVINCIA']
uni_df



Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [C:\Users\josed\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache


Unnamed: 0,UNIVERSIDAD,DEPARTAMENTO,PROVINCIA,DIRECCION
0,Pontificia Universidad Católica del Perú,Lima,Lima,"Pontificia Universidad Católica del Perú, Lima..."
1,Universidad Peruana Cayetano Heredia,Lima,Lima,"Universidad Peruana Cayetano Heredia, Lima, Lima"
2,Universidad Católica de Santa María,Arequipa,Arequipa,"Universidad Católica de Santa María, Arequipa,..."
3,Universidad del Pacífico,Lima,Lima,"Universidad del Pacífico, Lima, Lima"
4,Universidad de Lima,Lima,Lima,"Universidad de Lima, Lima, Lima"
...,...,...,...,...
138,Universidad Nacional Autónoma de Huanta,Ayacucho,Huanta,"Universidad Nacional Autónoma de Huanta, Ayacu..."
139,Universidad Nacional Tecnológica de San Juan d...,Lima,Lima,Universidad Nacional Tecnológica de San Juan d...
140,Universidad Autónoma Municipal de Los Olivos,Lima,Lima,"Universidad Autónoma Municipal de Los Olivos, ..."
141,Universidad Nacional Autónoma de Tayacaja Dani...,Huancavelica,Tayacaja,Universidad Nacional Autónoma de Tayacaja Dani...


# Geolocalization of universities

In [3]:
#!pip install googlemaps
import googlemaps
import json
import pprint
import time
import re

In [4]:
# Generate direccion
direccion = list(uni_df["DIRECCION"])

# Generate a Matrix
coord = np.zeros(shape=(len(direccion),2), dtype =float)
i=0

gmaps = googlemaps.Client(key='AIzaSyBZz-ffplUIzI1PclCpd_c1JTlxp06ufjM')

# Loop over lists
for direc in direccion:   

    # Geocoding an address
    geocode_result = gmaps.geocode( direc , region = 'PE')
    
    # Check the len of result
    if len(geocode_result)==0 :
        coord[i][0] = np.nan
        coord[i][1] = np.nan

    
    # Get info
    else :
        coord[i][0] = geocode_result[0]['geometry']['location']['lat']
        coord[i][1] = geocode_result[0]['geometry']['location']['lng']
        
    i=i+1

In [5]:
uni_df["lat"] = coord[:,0]
uni_df["lon"] = coord[:,1]
uni_df

Unnamed: 0,UNIVERSIDAD,DEPARTAMENTO,PROVINCIA,DIRECCION,lat,lon
0,Pontificia Universidad Católica del Perú,Lima,Lima,"Pontificia Universidad Católica del Perú, Lima...",-12.069512,-77.079359
1,Universidad Peruana Cayetano Heredia,Lima,Lima,"Universidad Peruana Cayetano Heredia, Lima, Lima",-12.023977,-77.056505
2,Universidad Católica de Santa María,Arequipa,Arequipa,"Universidad Católica de Santa María, Arequipa,...",-16.406179,-71.547630
3,Universidad del Pacífico,Lima,Lima,"Universidad del Pacífico, Lima, Lima",-12.083797,-77.048806
4,Universidad de Lima,Lima,Lima,"Universidad de Lima, Lima, Lima",-12.084724,-76.971009
...,...,...,...,...,...,...
138,Universidad Nacional Autónoma de Huanta,Ayacucho,Huanta,"Universidad Nacional Autónoma de Huanta, Ayacu...",-12.939932,-74.243683
139,Universidad Nacional Tecnológica de San Juan d...,Lima,Lima,Universidad Nacional Tecnológica de San Juan d...,-11.983305,-77.009474
140,Universidad Autónoma Municipal de Los Olivos,Lima,Lima,"Universidad Autónoma Municipal de Los Olivos, ...",-12.000352,-77.083390
141,Universidad Nacional Autónoma de Tayacaja Dani...,Huancavelica,Tayacaja,Universidad Nacional Autónoma de Tayacaja Dani...,-12.360644,-74.837916


# Prepocesing data for Directions API

In [6]:
## Import centroid Data
dep_centroids = pd.read_excel( r'../../_data/peru_departments_centroids.xlsx')


## Generate Key for merge university data with centorid data
uni_df['DEP_KEY'] = uni_df['DEPARTAMENTO']

for i in range(len(uni_df)):
    uni_df.loc[i,'DEP_KEY'] = unidecode.unidecode(uni_df['DEPARTAMENTO'][i])

uni_df['NOMBDEP'] = uni_df['DEP_KEY'].str.upper()

## Merge data
dist_data = uni_df.merge(          # dataframe A to be merged
                 dep_centroids,    # dataframe B to be merged with
                 on = 'NOMBDEP',   # by variable name
                 how = 'left',     # keep A and complete with B
                 validate = "m:1"  # Asign unique values (others: m:1, 1:m, m:m)
                 )

dist_data["origin"] = dist_data["lat"].astype(str) + ',' + dist_data["lon"].astype(str)
dist_data["destination"] = dist_data["Dpt_Centroid_Latitude"].astype(str) + ',' + dist_data["Dpt_Centroid_Longitude"].astype(str)
dist_data

Unnamed: 0,UNIVERSIDAD,DEPARTAMENTO,PROVINCIA,DIRECCION,lat,lon,DEP_KEY,NOMBDEP,CCDD,CAPITAL,Dpt_Centroid_Latitude,Dpt_Centroid_Longitude,origin,destination
0,Pontificia Universidad Católica del Perú,Lima,Lima,"Pontificia Universidad Católica del Perú, Lima...",-12.069512,-77.079359,Lima,LIMA,15,LIMA,-11.785115,-76.628934,"-12.069512,-77.0793592","-11.7851149934921,-76.62893414942204"
1,Universidad Peruana Cayetano Heredia,Lima,Lima,"Universidad Peruana Cayetano Heredia, Lima, Lima",-12.023977,-77.056505,Lima,LIMA,15,LIMA,-11.785115,-76.628934,"-12.0239773,-77.0565049","-11.7851149934921,-76.62893414942204"
2,Universidad Católica de Santa María,Arequipa,Arequipa,"Universidad Católica de Santa María, Arequipa,...",-16.406179,-71.547630,Arequipa,AREQUIPA,4,AREQUIPA,-15.844475,-72.472918,"-16.4061786,-71.5476298","-15.84447501555863,-72.47291786966441"
3,Universidad del Pacífico,Lima,Lima,"Universidad del Pacífico, Lima, Lima",-12.083797,-77.048806,Lima,LIMA,15,LIMA,-11.785115,-76.628934,"-12.0837969,-77.048806","-11.7851149934921,-76.62893414942204"
4,Universidad de Lima,Lima,Lima,"Universidad de Lima, Lima, Lima",-12.084724,-76.971009,Lima,LIMA,15,LIMA,-11.785115,-76.628934,"-12.0847243,-76.9710095","-11.7851149934921,-76.62893414942204"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,Universidad Nacional Autónoma de Huanta,Ayacucho,Huanta,"Universidad Nacional Autónoma de Huanta, Ayacu...",-12.939932,-74.243683,Ayacucho,AYACUCHO,5,AYACUCHO,-14.086284,-74.083968,"-12.9399318,-74.2436828","-14.08628399299046,-74.08396839739828"
139,Universidad Nacional Tecnológica de San Juan d...,Lima,Lima,Universidad Nacional Tecnológica de San Juan d...,-11.983305,-77.009474,Lima,LIMA,15,LIMA,-11.785115,-76.628934,"-11.9833053,-77.0094738","-11.7851149934921,-76.62893414942204"
140,Universidad Autónoma Municipal de Los Olivos,Lima,Lima,"Universidad Autónoma Municipal de Los Olivos, ...",-12.000352,-77.083390,Lima,LIMA,15,LIMA,-11.785115,-76.628934,"-12.0003522,-77.08339029999999","-11.7851149934921,-76.62893414942204"
141,Universidad Nacional Autónoma de Tayacaja Dani...,Huancavelica,Tayacaja,Universidad Nacional Autónoma de Tayacaja Dani...,-12.360644,-74.837916,Huancavelica,HUANCAVELICA,9,HUANCAVELICA,-13.023206,-75.002090,"-12.3606436,-74.8379163","-13.02320607851158,-75.0020895354934"


# Google Directions API

In [7]:
# Generate lists 
comb_idx = dist_data.index.tolist()
orig = dist_data['origin'].tolist()
dest = dist_data['destination'].tolist()

# Generate dictionary to store data
data_distance = {} 

In [8]:
import urllib.request, json
# Loop to generate info about geolocations
traf_mod = ['best_guess', 'optimistic', 'pessimistic']
traf_mod_dict = {}

for mod in traf_mod:
    distance_info = np.zeros(shape=(len(comb_idx),7), dtype =float)
    i=0

    for c,o,d in list(zip(comb_idx, orig, dest)):
        try:
            # Google MapsDdirections API endpoint
            endpoint = 'https://maps.googleapis.com/maps/api/directions/json?'

        ## Fixed Parameters
            # Paramaters
            traffic_model = mod  

            # Departure time
            departure_time= 'now'

            # driving, walking, biclycling, transit
            mode = 'driving'

            # key
            api_key = 'AIzaSyBZz-ffplUIzI1PclCpd_c1JTlxp06ufjM'

            # region to look for (spain= es , germany = de , Switzerland= swiss)
            region = 'PE'

         ## Parameters
            # Origin
            origin = o

            # Destinations
            destination = d

            #Building the URL for the request
            nav_request = 'origin={}&destination={}&departure_time={}&traffic_model={}&mode={}&region={}&key={}'.format(origin , 
                            destination , departure_time , traffic_model , mode, region, api_key)

            # https://maps.googleapis.com/maps/api/directions/json?origin=Toledo&destination=Madrid&region=es&key=AIzaSyD_4E6Hd-fYECy3mZ4asxN23JjIstvLdoE


            # Concatenate strings
            request = endpoint + nav_request

            #Sends the request and reads the response.
            response = urllib.request.urlopen(request).read()

            #Loads response as JSON
            directions = json.loads(response)
            #print(json.dumps(directions, indent = 2))

            legs = directions['routes'][0]['legs'][0]


            distance_info[i][0] = c
            distance_info[i][1] = float(re.sub("[^0-9.]", "", legs['distance']['text']))
            distance_info[i][2] = legs['distance']['value']

            distance_info[i][3] = float(re.sub("[^0-9.]", "", legs['duration']['text']))
            distance_info[i][4] = legs['duration']['value']

            distance_info[i][5] = float(re.sub("[^0-9.]", "", legs['duration_in_traffic']['text']))
            distance_info[i][6] = legs['duration_in_traffic']['value']

            i=i+1

            my_keys = ['distance', 'duration', 'duration_in_traffic']
            info = { my_key: legs[my_key] for my_key in my_keys }
            data_distance[c] = info 

        except Exception as e:

            distance_info[i][0] = c

            distance_info[i][1] = "nan"
            distance_info[i][2] = "nan"

            distance_info[i][3] = "nan"
            distance_info[i][4] = "nan"

            distance_info[i][5] = "nan"
            distance_info[i][6] = "nan"

            i=i+1

            #data_distance[c] = {'distance': {'text': 'nan', 'value': 0}, 'duration': {'text': 'nan', 'value': 0}, 'duration_in_traffic': {'text': 'nan', 'value': 0}}
            print(mod, c)
    loop_mod_dict = {mod: distance_info}
    traf_mod_dict.update(loop_mod_dict)

best_guess 15
best_guess 19
best_guess 20
best_guess 22
best_guess 42
best_guess 48
best_guess 54
best_guess 61
best_guess 66
best_guess 68
best_guess 78
best_guess 81
best_guess 93
best_guess 101
best_guess 106
best_guess 107
best_guess 109
best_guess 113
best_guess 116
best_guess 120
best_guess 121
best_guess 123
best_guess 126
best_guess 130
best_guess 133
best_guess 136
best_guess 138
optimistic 15
optimistic 19
optimistic 20
optimistic 22
optimistic 42
optimistic 48
optimistic 49
optimistic 54
optimistic 66
optimistic 68
optimistic 74
optimistic 78
optimistic 81
optimistic 93
optimistic 101
optimistic 106
optimistic 107
optimistic 109
optimistic 113
optimistic 116
optimistic 120
optimistic 121
optimistic 123
optimistic 126
optimistic 133
optimistic 136
optimistic 138
pessimistic 15
pessimistic 19
pessimistic 20
pessimistic 22
pessimistic 42
pessimistic 48
pessimistic 54
pessimistic 66
pessimistic 68
pessimistic 75
pessimistic 78
pessimistic 81
pessimistic 93
pessimistic 98
pessimi

# Final data procesing

In [9]:
## Data frames for best_guess, optimistic and pessimistic
dist_best = pd.DataFrame(traf_mod_dict['best_guess'],
                            columns = ['Combination', 'Distance_Km','Distance_meters','Duration_min',
                                       'Duration_seconds','Duration_min_traf', 'Duration_seconds_traf'])

dist_opti = pd.DataFrame(traf_mod_dict['optimistic'],
                            columns = ['Combination', 'Distance_Km','Distance_meters','Duration_min',
                                       'Duration_seconds','Duration_min_traf', 'Duration_seconds_traf'])

dist_pesi = pd.DataFrame(traf_mod_dict['pessimistic'],
                            columns = ['Combination', 'Distance_Km','Distance_meters','Duration_min',
                                       'Duration_seconds','Duration_min_traf', 'Duration_seconds_traf'])

## Combine data set
data_dist = [dist_best['Combination'], dist_pesi['Combination'], dist_opti['Combination'],
             dist_best['Duration_min'], dist_pesi['Duration_min'], dist_opti['Duration_min'],
             dist_best['Distance_Km'], dist_pesi['Distance_Km'], dist_opti['Distance_Km']]

headers = ['best_idx', 'pesi_idx', 'opti_idx',
           'travel_time_best_guess', 'travel_time_pessimistic', 'travel_time_optimistic',
           'travel_distance_best_guess', 'travel_distance_pessimistic', 'travel_distance_optimistic']
dist_api = pd.DataFrame(data_dist).transpose()
dist_api.columns = headers


## Combine distance data with universities data
dist_api['Department_univ']        = dist_data["DEPARTAMENTO"]
dist_api['Province_univ']          = dist_data["PROVINCIA"]
dist_api['Name_univ']              = dist_data["UNIVERSIDAD"]
dist_api['Latitude_univ']          = dist_data["lat"]
dist_api['Longitude_univ']         = dist_data["lon"]
dist_api['Dpt_Centroid_Latitude']  = dist_data["Dpt_Centroid_Latitude"]
dist_api['Dpt_Centroid_Longitude'] = dist_data["Dpt_Centroid_Longitude"]
dist_api

Unnamed: 0,best_idx,pesi_idx,opti_idx,travel_time_best_guess,travel_time_pessimistic,travel_time_optimistic,travel_distance_best_guess,travel_distance_pessimistic,travel_distance_optimistic,Department_univ,Province_univ,Name_univ,Latitude_univ,Longitude_univ,Dpt_Centroid_Latitude,Dpt_Centroid_Longitude
0,0.0,0.0,0.0,222.0,222.0,222.0,72.2,72.2,72.2,Lima,Lima,Pontificia Universidad Católica del Perú,-12.069512,-77.079359,-11.785115,-76.628934
1,1.0,1.0,1.0,211.0,211.0,211.0,66.7,66.7,66.7,Lima,Lima,Universidad Peruana Cayetano Heredia,-12.023977,-77.056505,-11.785115,-76.628934
2,2.0,2.0,2.0,339.0,339.0,339.0,207.0,207.0,207.0,Arequipa,Arequipa,Universidad Católica de Santa María,-16.406179,-71.547630,-15.844475,-72.472918
3,3.0,3.0,3.0,224.0,224.0,224.0,69.9,69.9,69.9,Lima,Lima,Universidad del Pacífico,-12.083797,-77.048806,-11.785115,-76.628934
4,4.0,4.0,4.0,212.0,212.0,212.0,65.9,65.9,65.9,Lima,Lima,Universidad de Lima,-12.084724,-76.971009,-11.785115,-76.628934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,138.0,138.0,138.0,,,,,,,Ayacucho,Huanta,Universidad Nacional Autónoma de Huanta,-12.939932,-74.243683,-14.086284,-74.083968
139,139.0,139.0,139.0,220.0,222.0,220.0,65.7,69.7,66.0,Lima,Lima,Universidad Nacional Tecnológica de San Juan d...,-11.983305,-77.009474,-11.785115,-76.628934
140,140.0,140.0,140.0,226.0,226.0,226.0,73.6,73.6,73.6,Lima,Lima,Universidad Autónoma Municipal de Los Olivos,-12.000352,-77.083390,-11.785115,-76.628934
141,141.0,141.0,141.0,514.0,514.0,514.0,216.0,216.0,216.0,Huancavelica,Tayacaja,Universidad Nacional Autónoma de Tayacaja Dani...,-12.360644,-74.837916,-13.023206,-75.002090


Final data set

In [10]:
distances = dist_api[['Department_univ', 'Province_univ', 'Name_univ',
                      'Latitude_univ', 'Longitude_univ', 'Dpt_Centroid_Latitude', 'Dpt_Centroid_Longitude',
                      'travel_time_best_guess', 'travel_time_pessimistic', 'travel_time_optimistic',
                      'travel_distance_best_guess', 'travel_distance_pessimistic', 'travel_distance_optimistic']]

#distances.to_csv( r'./distances_group_5.csv', encoding='iso-8859-1')
distances

Unnamed: 0,Department_univ,Province_univ,Name_univ,Latitude_univ,Longitude_univ,Dpt_Centroid_Latitude,Dpt_Centroid_Longitude,travel_time_best_guess,travel_time_pessimistic,travel_time_optimistic,travel_distance_best_guess,travel_distance_pessimistic,travel_distance_optimistic
0,Lima,Lima,Pontificia Universidad Católica del Perú,-12.069512,-77.079359,-11.785115,-76.628934,222.0,222.0,222.0,72.2,72.2,72.2
1,Lima,Lima,Universidad Peruana Cayetano Heredia,-12.023977,-77.056505,-11.785115,-76.628934,211.0,211.0,211.0,66.7,66.7,66.7
2,Arequipa,Arequipa,Universidad Católica de Santa María,-16.406179,-71.547630,-15.844475,-72.472918,339.0,339.0,339.0,207.0,207.0,207.0
3,Lima,Lima,Universidad del Pacífico,-12.083797,-77.048806,-11.785115,-76.628934,224.0,224.0,224.0,69.9,69.9,69.9
4,Lima,Lima,Universidad de Lima,-12.084724,-76.971009,-11.785115,-76.628934,212.0,212.0,212.0,65.9,65.9,65.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,Ayacucho,Huanta,Universidad Nacional Autónoma de Huanta,-12.939932,-74.243683,-14.086284,-74.083968,,,,,,
139,Lima,Lima,Universidad Nacional Tecnológica de San Juan d...,-11.983305,-77.009474,-11.785115,-76.628934,220.0,222.0,220.0,65.7,69.7,66.0
140,Lima,Lima,Universidad Autónoma Municipal de Los Olivos,-12.000352,-77.083390,-11.785115,-76.628934,226.0,226.0,226.0,73.6,73.6,73.6
141,Huancavelica,Tayacaja,Universidad Nacional Autónoma de Tayacaja Dani...,-12.360644,-74.837916,-13.023206,-75.002090,514.0,514.0,514.0,216.0,216.0,216.0
