In [1]:
import requests
import pandas as pd
from geopy.geocoders import Nominatim
import numpy as np

In [9]:
class Universidades:

    def __init__(self):
        self = self

    def api_universidad(self, pais):  ### funcion para llamar a la api y sacar las universidades
        self.pais = pais
        
        url = f"http://universities.hipolabs.com/search?country={self.pais}"
        response = requests.get(url)
        data = response.json()
        self.df = pd.DataFrame(data)
        return self.df

    def limpieza(self, df):  ###funcion para limpiar las columnas
        self.df = df

        # Cambiar nombre de columna
        self.df = self.df.rename(columns={"state-province": "state_province"})

        # Eliminar columna de "domains"
        self.df = self.df.drop(columns="domains")

        # Explotar la columna de "web_page"
        self.df['web_pages'] = self.df['web_pages'].apply(lambda data: ','.join(data))
        self.df = self.df.explode("web_pages").reset_index(drop=True)

        # Eliminar registros duplicados
        self.df = self.df.drop_duplicates(subset="name", keep="first").reset_index(drop=True)

        # Cambiar None por numpy.nan
        self.df["state_province"] = self.df["state_province"].replace("None", np.nan)

        # Cambiar nulos por 'Unknown'
        self.df["state_province"] = self.df["state_province"].fillna("Unknown")

        return self.df
    
    def limpieza2(self, df2, diccionario):   #### funcion para limpiar la columna provincia donde luego uniremos

        self.diccionario = diccionario
        self.df2 = df2


        self.df2["state_province"] = self.df2["state_province"].replace(self.diccionario)
        self.df2["state_province"] = self.df2["state_province"].replace({"New York, NY": "New York",
                                                            "Ciudad Autónoma de Buenos Aires": "Buenos Aires"})
        
        self.df2 = self.df2.rename(columns = {'state_province' : 'provincia'})  ## cambiamos el nombre para el merge
        return self.df2

    def coordenadas (self, df3, nombre):  ###funcion para buscar las coordenadas de las provincias con geopy

        self.nombre = nombre
        self.df3 = df3 

        geolocator = Nominatim(user_agent= self.nombre)

        provincias = self.df3['provincia'].unique()  ### creamos lista con los valores únicos

        latitud = []
        longitud = []

        for provincia in provincias:
            location = geolocator.geocode(provincia)   ##### para cada provincia de la columna vamos a buscar la información

            if location:
                latitud.append(location.latitude)      ####extraemos latitud y longitud
                longitud.append(location.longitude)     ### si la encuentra las añadimos a la lista

            else:

                latitud.append(np.nan)               ## si no las encuentra (tenemos unknown) agregamos np.nan
                longitud.append(np.nan)

        df_coordenadas = pd.DataFrame({'provincia': provincias, 'latitud': latitud, 'longitud': longitud})

        return df_coordenadas



In [3]:
diccionario_provincias =  {
                            "NV": "Nevada",
                            "TX": "Texas",
                            "IN": "Indianapolis",
                            "CA": "California",
                            "VA": "Virginia",
                            "NY": "New York",
                            "MI": "Michigan",
                            "GA": "Georgia",
                            "ND": "North Dakota"}

In [10]:
carga = Universidades()

In [5]:
df_canada = carga.api_universidad("Canada")
df_eeuu = carga.api_universidad("United States")
df_argentina = carga.api_universidad("Argentina")

In [6]:
df_canada = carga.limpieza(df_canada)
df_eeuu = carga.limpieza(df_eeuu)
df_argentina = carga.limpieza(df_argentina)

In [7]:
df_canada = carga.limpieza2(df_canada, diccionario_provincias)
df_eeuu = carga.limpieza2(df_eeuu, diccionario_provincias)
df_argentina = carga.limpieza2(df_argentina, diccionario_provincias)

In [11]:
df_canada_coordenadas = carga.coordenadas(df_canada ,"Ana")
df_eeuu_coordenadas = carga.coordenadas(df_eeuu ,"Ana")
df_argentina_coordenadas = carga.coordenadas(df_argentina ,"Ana")

In [12]:
df_canada_coordenadas

Unnamed: 0,provincia,latitud,longitud
0,Quebec,52.476089,-71.825867
1,Ontario,50.000678,-86.000977
2,Nova Scotia,45.19604,-63.165379
3,British Columbia,55.001251,-125.002441
4,Alberta,55.001251,-115.002136
5,Manitoba,55.001251,-97.001038
6,New Brunswick,46.500283,-66.750183
7,Saskatchewan,55.532126,-106.141224
8,Unknown,25.029422,-77.361956
9,Newfoundland and Labrador,53.821733,-61.229553


In [13]:
df_eeuu_coordenadas

Unnamed: 0,provincia,latitud,longitud
0,Unknown,25.029422,-77.361956
1,Pennsylvania,40.969989,-77.727883
2,Nevada,39.515882,-116.853723
3,Iowa,41.921673,-93.31227
4,Virginia,37.123224,-78.492772
5,Texas,31.26389,-98.545612
6,Colorado,38.725178,-105.607716
7,Indianapolis,39.768333,-86.15835
8,California,36.701463,-118.755997
9,South Carolina,33.687439,-80.436374


In [14]:
df_argentina_coordenadas

Unnamed: 0,provincia,latitud,longitud
0,Buenos Aires,-34.607568,-58.437089
1,Entre Ríos,-31.625284,-59.353958
2,Salta,-25.10767,-64.349496
3,Córdoba,37.884581,-4.776014
4,Mendoza,-34.787093,-68.438187
5,Santa Fé,-30.315474,-61.164508
6,Unknown,25.029422,-77.361956
7,Santiago Del Estero,-27.643102,-63.540854
8,Misiones,-26.737224,-54.431526
9,Catamarca,-27.191083,-67.105374


In [15]:
canada = pd.merge(df_canada, df_canada_coordenadas, on = 'provincia')
eeuu = pd.merge(df_eeuu, df_eeuu_coordenadas, on = 'provincia')
argentina = pd.merge(df_argentina, df_argentina_coordenadas, on = 'provincia')

In [16]:
canada.sample(10)

Unnamed: 0,provincia,name,country,web_pages,alpha_two_code,latitud,longitud
3,Quebec,"École de technologie supérieure, Université du...",Canada,http://www.etsmtl.ca/,CA,52.476089,-71.825867
137,Saskatchewan,First Nations University of Canada,Canada,http://www.firstnationsuniversity.ca/,CA,55.532126,-106.141224
150,Unknown,University of St. Michael's College,Canada,http://www.utoronto.ca/stmikes/,CA,25.029422,-77.361956
107,British Columbia,University of Northern British Columbia,Canada,http://www.unbc.ca/,CA,55.001251,-125.002441
143,Unknown,St. Clair College,Canada,http://www.stclairc.on.ca/,CA,25.029422,-77.361956
50,Ontario,Sheridan College,Canada,http://www.sheridancollege.ca/,CA,50.000678,-86.000977
125,Manitoba,Brandon University,Canada,http://www.brandonu.ca/,CA,55.001251,-97.001038
32,Ontario,Conestoga College,Canada,http://www.conestogac.on.ca/,CA,50.000678,-86.000977
83,British Columbia,University College of the Cariboo,Canada,http://www.cariboo.bc.ca/,CA,55.001251,-125.002441
55,Ontario,University of Ottawa,Canada,http://www.uottawa.ca/,CA,50.000678,-86.000977


In [32]:
eeuu['provincia'].unique()

array(['Unknown', 'Pennsylvania', 'Nevada', 'Iowa', 'Virginia', 'Texas',
       'Colorado', 'Indianapolis', 'California', 'South Carolina',
       'Washington', 'New York', 'North Dakota', 'Michigan', 'Ohio',
       'Florida', 'North Carolina', 'Georgia'], dtype=object)

In [35]:
eeuu.sample(10)

Unnamed: 0,provincia,name,country,web_pages,alpha_two_code,latitud,longitud
2024,Unknown,Latter-day Saints Business College,United States,http://www.ldsbc.edu,US,25.029422,-77.361956
1418,Unknown,Three Rivers Community College,United States,http://www.trcc.commnet.edu,US,25.029422,-77.361956
892,Unknown,Texas Christian University,United States,http://www.tcu.edu/,US,25.029422,-77.361956
906,Unknown,Transylvania University,United States,http://www.transy.edu/,US,25.029422,-77.361956
1229,Unknown,Lurleen B Wallace Community College,United States,http://www.lbwcc.edu,US,25.029422,-77.361956
346,Unknown,DeVry Institute of Technology,United States,http://www.devry.edu/,US,25.029422,-77.361956
843,Unknown,State University of New York at Oswego,United States,http://www.oswego.edu/,US,25.029422,-77.361956
895,Unknown,Texas Tech University,United States,http://www.ttu.edu/,US,25.029422,-77.361956
608,Unknown,Mississippi University for Women,United States,http://www.muw.edu/,US,25.029422,-77.361956
1379,Unknown,Santa Rosa Junior College,United States,http://www.santarosa.edu,US,25.029422,-77.361956


In [19]:
argentina.sample(10)

Unnamed: 0,provincia,name,country,web_pages,alpha_two_code,latitud,longitud
50,Mendoza,Universidad Nacional de Cuyo,Argentina,http://www.uncu.edu.ar/,AR,-34.787093,-68.438187
74,Unknown,"Universidad del Norte ""Santo Tomás de Aquino""",Argentina,http://www.unsta.edu.ar/,AR,25.029422,-77.361956
32,Buenos Aires,Universidad Nacional de Lomas de Zamora,Argentina,http://www.unlz.edu.ar/,AR,-34.607568,-58.437089
49,Mendoza,Universidad de Mendoza,Argentina,http://www.um.edu.ar/,AR,-34.787093,-68.438187
7,Buenos Aires,Instituto Universitario Aeronáutico,Argentina,http://www.iua.edu.ar/,AR,-34.607568,-58.437089
36,Buenos Aires,Universidad Torcuato di Tella,Argentina,http://www.utdt.edu/,AR,-34.607568,-58.437089
12,Buenos Aires,Universidad Nacional de Mar del Plata,Argentina,http://www.mdp.edu.ar/,AR,-34.607568,-58.437089
59,Unknown,Universidad de la Marina Mercante,Argentina,http://www.udemm.edu.ar/,AR,25.029422,-77.361956
84,San Juan,Universidad Nacional de San Juan,Argentina,http://www.unsj.edu.ar/,AR,18.465299,-66.116666
9,Buenos Aires,Instituto Universitario Nacional del Arte,Argentina,http://www.iuna.edu.ar/,AR,-34.607568,-58.437089


In [36]:
eeuu.shape

(2270, 7)

In [20]:
canada.to_pickle('datos/df_canada.pkl')
argentina.to_pickle('datos/df_argentina.pkl')
eeuu.to_pickle('datos/df_eeuu.pkl')