"""
Python Extract Transform Load Example
"""

In [1]:
# to pull data from an API, which is used for the extraction of data
import requests

# to perform transformation and manipulation of data
import pandas as pd

# to create the database engine/connection (an SQLite database in this example)
from sqlalchemy import create_engine 

In [28]:
""" This API extracts data from
http://universities.hipolabs.com
"""
    
def extract(country: str, timeout: float = 30.0) -> list:
    """Fetch the universities of one country from the hipolabs API.

    Args:
        country: Country name sent as the ``country`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload; per the API this is a list of
        dictionaries, one dictionary per university.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # Passing the value through ``params`` lets requests URL-encode it, so
    # names containing spaces, '&' or '#' no longer produce a broken query.
    response = requests.get(
        "http://universities.hipolabs.com/search",
        params={"country": country},
        timeout=timeout,
    )

    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    return response.json()


In [52]:
""" Transforms the dataset into desired structure and filters"""
 
def transform(data: list, word: str) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Flatten, de-duplicate and filter the raw university records.

    Args:
        data: List of dictionaries, one per university, each carrying at
            least the keys ``domains``, ``country``, ``web_pages`` and
            ``name`` (``domains`` and ``web_pages`` being lists).
        word: Pattern searched for in the ``name`` column. It is handed to
            ``Series.str.contains`` and is therefore treated as a regular
            expression (e.g. ``"Maranhão|Piau"`` matches either word).

    Returns:
        A 2-tuple ``(all_universities, matching_universities)``; both frames
        hold the columns ``domains``, ``country``, ``web_pages``, ``name``.
    """
    output_columns = ["domains", "country", "web_pages", "name"]

    df_universities = pd.DataFrame(data)

    if df_universities.empty:
        # No records: return two empty frames with the expected columns
        # instead of failing with a KeyError on the missing 'domains' column.
        df_universities = pd.DataFrame(columns=output_columns)
        df_universities_word = df_universities.copy()
    else:
        # "domains" and "web_pages" hold lists, which are unhashable and make
        # drop_duplicates fail, so they are flattened to comma-separated
        # strings first. map(str, ...) guards against non-string items.
        for list_column in ("domains", "web_pages"):
            df_universities[list_column] = [
                ",".join(map(str, items)) for items in df_universities[list_column]
            ]

        # dropping duplicated rows and resetting the index
        df_universities = df_universities.drop_duplicates().reset_index(drop=True)

        # searching for universities whose name contains the given word;
        # na=False treats records without a name as "no match" rather than
        # producing a NaN mask, which boolean indexing rejects.
        name_matches = df_universities["name"].str.contains(str(word), na=False)
        df_universities_word = df_universities[name_matches].reset_index(drop=True)

    # show results
    print(f"Total Number of universities from API: {len(df_universities)}")
    print(f"Number of universities with the word '{word}': {len(df_universities_word)}")

    return df_universities[output_columns], df_universities_word[output_columns]

In [53]:
def load(df: pd.DataFrame,
         db_url: str = 'sqlite:///my_lite_store.db',
         table_name: str = 'cal_uni') -> None:
    """Write a DataFrame into a database table, replacing any existing table.

    Args:
        df: Frame to persist; its index is written as a column as well
            (pandas ``to_sql`` default).
        db_url: SQLAlchemy connection URL. Defaults to the local SQLite
            file ``my_lite_store.db``.
        table_name: Name of the destination table. Defaults to ``cal_uni``.
    """
    disk_engine = create_engine(db_url)
    try:
        # if_exists='replace' drops and recreates the table on every run.
        df.to_sql(table_name, disk_engine, if_exists='replace')
    finally:
        # Release the engine's pooled connections even if the write fails.
        disk_engine.dispose()

In [54]:
# Pipeline parameters: which country to download and which word to look for
# in the university names.
country = 'Brazil'
word_in_university_name = 'São'

# Extract: download the raw records for the chosen country.
data = extract(country)

# Transform: df_univ holds every de-duplicated university, df_univ_word only
# those whose name contains the search word.
df_univ, df_univ_word = transform(
    data,
    word_in_university_name,
)

# Load: persist only the filtered universities.
load(df_univ_word)

Total Number of universities from API: 174
Number of universities with the word 'São': 11


In [48]:
df_univ_word

Unnamed: 0,domains,country,web_pages,name
0,epm.br,Brazil,http://www.epm.br/,Universidade Federal de São Paulo
1,pucsp.br,Brazil,http://www.pucsp.br/,Pontifícia Universidade Católica de São Paulo
2,saojudas.br,Brazil,http://www.saojudas.br/,Universidade São Judas Tadeu
3,smarcos.br,Brazil,http://www.smarcos.br/,Universidade São Marcos
4,ufsj.edu.br,Brazil,https://www.ufsj.edu.br/,Universidade Federal de São João del-Rei
5,ufscar.br,Brazil,http://www.ufscar.br/,Universidade Federal de São Carlos
6,uniban.br,Brazil,http://www.uniban.br/,Universidade Bandeirante de São Paulo
7,unicid.br,Brazil,http://www.unicid.br/,Universidade Cidade de São Paulo
8,univasf.edu.br,Brazil,http://www.univasf.edu.br/,Universidade Federal do Vale do São Francisco
9,usf.br,Brazil,http://www.usf.br/,Universidade São Francisco


## Source

https://medium.datadriveninvestor.com/create-your-first-etl-pipeline-with-python-d65070c259a6

https://github.com/syalanuj/youtube/blob/main/de_fundamentals_python/etl.py