In [66]:
# Libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display
from io import StringIO
import csv
import gzip
from unidecode import unidecode
import html5lib
from dotenv import load_dotenv
import os
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
import logging


def _find_ranking_table(soup):
    """Return the table that follows the "Classificação geral" heading, or None.

    The heading is an <h2> with id "Classificação_geral"; the table is either
    the next sibling of the heading's enclosing <div>, or a direct child of
    that sibling.
    """
    h2_header = soup.find('h2', {'id': 'Classificação_geral'})
    if h2_header is None:
        return None

    parent_div = h2_header.find_parent('div')
    if parent_div is None:
        return None

    next_block = parent_div.find_next_sibling()
    if next_block is None:
        return None

    if next_block.name == "table":
        return next_block
    return next_block.find("table", recursive=False)


def _clean_ranking(ranking, edition):
    """Normalise one scraped ranking table and return the cleaned DataFrame.

    Parameters
    ----------
    ranking : pandas.DataFrame
        Table as parsed by ``pd.read_html`` (accents already removed).
    edition : str
        Value stored in the new ``Edicao`` column.
    """
    # The percentage column has had different headers across editions.
    ranking = ranking.rename(columns={
        'Porcentagem/ Pontos': 'Porcent_dos_votos',
        'Porcentagem/ Votos': 'Porcent_dos_votos',
        '% dos votos': 'Porcent_dos_votos',
    })

    # Adding the edition the table came from
    ranking['Edicao'] = edition

    # Position is always a string; for a double ranking ("a-b") keep only the first number
    ranking['Pos'] = ranking['Pos.'].astype(str).str.split('-').str[0]
    ranking = ranking.drop(columns='Pos.')

    # Remove footnote markers ("[...") from % votes and Meio de indicacao
    for column in ('Porcent_dos_votos', 'Meio de indicacao'):
        ranking[column] = ranking[column].str.replace(r'\[.*', '', regex=True)

    # Break Meio de indicacao down: the text in parentheses is who nominated;
    # when there are no parentheses, Indicado por falls back to the full value.
    means = ranking['Meio de indicacao']
    nominated_by = means.str.extract(r'\((.*?)\)', expand=False)
    ranking['Indicado por'] = nominated_by.fillna(means)
    ranking['Meio de indicacao'] = means.str.replace(r'\((.*?)\)', '', regex=True)

    # Some % votes cells hold no percentage (e.g. "disqualified"/"withdrawn" in a
    # merged cell); those rows take the cleaned Meio de indicacao value instead.
    has_percent = ranking['Porcent_dos_votos'].str.contains('%', regex=False, na=False)
    ranking['Porcent_dos_votos'] = ranking['Porcent_dos_votos'].where(
        has_percent, ranking['Meio de indicacao']
    )

    # Replace "--" in Eliminado em with the row's Meio de indicacao (e.g. Finalista).
    # Non-string cells are left untouched instead of raising.
    ranking['Eliminado em'] = [
        eliminated.replace('--', row_means)
        if isinstance(eliminated, str) and isinstance(row_means, str)
        else eliminated
        for eliminated, row_means in zip(ranking['Eliminado em'], ranking['Meio de indicacao'])
    ]

    # Replacing spaces and commas, etc
    ranking.columns = [col.replace(' ', '_') for col in ranking.columns]
    # Decimal comma -> decimal point in the percentage column
    ranking["Porcent_dos_votos"] = ranking["Porcent_dos_votos"].apply(
        lambda x: x.replace(',', '.') if isinstance(x, str) else x
    )
    # Any remaining comma in any cell becomes an underscore
    return ranking.replace(",", "_", regex=True)


def ranking_scrape(url):
    """Scrape the "Classificação geral" table of one edition page into a DataFrame.

    Parameters
    ----------
    url : str
        Page URL; the text after the last underscore is stored as ``Edicao``.

    Returns
    -------
    pandas.DataFrame
        The cleaned ranking table (see ``_clean_ranking``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no table after the "Classificação geral" heading.
    """
    # Fetch the url content; fail fast on HTTP errors and never hang forever
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse the url content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html5lib')

    # Replace accents in every text node (attributes such as ids are untouched,
    # which is why the heading id below still carries its accents)
    for element in soup.find_all(string=True):
        element.replace_with(unidecode(element))

    desired_table = _find_ranking_table(soup)
    if desired_table is None:
        raise ValueError(f"No 'Classificação geral' table found at {url}")

    # Parsing html table to DataFrame
    ranking = pd.read_html(StringIO(str(desired_table)))[0]

    return _clean_ranking(ranking, url.rsplit('_', 1)[-1])

# Scrape every edition page and append the rankings to one single dataframe

base_url = "https://pt.wikipedia.org/wiki/Big_Brother_Brasil_"
number_of_shows = 25

urls = [f"{base_url}{i}" for i in range(1, number_of_shows + 1)]

Combined_ranking = []

for url in urls:
    # Best effort: a page that cannot be scraped is reported and skipped
    try:
        ranking_new = ranking_scrape(url)
    except Exception as e:
        print(f"Error processing {url}: {e}")
    else:
        Combined_ranking.append(ranking_new)
        print(f"Ranking information for {url} appended")

# pd.concat raises an opaque error on an empty list; say what actually happened
if not Combined_ranking:
    raise RuntimeError("No ranking table could be scraped from any edition page")

Ranking = pd.concat(Combined_ranking, ignore_index=True)

# Save to csv
# NOTE(review): the file is written as 'ranking' (no .csv extension) and
# includes the index column — confirm this is what downstream readers expect.
Ranking.to_csv('ranking')

print(Ranking["Pos"].unique())  # Inspect unique values in Pos.
# info() prints its report itself and returns None, so it is not passed to display()
Ranking.info()
display(Ranking)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_1 HTTP/11" 200 32705
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_2 HTTP/11" 200 24895


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_1 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_3 HTTP/11" 200 28625


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_2 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_4 HTTP/11" 200 33067


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_3 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_5 HTTP/11" 200 31333


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_4 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_6 HTTP/11" 200 27254


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_5 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_7 HTTP/11" 200 34987


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_6 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_8 HTTP/11" 200 35077


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_7 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_9 HTTP/11" 200 38313


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_8 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_10 HTTP/11" 200 39086


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_9 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_11 HTTP/11" 200 37924


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_10 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_12 HTTP/11" 200 43837


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_11 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_13 HTTP/11" 200 50422


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_12 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_14 HTTP/11" 200 40792


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_13 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_15 HTTP/11" 200 43903


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_14 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_16 HTTP/11" 200 49393


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_15 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_17 HTTP/11" 200 67762


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_16 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_18 HTTP/11" 200 59749


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_17 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_19 HTTP/11" 200 80325


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_18 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_20 HTTP/11" 200 104618


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_19 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_21 HTTP/11" 200 121128


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_20 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_22 HTTP/11" 200 100837


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_21 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_23 HTTP/11" 200 126738


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_22 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_24 HTTP/11" 200 114856


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_23 appended


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): pt.wikipedia.org:443
DEBUG:urllib3.connectionpool:https://pt.wikipedia.org:443 "GET /wiki/Big_Brother_Brasil_25 HTTP/11" 200 114245


Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_24 appended
Ranking information for https://pt.wikipedia.org/wiki/Big_Brother_Brasil_25 appended
['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '16' '15'
 'Ret.' '17' '18' '19' '20' '22' '21' '23' '24' '25' '26' '27']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468 entries, 0 to 467
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Participante       468 non-null    object
 1   Meio_de_indicacao  468 non-null    object
 2   Porcent_dos_votos  468 non-null    object
 3   Eliminado_em       468 non-null    object
 4   Edicao             468 non-null    object
 5   Pos                468 non-null    object
 6   Indicado_por       468 non-null    object
dtypes: object(7)
memory usage: 25.7+ KB


None

Unnamed: 0,Participante,Meio_de_indicacao,Porcent_dos_votos,Eliminado_em,Edicao,Pos,Indicado_por
0,Kleber Bambam,Finalista,68%,Finalista,1,1,Finalista
1,Vanessa Pascale,Finalista,21%,Finalista,1,2,Finalista
2,Andre Gabeh,Finalista,11%,Finalista,1,3,Finalista
3,Sergio Tavares,Lider,52%,Semana 9,1,4,Andre
4,Alessandra Begliomini,Lider,73%,Semana 8,1,5,Sergio
...,...,...,...,...,...,...,...
463,Marcelo Prata,Lideres,55.95%,Semana 1,25,23,Aline & Vinicius
464,Nicole Oliveira,Disputa pelas ultimas vagas,19.86%,Semana 0,25,25,Disputa pelas ultimas vagas
465,Paula Oliveira,Disputa pelas ultimas vagas,19.86%,Semana 0,25,25,Disputa pelas ultimas vagas
466,Cleber Santana,Disputa pelas ultimas vagas,13.74%,Semana 0,25,25,Disputa pelas ultimas vagas


In [57]:
# Load environment variables from .env file
env_path = os.path.abspath("credentials.env")

# Load explicitly; override=True lets the file win over already-set variables
load_dotenv(dotenv_path=env_path, override=True)

# Echo the loaded configuration as a sanity check.
# The password is masked: notebook outputs are saved with the file and shared.
print("User:", os.getenv("SNOWFLAKE_USER"))
print("Password:", "********" if os.getenv("SNOWFLAKE_PASSWORD") else None)
print("Account:", os.getenv("SNOWFLAKE_ACCOUNT"))
print("Warehouse:", os.getenv("SNOWFLAKE_WAREHOUSE"))
print("Database:", os.getenv("SNOWFLAKE_DATABASE"))
print("Schema:", os.getenv("SNOWFLAKE_SCHEMA"))
print("Role:", os.getenv("SNOWFLAKE_ROLE"))

User: ANDREALEONEL
Password: ********
Account: xf30579.eu-west-2.aws
Warehouse: COMPUTE_WH
Database: BIGBROTHERBRASIL
Schema: WIKIPEDIA
Role: ACCOUNTADMIN


In [58]:
# Set logging level to DEBUG
# This configures the root logger, so third-party libraries (urllib3,
# snowflake.connector) also emit their DEBUG records into the cell outputs.
# NOTE(review): DEBUG output is very verbose — consider a higher level
# before sharing the notebook.
logging.basicConfig(level=logging.DEBUG)

In [59]:
# Access Snowflake credentials from environment variables
conn_params = {
    'user': os.getenv('SNOWFLAKE_USER'),
    'password': os.getenv('SNOWFLAKE_PASSWORD'),
    'account': os.getenv('SNOWFLAKE_ACCOUNT'),
    'warehouse': os.getenv('SNOWFLAKE_WAREHOUSE'),
    'database': os.getenv('SNOWFLAKE_DATABASE'),
    'schema': os.getenv('SNOWFLAKE_SCHEMA'),
    'role': os.getenv('SNOWFLAKE_ROLE'),
}

# Connect and upload; the connection is closed even if the upload raises
conn = snowflake.connector.connect(**conn_params)
try:
    # write_pandas returns (success, number of chunks, number of rows, output)
    success, nchunks, nrows, _ = write_pandas(
        conn,
        Ranking,
        table_name='RANKING',
        database='BIGBROTHERBRASIL',
        schema='WIKIPEDIA',
        quote_identifiers=False,
    )
finally:
    conn.close()

if success:
    print(f'Successfully uploaded {nrows} rows')
else:
    print('Upload failed')

INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.15.0, Python Version: 3.12.3, Platform: Windows-11-10.0.26100-SP0
DEBUG:snowflake.connector.connection:connect
DEBUG:snowflake.connector.connection:__config
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
DEBUG:snowflake.connector.connection:This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
DEBUG:snowflake.connector.converter:use_numpy: False
DEBUG:snowflake.connector.converter_issue23517:initialized
DEBUG:snowflake.connector.connection:REST API object was created: xf30579.eu-west-2.aws.snowflakecomputing.com:443
DEBUG:snowflake.connector.auth._auth:authenticate
DEBUG:snowflake.connector.auth._auth:account=xf30579, user=ANDREALEONEL, database=BIGBROTHERBRASIL, schema=WIKIPEDIA, warehouse=COMPUTE_WH

Successfully uploaded 451 rows
