In [46]:
# Libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display
from io import StringIO
import csv
import gzip
from unidecode import unidecode
import html5lib
from dotenv import load_dotenv
import os
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

def ratings_weekly_scrape(url):

    # Fetch the url content
    response = requests.get(url)

    # Parse the url content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html5lib')
    def remove_accents(text):
         return unidecode(text) if isinstance(text, str) else text

    # Iterate through all text elements in the HTML and replace accents
    for element in soup.find_all(string=True):
         element.replace_with(remove_accents(element))

    # Find the h2
    h2_header = soup.find('h2', {'id': 'Audiência'})
    desired_table = None
    next_div = None

    if h2_header:
        parent_div = h2_header.find_parent('div')
        next_div = parent_div.find_next_sibling()

        while next_div:
         if next_div.name == "table":
            desired_table = next_div
            break
         else:
            next_div = next_div.find_next_sibling()
            if next_div.find('h2'):
                break 

    if desired_table:

        # Replacing commas for periods in decimals
        for element in desired_table.find_all(string=True):
            if ',' in element:
                updated_text = element.replace(',', '.')
                element.replace_with(updated_text)

        # Parsing html table to DataFrame
        html_to_table = pd.read_html(StringIO(str(desired_table)))
        Ratings_weekly = html_to_table[0]

        # Adding the year of the current file
        Ratings_weekly['Edicao'] = url.rsplit('_', 1)[-1]

        # Remove index column
        Ratings_weekly.reset_index(drop=True, inplace=True)

        # Remove Media de Edicao row (last row)
        Ratings_weekly = Ratings_weekly.iloc[:-1]

        # Remove notes number from SEG to DOM
        Ratings_weekly.loc[:, 'SEG'] = Ratings_weekly['SEG'].str.replace(r'\[.*', '', regex=True)
        Ratings_weekly.loc[:, 'TER'] = Ratings_weekly['TER'].str.replace(r'\[.*', '', regex=True)
        Ratings_weekly.loc[:, 'QUA'] = Ratings_weekly['QUA'].str.replace(r'\[.*', '', regex=True)
        Ratings_weekly.loc[:, 'QUI'] = Ratings_weekly['QUI'].str.replace(r'\[.*', '', regex=True)
        Ratings_weekly.loc[:, 'SEX'] = Ratings_weekly['SEX'].str.replace(r'\[.*', '', regex=True)
        Ratings_weekly.loc[:, 'SAB'] = Ratings_weekly['SAB'].str.replace(r'\[.*', '', regex=True)
        Ratings_weekly.loc[:, 'DOM'] = Ratings_weekly['DOM'].str.replace(r'\[.*', '', regex=True)

        # Using commas as decimals, changing it to points
        Ratings_weekly.loc[:, 'SEG'] = Ratings_weekly['SEG'].str.replace(',', '.', regex=True)
        Ratings_weekly.loc[:, 'TER'] = Ratings_weekly['TER'].str.replace(',', '.', regex=True)
        Ratings_weekly.loc[:, 'QUA'] = Ratings_weekly['QUA'].str.replace(',', '.', regex=True)
        Ratings_weekly.loc[:, 'QUI'] = Ratings_weekly['QUI'].str.replace(',', '.', regex=True)
        Ratings_weekly.loc[:, 'SEX'] = Ratings_weekly['SEX'].str.replace(',', '.', regex=True)
        Ratings_weekly.loc[:, 'SAB'] = Ratings_weekly['SAB'].str.replace(',', '.', regex=True)
        Ratings_weekly.loc[:, 'DOM'] = Ratings_weekly['DOM'].str.replace(',', '.', regex=True)
    
        # Add week number
        Ratings_weekly = Ratings_weekly.copy()
        Ratings_weekly['Semana'] = ['Semana {}'.format(i + 1) for i in range(len(Ratings_weekly))]

        # Replace any -- with null
        Ratings_weekly.loc[:, 'SEG'] = Ratings_weekly.apply(lambda row: row['SEG'].replace('--', 'NaN') if isinstance(row['SEG'], str) and '--' in row['SEG'] else row['SEG'],axis=1)
        Ratings_weekly.loc[:, 'TER'] = Ratings_weekly.apply(lambda row: row['TER'].replace('--', 'NaN') if isinstance(row['TER'], str) and '--' in row['TER'] else row['TER'],axis=1)
        Ratings_weekly.loc[:, 'QUA'] = Ratings_weekly.apply(lambda row: row['QUA'].replace('--', 'NaN') if isinstance(row['QUA'], str) and '--' in row['QUA'] else row['QUA'],axis=1)
        Ratings_weekly.loc[:, 'QUI'] = Ratings_weekly.apply(lambda row: row['QUI'].replace('--', 'NaN') if isinstance(row['QUI'], str) and '--' in row['QUI'] else row['QUI'],axis=1)
        Ratings_weekly.loc[:, 'SEX'] = Ratings_weekly.apply(lambda row: row['SEX'].replace('--', 'NaN') if isinstance(row['SEX'], str) and '--' in row['SEX'] else row['SEX'],axis=1)
        Ratings_weekly.loc[:, 'SAB'] = Ratings_weekly.apply(lambda row: row['SAB'].replace('--', 'NaN') if isinstance(row['SAB'], str) and '--' in row['SAB'] else row['SAB'],axis=1)
        Ratings_weekly.loc[:, 'DOM'] = Ratings_weekly.apply(lambda row: row['DOM'].replace('--', 'NaN') if isinstance(row['DOM'], str) and '--' in row['DOM'] else row['DOM'],axis=1)    

        # Rename columns
        Ratings_weekly = Ratings_weekly.rename(columns = {'Data de transmissao':'Data_transmissao','Media semanal':'Media_semanal'} )
        
    else: print("Ratings table not available")
    
    # Save to csv
    year = url.rsplit('_', 1)[-1]
    
    return Ratings_weekly

# List of URLs to process
base_url = "https://pt.wikipedia.org/wiki/Big_Brother_Brasil_"
number_of_shows = 25

urls = [f"{base_url}{i}" for i in range(1, number_of_shows + 1)]

combined_ratings = []

for url in urls:
    try:
        rating_new = ratings_weekly_scrape(url)
        combined_ratings.append(rating_new)
        print(f"Rating information for {url} appended")
    except Exception as e:
        print(f"Error processing {url}: {e}")

print(combined_ratings)  # Check contents before concatenation
ratings_weekly = pd.concat(combined_ratings, ignore_index=True)
    

Ratings table not available
Error processing https://pt.wikipedia.org/wiki/Big_Brother_Brasil_1: cannot access local variable 'Ratings_weekly' where it is not associated with a value
Ratings table not available
Error processing https://pt.wikipedia.org/wiki/Big_Brother_Brasil_2: cannot access local variable 'Ratings_weekly' where it is not associated with a value
Ratings table not available
Error processing https://pt.wikipedia.org/wiki/Big_Brother_Brasil_3: cannot access local variable 'Ratings_weekly' where it is not associated with a value
Ratings table not available
Error processing https://pt.wikipedia.org/wiki/Big_Brother_Brasil_4: cannot access local variable 'Ratings_weekly' where it is not associated with a value
Ratings table not available
Error processing https://pt.wikipedia.org/wiki/Big_Brother_Brasil_5: cannot access local variable 'Ratings_weekly' where it is not associated with a value
Ratings table not available
Error processing https://pt.wikipedia.org/wiki/Big_Brothe

In [48]:
display(ratings_weekly.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Data_transmissao  188 non-null    object 
 1   SEG               188 non-null    object 
 2   TER               188 non-null    object 
 3   QUA               188 non-null    object 
 4   QUI               188 non-null    object 
 5   SEX               188 non-null    object 
 6   SAB               188 non-null    object 
 7   DOM               188 non-null    object 
 8   Media_semanal     188 non-null    float64
 9   Edicao            188 non-null    object 
 10  Semana            188 non-null    object 
dtypes: float64(1), object(10)
memory usage: 16.3+ KB


None

In [50]:
# Load environment variables from .env file
env_path = os.path.abspath("credentials.env")

# Load explicitly
load_dotenv(dotenv_path=env_path, override=True)

print("User:", os.getenv("SNOWFLAKE_USER"))
print("Password:", os.getenv("SNOWFLAKE_PASSWORD"))
print("Account:", os.getenv("SNOWFLAKE_ACCOUNT"))
print("Warehouse:", os.getenv("SNOWFLAKE_WAREHOUSE"))
print("Database:", os.getenv("SNOWFLAKE_DATABASE"))
print("Schema:", os.getenv("SNOWFLAKE_SCHEMA"))
print("Role:", os.getenv("SNOWFLAKE_ROLE"))

User: ANDREALEONEL
Password: Buy100toasters
Account: xf30579.eu-west-2.aws
Warehouse: COMPUTE_WH
Database: BIGBROTHERBRASIL
Schema: WIKIPEDIA
Role: ACCOUNTADMIN


In [54]:
# Access Snowflake credentials from environment variables
conn_params = {
    'user': os.getenv('SNOWFLAKE_USER'),
    'password': os.getenv('SNOWFLAKE_PASSWORD'),
    'account': os.getenv('SNOWFLAKE_ACCOUNT'),
    'warehouse': os.getenv('SNOWFLAKE_WAREHOUSE'),
    'database': os.getenv('SNOWFLAKE_DATABASE'),
    'schema': os.getenv('SNOWFLAKE_SCHEMA'),
    'role': os.getenv('SNOWFLAKE_ROLE'),
}

# Connect and upload
conn = snowflake.connector.connect(**conn_params)
success, nchunks, nrows, _ = write_pandas(conn, ratings_weekly, table_name='RATINGS_WEEKLY', database='BIGBROTHERBRASIL', schema='WIKIPEDIA', quote_identifiers=False)

if success:
    print(f'Successfully uploaded {nrows} rows')
else:
    print('Upload failed')

conn.close()

Successfully uploaded 188 rows
