# Documentação do ETL: raw → silver (currencies_data)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Input and output locations, given relative to the notebook's working directory.
# raw_path is the CSV read by the load step; silver_path is the cleaned CSV target.
raw_path = Path('../data/raw/currencies_data.csv')
silver_path = Path('../data/silver/silver_currencies_data.csv')

In [4]:
# Load the raw CSV into memory and print its schema summary (columns, non-null
# counts, dtypes) as a first sanity check before any transformation.
df_raw = pd.read_csv(raw_path)
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446176 entries, 0 to 446175
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   cmcRank                   446176 non-null  int64  
 1   name                      446176 non-null  object 
 2   symbol                    446176 non-null  object 
 3   marketPairCount           446176 non-null  int64  
 4   circulatingSupply         446176 non-null  float64
 5   totalSupply               446176 non-null  float64
 6   maxSupply                 347894 non-null  float64
 7   isActive                  446176 non-null  int64  
 8   lastUpdated               446176 non-null  object 
 9   dateAdded                 446176 non-null  object 
 10  name.1                    446176 non-null  object 
 11  price                     446176 non-null  float64
 12  volume24h                 446176 non-null  float64
 13  marketCap                 446176 non-null  f

In [22]:
import re

def to_snake_case(name: str) -> str:
    """Convert a camelCase / PascalCase identifier to lower snake_case.

    A digit run that directly follows a letter is split off as its own word,
    e.g. ``volume24h`` -> ``volume_24h``. Names that already contain
    underscores are left with single underscores (``my_Var`` -> ``my_var``).

    Args:
        name: The identifier to convert.

    Returns:
        The lower-cased, underscore-separated identifier.
    """
    # Split before a capitalised word ("HTTPResponse" -> "HTTP_Response").
    # The preceding character must not already be an underscore, otherwise
    # "my_Var" would come out as "my__var".
    name = re.sub(r'([^_])([A-Z][a-z]+)', r'\1_\2', name)
    # Split a lowercase letter or digit from the capital that follows it.
    name = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', name)
    # Split a letter from the digit that follows it.
    name = re.sub(r'([a-zA-Z])(\d)', r'\1_\2', name)

    return name.lower()


# Build the silver frame from df_raw in one chain so the cell is idempotent and
# df_raw itself is never modified: de-duplicate rows, drop the "name.1" column
# (presumably a duplicate of "name" - TODO confirm), fix dtypes, then convert
# the column names to snake_case.
df = (
    df_raw
    .drop_duplicates()
    .drop(columns=["name.1"])
    .assign(
        # errors='coerce' turns unparseable timestamps into NaT instead of raising.
        lastUpdated=lambda frame: pd.to_datetime(frame["lastUpdated"], errors='coerce'),
        dateAdded=lambda frame: pd.to_datetime(frame["dateAdded"], errors='coerce'),
        isActive=lambda frame: frame["isActive"].astype(bool),
        name=lambda frame: frame["name"].astype("string"),
        symbol=lambda frame: frame["symbol"].astype("string"),
    )
    .rename(columns=to_snake_case)
)

# Missing numeric values are stored as +inf, keeping the notebook's existing
# convention (e.g. a null max_supply). The fill is limited to float columns:
# a blanket fillna(np.inf) would also overwrite NaT produced by errors='coerce'
# above and silently turn the datetime columns into object columns.
# NOTE(review): confirm +inf is the intended sentinel for every float column,
# not only max_supply.
float_cols = df.select_dtypes(include=[np.floating]).columns
df = df.fillna({col: np.inf for col in float_cols})

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20225 entries, 0 to 427792
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   cmc_rank                     20225 non-null  int64              
 1   name                         20225 non-null  string             
 2   symbol                       20225 non-null  string             
 3   market_pair_count            20225 non-null  int64              
 4   circulating_supply           20225 non-null  float64            
 5   total_supply                 20225 non-null  float64            
 6   max_supply                   20225 non-null  float64            
 7   is_active                    20225 non-null  bool               
 8   last_updated                 20225 non-null  datetime64[ns, UTC]
 9   date_added                   20225 non-null  datetime64[ns, UTC]
 10  price                        20225 non-null  float

In [32]:
# Persist the silver frame as CSV. The target is the silver_path defined in the
# configuration cell; reusing it (instead of repeating the literal) keeps the
# two names from drifting apart.
silver_cryptocurrencies = silver_path
# On a fresh checkout the output folder may not exist yet.
silver_cryptocurrencies.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(silver_cryptocurrencies, index=False)

In [36]:
def create_ddl_table(df: pd.DataFrame, table_name: str) -> str:
    """Build a ``CREATE TABLE IF NOT EXISTS`` statement from a DataFrame's dtypes.

    Each column becomes ``<column name> <SQL type>``. Exact dtype names are
    looked up first; anything else is classified by dtype family so that
    timezone-aware timestamps (e.g. ``datetime64[ns, UTC]``), other datetime
    resolutions, and narrower or nullable integer/float/boolean dtypes get a
    proper SQL type instead of silently falling back to ``TEXT``.

    Args:
        df: Frame whose column names and dtypes define the table.
        table_name: Name of the table to create. It is interpolated verbatim
            (not quoted or escaped), as are the column names, so only pass
            trusted identifiers.

    Returns:
        The DDL statement as a single string ending in ``);``.
    """
    dtype_mapping = {
        'int64': 'BIGINT',
        'float64': 'FLOAT',
        'bool': 'BOOLEAN',
        'datetime64[ns]': 'TIMESTAMP',
        'string': 'VARCHAR(255)',
        'object': 'TEXT'
    }

    def _sql_type(dtype) -> str:
        """Return the SQL type for one pandas/numpy dtype."""
        exact = dtype_mapping.get(str(dtype))
        if exact is not None:
            return exact
        # Timezone-aware datetimes stringify as 'datetime64[ns, <tz>]', which
        # can never match the exact-name table above.
        if isinstance(dtype, pd.DatetimeTZDtype):
            return 'TIMESTAMPTZ'
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return 'TIMESTAMP'
        if pd.api.types.is_bool_dtype(dtype):
            return 'BOOLEAN'
        # Signed integers only: an unsigned 64-bit value may not fit in BIGINT.
        if pd.api.types.is_signed_integer_dtype(dtype):
            return 'BIGINT'
        if pd.api.types.is_float_dtype(dtype):
            return 'FLOAT'
        return 'TEXT'

    columns = [
        f"    {col} {_sql_type(dtype)}"
        for col, dtype in df.dtypes.items()
    ]

    ddl = f"CREATE TABLE IF NOT EXISTS {table_name} (\n"
    ddl += ",\n".join(columns)
    ddl += "\n);"

    return ddl

# Save the generated CREATE TABLE statement to a .sql file in the silver folder;
# ddl_path stays defined for the cell that later reads the file back.
ddl_path = Path('../data/silver/ddl.sql')

with ddl_path.open(mode='w') as ddl_file:
    ddl_file.write(create_ddl_table(df, 'currencies_data'))

In [30]:
from dotenv import load_dotenv
import psycopg2
import os

load_dotenv('../.env')

def get_connection():
    """Open a new psycopg2 connection configured from environment variables.

    Reads DB_HOST, POSTGRES_DB, POSTGRES_USER, POSTGRES_PASSWORD and DB_PORT,
    falling back to localhost / postgres / postgres / postgres / 5432 when a
    variable is unset.

    Returns:
        The connection returned by ``psycopg2.connect``; the caller is
        responsible for closing it.

    Raises:
        psycopg2.Error: If the connection attempt fails. The error is printed
            first and then re-raised.
    """
    settings = {
        'host': os.getenv('DB_HOST', 'localhost'),
        'database': os.getenv('POSTGRES_DB', 'postgres'),
        'user': os.getenv('POSTGRES_USER', 'postgres'),
        'password': os.getenv('POSTGRES_PASSWORD', 'postgres'),
        'port': os.getenv('DB_PORT', 5432),
    }

    try:
        return psycopg2.connect(**settings)
    except psycopg2.Error as e:
        print(f"Failed to connect to the database: {e}")
        raise e

In [43]:
# Read the saved DDL back and apply it to the database.
with open(ddl_path, 'r') as f:
    ddl_sql = f.read()

conn = get_connection()
try:
    # `with conn` commits when the block succeeds and rolls back if it raises;
    # `with conn.cursor()` closes the cursor. Neither closes the connection,
    # hence the explicit close in `finally` so a failing statement cannot leak it.
    with conn, conn.cursor() as cursor:
        cursor.execute(ddl_sql)
finally:
    conn.close()


print("Table created successfully.")


Table created successfully.


In [44]:
from psycopg2 import sql

def insert_data(df: pd.DataFrame, table_name: str):
    """Insert every row of ``df`` into ``table_name`` in a single transaction.

    The statement is composed with ``psycopg2.sql`` so the table and column
    names are quoted as identifiers and the values are sent as parameters.
    If any row fails, the whole transaction is rolled back and the connection
    is still closed before the error propagates.

    Args:
        df: Frame to load; its column names must match the table's columns.
        table_name: Target table name.

    Raises:
        psycopg2.Error: If connecting or any INSERT fails.
    """
    query = sql.SQL("INSERT INTO {table} ({fields}) VALUES ({placeholders})").format(
        table=sql.Identifier(table_name),
        fields=sql.SQL(', ').join(map(sql.Identifier, df.columns)),
        placeholders=sql.SQL(', ').join(sql.Placeholder() * len(df.columns))
    )

    conn = get_connection()
    try:
        # `with conn` commits on success and rolls back on error; the cursor
        # context closes the cursor. The connection itself is closed in `finally`.
        with conn, conn.cursor() as cursor:
            # NOTE(review): one round trip per row. For large frames,
            # psycopg2.extras.execute_values / execute_batch would be faster.
            for row in df.itertuples(index=False, name=None):
                cursor.execute(query, row)
    finally:
        conn.close()

    print(f"Data inserted into {table_name} successfully.")

# NOTE(review): this is a plain INSERT, so re-running the cell appends the same
# rows again; truncate the table or add a key/upsert if re-runs are expected.
insert_data(df, 'currencies_data')


Data inserted into currencies_data successfully.
