# EDA on Silver Tables

In [None]:
# Append '/workspace/etl' and '/workspace/etl/utils' to the module search path.
# This has to run before the project-local imports further down in this cell
# (`extract`, `step2_load_to_postgres`, `utils_connection`), which are
# presumably located in those directories - TODO confirm.
import sys
sys.path.append('/workspace/etl')
sys.path.append('/workspace/etl/utils')
# Echo the resulting search path so a failed import is easy to diagnose.
print(sys.path)

# Importing Modules
import os
import boto3
import logging
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from extract import DataExtractor
from step2_load_to_postgres import DataLoader
from utils_connection import get_s3_parquet_file_key, get_connection_uri

In [None]:
# Cell 2: Load environment variables and verify access to the S3 bucket
load_dotenv()

# Fetch AWS credentials from environment variables
s3_access_key_id = os.getenv('S3_ACCESS_KEY_ID')
s3_secret_access_key = os.getenv('S3_SECRET_ACCESS_KEY')
s3_region = os.getenv('S3_REGION')
s3_bucket_name = os.getenv('S3_BUCKET_NAME')

# Report only whether each credential is present. Notebook output is stored
# alongside the code, so printing the raw values would leak the secrets to
# anyone who can read the saved notebook.
print("S3_ACCESS_KEY_ID: ", "<set>" if s3_access_key_id else "<missing>")
print("S3_SECRET_ACCESS_KEY: ", "<set>" if s3_secret_access_key else "<missing>")
print("S3_BUCKET_NAME: ", s3_bucket_name)

# Initialize a session using boto3
session = boto3.Session(
    aws_access_key_id=s3_access_key_id,
    aws_secret_access_key=s3_secret_access_key,
    region_name=s3_region
)

# Initialize S3 client
s3_client = session.client('s3')

# Example: List objects in the bucket to verify access.
# A single list_objects_v2 call returns one page of results (at most 1,000
# keys per the boto3 documentation), which is enough for a connectivity check.
try:
    response = s3_client.list_objects_v2(
        Bucket=s3_bucket_name  # Bucket name read from S3_BUCKET_NAME
    )
    # Print object keys if listing was successful; 'Contents' is absent when
    # nothing is returned, hence the empty-list default.
    print("Objects in bucket:")
    for obj in response.get('Contents', []):
        print(obj['Key'])
except Exception as e:
    # Best-effort check: report the failure without aborting the notebook.
    print(f"Error accessing bucket: {str(e)}")

In [3]:
class DataSilverEDA:
    """Exploratory-data-analysis helpers for tables loaded from PostgreSQL.

    The instance owns a SQLAlchemy engine (``self.engine``) that is used to
    read whole tables into pandas DataFrames. The ``display_*`` methods only
    print diagnostics to stdout and return nothing.
    """

    def __init__(self):
        """Initialize the DataSilverEDA class with a SQLAlchemy engine."""
        self.engine = create_engine(get_connection_uri())

    def get_data_from_postgres_to_pd(self, schema_name: str, table_name: str) -> "pd.DataFrame | None":
        """Loads data from a PostgreSQL table in a given schema into a Pandas DataFrame.

        Args:
            schema_name: Schema that contains the table.
            table_name: Table to read in full (``SELECT *``).

        Returns:
            The table contents as a DataFrame, or ``None`` if the query failed
            (the error is printed rather than raised).
        """
        # NOTE: identifiers are interpolated into the SQL text unescaped, so
        # both names must come from trusted code, never from user input.
        query = f"SELECT * FROM {schema_name}.{table_name}"
        try:
            df = pd.read_sql(query, self.engine)
        except Exception as e:
            print(f"Error loading data from {schema_name}.{table_name}: {e}")
            return None
        print(f"Data loaded successfully from {schema_name}.{table_name}")
        return df

    def display_csv_summary_statistics(self, df: pd.DataFrame) -> None:
        """Displays summary information for the cleaned CSV-snapshot DataFrame.

        Prints the first rows and null counts of the key columns, the unique
        values of selected columns, the column dtypes and ``df.info()``.

        Args:
            df: DataFrame that contains the columns referenced below; a
                missing column raises ``KeyError``.
        """
        key_columns = ['entry_date', 'appt_date', 'state', 'zip', 'set', 'demo']

        print("Summary statistics:")
        print(df[key_columns].head())

        # Count of null values in key columns
        print("\nCount of null values in key columns:")
        print(df[key_columns].isnull().sum())

        # Checking specific columns for unique values
        columns_to_check = [
            "_extraction_date",
            "_partition_date",
            "state",
            "zip",
            "set",
            "demo",
            "job_status",
        ]

        for column in columns_to_check:
            print(f"\nUnique values in '{column}' column:")
            print(df[column].unique())

        # Checking the data types of all columns
        print("\nData types of all columns:")
        print(df.dtypes)

        # Basic information about the cleaned DataFrame.
        # DataFrame.info() writes its report to stdout itself and returns
        # None, so it must not be wrapped in print() (that would emit a
        # stray "None" line after the report).
        print("\nBasic information about the cleaned DataFrame:")
        df.info()

    def display_parquet_data_info(self, df: pd.DataFrame) -> None:
        """Displays information related to Parquet data.

        Prints the shape, per-column missing-value counts (non-zero only),
        duplicate-row counts, uniqueness of the first three columns, dtypes
        and the first rows.

        Args:
            df: DataFrame that contains ``email_hash`` and ``phone_hash``
                columns; a missing column raises ``KeyError``.
        """
        # Check the shape of the DataFrame
        print(f"DataFrame Shape: {df.shape}")

        # Check for missing values; only columns with at least one are shown
        missing_values = df.isnull().sum()
        print("Missing Values in Each Column:")
        print(missing_values[missing_values > 0])

        # Check for fully duplicated rows
        duplicate_count = df.duplicated().sum()
        print(f"Number of Duplicate Rows: {duplicate_count}")

        # Check for duplicates in the combination of email_hash and phone_hash
        email_phone_duplicates_count = df.duplicated(subset=['email_hash', 'phone_hash']).sum()
        print(f"Number of Duplicate Rows based on email_hash and phone_hash: {email_phone_duplicates_count}")

        # Check the uniqueness of the first three columns
        # (assumed to be ID columns - TODO confirm against the table schema)
        for col in df.columns[:3]:
            unique_count = df[col].nunique()
            total_count = df[col].shape[0]
            print(f"Unique values in '{col}': {unique_count} out of {total_count} total")

        # Check data types of the columns
        print("\nData Types of Columns:")
        print(df.dtypes)

        # Check the first few rows of the DataFrame
        print("\nFirst Few Rows of Data:")
        print(df.head())

In [None]:
logging.basicConfig(level=logging.INFO)

# Example schema and table names
schema_names = ['bronze', 'silver']
# NOTE(review): bronze_table_names is not used in this cell; kept in case
# other cells rely on it.
bronze_table_names = ['leads_parquet', 'csv_snapshots']
silver_table_names = ['stg_leads_parquet', 'stg_csv_snapshots']

# Instantiate the DataSilverEDA
eda = DataSilverEDA()

# Get Silver Schema
silver_schema = schema_names[1]

# Load data from the silver tables and perform EDA
for table_name in silver_table_names:
    # Use the schema selected above instead of repeating the 'silver' literal,
    # so the schema is defined in exactly one place.
    silver_data = eda.get_data_from_postgres_to_pd(silver_schema, table_name)
    if silver_data is None:
        # The load failed and the error was already printed by the loader.
        continue

    print(f"\nPerforming EDA on table: {table_name}")

    # Determine the type of table and perform EDA accordingly
    if "stg_leads_parquet" in table_name:
        eda.display_parquet_data_info(silver_data)  # For Parquet files
    elif "stg_csv_snapshots" in table_name:
        eda.display_csv_summary_statistics(silver_data)  # For CSV files