#### Day.1_Write an extract() function (read CSV/API).

In [27]:
import pandas as pd

def extract(file_path: str) -> pd.DataFrame:
    """Read the CSV at ``file_path`` and hand it back as a DataFrame.

    If reading raises an exception, the error is reported on stdout and an
    empty DataFrame is returned instead of propagating the failure.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        print(f"Extraction failed with error: {err}")
        frame = pd.DataFrame()
    return frame
    
    
# Smoke-test the extraction step: read the sample sales CSV and show the result.
file_path = "Sample Dataset/sales_data.csv"
df = extract(file_path)  # raw rows; the transform check cell takes this frame as input
print(df)
    

   order_id  order_date customer_name     product  quantity  price
0      1001  2025-01-01      John Doe      Laptop         1   1200
1      1002  2025-01-02    Jane Smith       Phone         2    600
2      1003  2025-01-03       Bob Lee  Headphones         3    100
3      1004  2025-01-03      Anna Kim      Laptop         1   1200
4      1005  2025-01-04      John Doe       Phone         1    600


#### Day.2_Transform Function

In [28]:
def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the extracted rows and derive a revenue column.

    Exact duplicate rows are dropped, every missing value is replaced with 0,
    and ``total_revenue`` is added as ``quantity * price``. The input frame is
    not modified. On any exception the error is printed and an empty
    DataFrame is returned.
    """
    try:
        # Dedupe first, then fill gaps, so the product below never sees NaN.
        cleaned = df.drop_duplicates().fillna(0)
        enriched = cleaned.assign(
            total_revenue=cleaned["quantity"] * cleaned["price"]
        )

        print("Transformation completed")
        return enriched
    except Exception as err:
        print(f"Transformation failed with error: {err}")
        return pd.DataFrame()


    
# Run the transformation on the extracted frame and show the cleaned result.
transformed_df = transform(df)
print(transformed_df)

Transformation completed
   order_id  order_date customer_name     product  quantity  price  \
0      1001  2025-01-01      John Doe      Laptop         1   1200   
1      1002  2025-01-02    Jane Smith       Phone         2    600   
2      1003  2025-01-03       Bob Lee  Headphones         3    100   
3      1004  2025-01-03      Anna Kim      Laptop         1   1200   
4      1005  2025-01-04      John Doe       Phone         1    600   

   total_revenue  
0           1200  
1           1200  
2            300  
3           1200  
4            600  


#### Day.3_Load Function (SQLite in place of Postgres/MySQL)

In [29]:
import sqlite3

def load(df: pd.DataFrame, db_name: str = "sales.db"):
    """Write ``df`` to the ``sales_data`` table of a SQLite database.

    The table is replaced if it already exists. After writing, the row count
    is read back from the table and printed. Errors are printed rather than
    raised, and the function returns nothing.

    Args:
        df: Rows to store; the DataFrame index is not written.
        db_name: Path of the SQLite database file to connect to.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_name)

        # Load data into the database, overwriting any previous table.
        df.to_sql("sales_data", conn, if_exists="replace", index=False)

        # Verify how many rows were loaded; fetchone() yields a 1-tuple, e.g. (5,).
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM sales_data")
        row_count = cursor.fetchone()[0]
        print(f"{row_count} rows loaded into the database")
        print(f"Load successfully completed. {row_count} rows loaded into 'sales_data'.")
    except Exception as e:
        print(f"Load task failed with error: {e}")
    finally:
        # Release the connection even when the write or the count query fails;
        # conn stays None if connect() itself raised.
        if conn is not None:
            conn.close()
        
        
        
# Write the transformed rows to the default database ("sales.db") as a check.
load(transformed_df)

5 rows loaded into the database
Load successfully completed. 5 rows loaded into 'sales_data'.


#### Day.4_Chain Extract → Transform → Load

In [30]:
# Run the three stages end to end: each stage consumes the previous stage's
# output. `file_path` is the CSV path assigned in the extraction check cell.
if __name__ == "__main__":
    raw_df = extract(file_path)
    clean_df = transform(raw_df)
    # load() returns nothing, so its result is not bound to a name.
    load(clean_df, "sales.db")


Transformation completed
5 rows loaded into the database
Load successfully completed. 5 rows loaded into 'sales_data'.


#### Day.5_Logging + Error Handling

In [33]:
import pandas as pd
import sqlite3
import logging

# Send every record at INFO level or above to the "etl_pipeline_log" file,
# formatted as "<timestamp> - <level> - <message>".
# NOTE(review): no encoding is passed, so the platform default is used for the
# log file — confirm it can represent the emoji used in the log messages.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="etl_pipeline_log",
)


# Extract Function
def extract(file_path: str) -> pd.DataFrame:
    """Extract data from a CSV file into a DataFrame.

    A successful read is logged at INFO level. If reading raises, the error is
    logged at ERROR level and an empty DataFrame is returned instead.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed rows, or an empty DataFrame when the read failed.
    """
    try:
        df = pd.read_csv(file_path)
        logging.info("✅ Extracted data successfully")
        return df

    except Exception as e:
        logging.error(f"Extraction failed with error: {e}")
        return pd.DataFrame()
    
# Transform Function
def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the extracted rows and derive a revenue column, logging the outcome.

    An empty input is returned unchanged after a warning. Otherwise exact
    duplicate rows are dropped, missing ``customer_name`` values become
    "Unknown", and ``total_revenue`` is added as ``quantity * price``. On any
    exception the error is logged and an empty DataFrame is returned.
    """
    try:
        if df.empty:
            logging.warning(" No data to transform")
            return df

        deduped = df.drop_duplicates()
        # Only customer_name gets a placeholder; other columns keep their gaps.
        named = deduped.fillna({"customer_name": "Unknown"})
        enriched = named.assign(
            total_revenue=named["quantity"] * named["price"]
        )

        logging.info("Transformation successful")
        return enriched
    except Exception as err:
        logging.error(f"Transformation failed with error: {err}")
        return pd.DataFrame()
    
# Load Function
def load(df: pd.DataFrame, db_name: str = "sales.db"):
    """Write ``df`` to the ``sales_data`` table of a SQLite database.

    An empty frame is skipped with a warning and no connection is opened.
    Otherwise the table is replaced with the rows of ``df``. Failures are
    logged at ERROR level rather than raised, and the function returns nothing.

    Args:
        df: Rows to store; the DataFrame index is not written.
        db_name: Path of the SQLite database file to connect to.
    """
    conn = None
    try:
        if df.empty:
            logging.warning("⚠️ No data to load into DB")
            return

        conn = sqlite3.connect(db_name)

        # Load data into the database, overwriting any previous table.
        df.to_sql("sales_data", conn, if_exists="replace", index=False)

        logging.info("✅ Loaded data into DB successfully")
    except Exception as e:
        logging.error(f" Load failed: {e}")
    finally:
        # Close the connection on success and on failure alike; conn is still
        # None when the frame was empty or connect() itself raised.
        if conn is not None:
            conn.close()
        
        
# Main ETL Pipeline: run extract -> transform -> load once, logging the start
# and the finish. `file_path` is the CSV path assigned in the extraction check cell.

if __name__ == "__main__":
    logging.info("🚀 ETL pipeline started")

    # Step 1: Extract
    raw_df = extract(file_path)

    # Step 2: Transform
    clean_df = transform(raw_df)

    # Step 3: Load
    load(clean_df, "sales.db")

    logging.info("🏁 ETL pipeline finished")
    # Printed unconditionally: stage failures are only recorded in the log.
    print("Logging sucessfully completed")
    


Logging sucessfully completed
