# ETL Extract Lab - DSA 2040A


## Project Setup
 This notebook demonstrates:
- Full dataset extraction
- Incremental extraction based on last run timestamp
- Proper ETL workflow practices


In [2]:
#Import Required Libraries
import pandas as pd
from datetime import datetime

In [None]:
# Section 1: Full Extraction

def full_extraction(file_path):
    """Perform a full extraction of the dataset"""
    try:
        df = pd.read_csv(file_path)
        print("Full extraction completed successfully.")
        print(f"Extracted {len(df)} rows fully.")
        
        # Display basic stats
        print("\nDataset Info:")
        print(df.info())
        
        print("\nSample Data:")
        return df.head()
    except Exception as e:
        print(f"Error during full extraction: {e}")
        return None


In [4]:
# Full extraction example
file_path = "custom_data.csv"
full_extraction(file_path)

Error during full extraction: [Errno 2] No such file or directory: 'custom_data.csv'


In [5]:
# Incremental Extraction

# %%
def read_last_extraction_time():
    """Read the last extraction timestamp from file"""
    try:
        with open('last_extraction.txt', 'r') as f:
            return datetime.strptime(f.read().strip(), '%Y-%m-%d %H:%M:%S')
    except (FileNotFoundError, ValueError):
        # Default to beginning of time if file doesn't exist
        return datetime.min

# %%
def incremental_extraction(file_path):
    """Perform incremental extraction based on last run"""
    try:
        # Read the complete data
        full_df = pd.read_csv(file_path)
        
        # Convert Date column to datetime
        full_df['Date'] = pd.to_datetime(full_df['Date'])
        
        # Get last extraction time
        last_time = read_last_extraction_time()
        print(f"Last extraction was at: {last_time}")
        
        # Filter for new records
        new_data = full_df[full_df['Date'] > last_time]
        
        print(f"Extracted {len(new_data)} rows incrementally since last check.")
        return new_data
    except Exception as e:
        print(f"Error during incremental extraction: {e}")
        return None

# Example incremental extraction
incremental_data = incremental_extraction(file_path)
if incremental_data is not None and not incremental_data.empty:
    print("\nNew data since last extraction:")
    print(incremental_data)



Error during incremental extraction: [Errno 2] No such file or directory: 'custom_data.csv'


In [6]:

# Section 3: Save New Timestamp
# %%
def update_extraction_time():
    """Update the last extraction timestamp to current time"""
    try:
        current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        with open('last_extraction.txt', 'w') as f:
            f.write(current_time)
        print(f"Updated last extraction time to: {current_time}")
    except Exception as e:
        print(f"Error updating extraction time: {e}")

# Update timestamp after successful incremental extraction
update_extraction_time()

Updated last extraction time to: 2025-06-15 14:42:13


In [7]:
# Section 4: Update Extraction Timestamp

def update_extraction_time():
    """Update the last extraction timestamp to current time"""
    try:
        current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        with open('last_extraction.txt', 'w') as f:
            f.write(current_time)
        print(f"Updated last extraction time to: {current_time}")
    except Exception as e:
        print(f"Error updating extraction time: {e}")

update_extraction_time()


Updated last extraction time to: 2025-06-15 14:42:13


In [9]:
# Section 5. Transform Full Data

def transform_data(df):
    """Apply transformations to the dataset"""
    if df is None or df.empty:
        print("No data to transform.")
        return df

    # 1. Cleaning
    df = df.drop_duplicates()
    df.fillna({'Quantity': 0, 'Price per Unit': 0}, inplace=True)

    # 2. Enrichment
    df['Computed Total'] = df['Quantity'] * df['Price per Unit']

    # 3. Structural
    df['Date'] = pd.to_datetime(df['Date'])  # Standardize date
    df['Age Group'] = pd.cut(df['Age'],
                                bins=[0, 18, 35, 50, 100],
                                labels=['Teen', 'Young Adult', 'Middle Aged', 'Senior'])

    print("Transformation complete.")
    return df


In [None]:
df_full_transformed = transform_data(df_full)
df_full_transformed.to_csv("transformed_full.csv", index=False)
print("Transformed full dataset saved to transformed_full.csv")
