## ETL Extract Lab - DSA 2040A


## Project Setup
 This notebook demonstrates:
- Full dataset extraction
- Incremental extraction based on last run timestamp
- Proper ETL workflow practices


In [21]:
#Import Required Libraries
import pandas as pd
from datetime import datetime

### 🟨 Section 1: Full Extraction

In [41]:
import pandas as pd

def wrangle_full(csv_file):
    """Run a full extraction of a CSV file and print a short summary.

    The whole file is loaded with pandas; the row count, column count,
    column names and the first five rows are printed.

    Parameters
    ----------
    csv_file : str or path-like
        Location of the CSV file to load.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data, or None when any exception occurs (the error
        is printed instead of being raised).
    """
    try:
        frame = pd.read_csv(csv_file)
        row_count, column_count = frame.shape

        # Headline numbers
        print("✅ Full extraction completed successfully.")
        print(f"Extracted {row_count} rows fully.")
        print(f"Number of columns: {column_count}")

        # Schema overview
        print("\n📊 Column Names:")
        print(frame.columns.tolist())

        # First five rows as a quick sanity check
        print("\n🔍 Sample Records:")
        print(frame.head(5))
    except Exception as err:
        print("❌ Error during full extraction:", err)
        return None
    else:
        return frame



In [42]:
# Run the full extraction against the lab dataset and keep the result
file_path = "custom_data.csv"
df_full = wrangle_full(csv_file=file_path)

✅ Full extraction completed successfully.
Extracted 2003 rows fully.
Number of columns: 9

📊 Column Names:
['Transaction ID', 'Date', 'Customer ID', 'Gender', 'Age', 'Product Category', 'Quantity', 'Price per Unit', 'Total Amount']

🔍 Sample Records:
  Transaction ID        Date Customer ID  Gender Age Product Category  \
0              1  2023-11-24     CUST001    Male  34           Beauty   
1              2  2023-02-27     CUST002  Female  26         Clothing   
2              3  2023-01-13     CUST003    Male  50      Electronics   
3              4  2023-05-21     CUST004    Male  37         Clothing   
4              5  2023-05-06     CUST005    Male  30           Beauty   

  Quantity Price per Unit Total Amount  
0        3             50          150  
1        2            500         1000  
2        1             30           30  
3        1            500          500  
4        2             50          100  


In [24]:
# Plain-text preview of the first five extracted rows
print(df_full.head(n=5))

  Transaction ID        Date Customer ID  Gender Age Product Category  \
0              1  2023-11-24     CUST001    Male  34           Beauty   
1              2  2023-02-27     CUST002  Female  26         Clothing   
2              3  2023-01-13     CUST003    Male  50      Electronics   
3              4  2023-05-21     CUST004    Male  37         Clothing   
4              5  2023-05-06     CUST005    Male  30           Beauty   

  Quantity Price per Unit Total Amount  
0        3             50          150  
1        2            500         1000  
2        1             30           30  
3        1            500          500  
4        2             50          100  


In [25]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly: wrapping it in print() would append a stray
# "None" line after the report.
print("📊 Dataset Info:")
df_full.info()

📊 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2003 entries, 0 to 2002
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    2003 non-null   object
 1   Date              2001 non-null   object
 2   Customer ID       2001 non-null   object
 3   Gender            2001 non-null   object
 4   Age               2001 non-null   object
 5   Product Category  2001 non-null   object
 6   Quantity          2001 non-null   object
 7   Price per Unit    2001 non-null   object
 8   Total Amount      2001 non-null   object
dtypes: object(9)
memory usage: 141.0+ KB
None


In [31]:
# Convert 'Date' to datetime values (entries that cannot be parsed become NaT)
# and discard the rows whose date could not be parsed
df_full = (
    df_full
    .assign(Date=pd.to_datetime(df_full['Date'], errors='coerce'))
    .dropna(subset=['Date'])
)

# Report the date range covered by the remaining rows
start_date, end_date = df_full['Date'].min(), df_full['Date'].max()

print(f"📅 Earliest date in data: {start_date}")
print(f"📅 Latest date in data: {end_date}")

📅 Earliest date in data: 2023-01-01 00:00:00
📅 Latest date in data: 2024-01-01 00:00:00


In [35]:
# Order the rows chronologically (oldest first) and renumber the index from 0,
# then show the ten most recent rows
df_full = df_full.sort_values('Date', ignore_index=True)
df_full.tail(10)

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
1990,805,2023-12-29,CUST805,Female,30,Beauty,3,500,1500
1991,805,2023-12-29,CUST805,Female,30,Beauty,3,500,1500
1992,908,2023-12-29,CUST908,Male,46,Beauty,4,300,1200
1993,908,2023-12-29,CUST908,Male,46,Beauty,4,300,1200
1994,857,2023-12-31,CUST857,Male,60,Electronics,2,25,50
1995,857,2023-12-31,CUST857,Male,60,Electronics,2,25,50
1996,650,2024-01-01,CUST650,Male,55,Electronics,1,30,30
1997,211,2024-01-01,CUST211,Male,42,Beauty,3,500,1500
1998,650,2024-01-01,CUST650,Male,55,Electronics,1,30,30
1999,211,2024-01-01,CUST211,Male,42,Beauty,3,500,1500


### 🟨 Section 2: Incremental Extraction

🕒 Incremental Extraction using Timestamp

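In this ETL pipeline, incremental extraction is based on a timestamp saved in the `last_extraction.txt` file (format `YYYY-MM-DD HH:MM:SS`). Only rows whose `Date` is later than that saved timestamp are extracted.

For this lab, I set the timestamp in that file manually.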

In [46]:
def read_last_extraction_time(file_path='last_extraction.txt'):
    """Return the extraction time stored in the timestamp file.

    The file is expected to hold one timestamp written as
    'YYYY-MM-DD HH:MM:SS'; surrounding whitespace is ignored.

    Parameters
    ----------
    file_path : str, default 'last_extraction.txt'
        Text file holding the saved timestamp.

    Returns
    -------
    datetime
        The parsed timestamp, or datetime.min when the file does not
        exist or its content does not match the expected format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    try:
        with open(file_path, 'r') as handle:
            stored_text = handle.read().strip()
            parsed = datetime.strptime(stored_text, timestamp_format)
    except (FileNotFoundError, ValueError):
        # No usable timestamp on record: fall back to the earliest datetime
        return datetime.min
    return parsed

def wrangle_incremental(csv_file, timestamp_file='last_extraction.txt'):
    """Extract only the rows whose 'Date' is later than the last extraction time.

    Parameters
    ----------
    csv_file : str or path-like
        CSV file to read. Its 'Date' column is parsed as 'YYYY-MM-DD';
        rows whose date cannot be parsed are dropped.
    timestamp_file : str, default 'last_extraction.txt'
        File holding the previous extraction time, read through
        read_last_extraction_time.

    Returns
    -------
    pandas.DataFrame
        Rows with 'Date' strictly later than the saved time, sorted by
        'Date' ascending. An empty DataFrame is returned when any
        exception occurs (the error is printed instead of being raised).
    """
    try:
        df = pd.read_csv(csv_file)

        # Try parsing the Date column safely
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce', format='%Y-%m-%d')

        # Drop rows where date couldn't be parsed
        df = df.dropna(subset=['Date'])

        # Ensure dates are sorted
        df = df.sort_values('Date')

        last_time = read_last_extraction_time(timestamp_file)

        # read_last_extraction_time falls back to datetime.min (year 1) when no
        # timestamp is on record. That value lies below the range of pandas'
        # nanosecond Timestamp, and comparing the datetime64 column against it
        # can raise on pandas versions that cannot represent it, which would
        # turn the very first run into an "error" with zero rows. Every parsed
        # date is inside the pandas range, so a saved time at or below the
        # lower bound means "everything is new".
        earliest_comparable = pd.Timestamp.min.to_pydatetime(warn=False)
        if last_time <= earliest_comparable:
            new_data = df
        else:
            new_data = df[df['Date'] > last_time]

        print(f"✅ Extracted {len(new_data)} rows incrementally since last check ({last_time}).")
        return new_data
    except Exception as e:
        # Best-effort: report the problem and hand back an empty frame
        print("❌ Error during incremental extraction:", e)
        return pd.DataFrame()


### 🟩 Section 3: Save New Timestamp

In [47]:
def update_extraction_time(file_path='last_extraction.txt', timestamp=None):
    """Overwrite the timestamp file with the given or the current time.

    Parameters
    ----------
    file_path : str, default 'last_extraction.txt'
        File that receives the timestamp; existing content is replaced.
    timestamp : datetime, optional
        Value to store. When omitted, the current local time is used.

    Notes
    -----
    The value is written as 'YYYY-MM-DD HH:MM:SS'. Any exception is
    printed instead of being raised.
    """
    try:
        stamp = datetime.now() if timestamp is None else timestamp
        with open(file_path, 'w') as handle:
            handle.write(stamp.strftime('%Y-%m-%d %H:%M:%S'))
        print(f"✅ Updated last extraction time to: {stamp}")
    except Exception as err:
        print("❌ Error writing to timestamp file:", err)


#### ✅ Example Incremental Extraction

In [48]:
# Extract new data since last time
new_data = wrangle_incremental("custom_data.csv")

# Advance the saved watermark only if we found new data.
# The incremental filter compares the saved value against the 'Date' column,
# so the watermark has to be the newest 'Date' that was actually extracted.
# Saving the wall-clock time instead would place the watermark after every
# date in the file, and later runs would extract nothing even when rows with
# newer dates are added.
if not new_data.empty:
    update_extraction_time(timestamp=new_data['Date'].max())


✅ Extracted 0 rows incrementally since last check (2025-06-30 20:18:52).
