In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os

In [4]:
# Read the transformed full and incremental CSV files into DataFrames
# (paths are relative to the notebook's working directory).
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell shows a warning
# pointing at the incremental read -- presumably pandas' mixed-type
# DtypeWarning (TODO confirm). pandas' documented remedies for that warning
# are an explicit `dtype=` mapping or low_memory=False.
df_full = pd.read_csv('transformed/transformed_full.csv', low_memory=False)
df_inc = pd.read_csv('transformed/transformed_incremental.csv', low_memory=False)
print("✅ DataFrames loaded successfully!")
print("Full dataset shape:", df_full.shape)
print("Incremental dataset shape:", df_inc.shape)

✅ DataFrames loaded successfully!
Full dataset shape: (251930, 39)
Incremental dataset shape: (49743, 39)


  df_inc = pd.read_csv('transformed/transformed_incremental.csv')


I created two separate database connections:
  - `full_data.db` → stores the full dataset
  - `incremental_data.db` → stores the incremental dataset

In [5]:
# Option 1: Load into SQLite
# sqlite3.connect() creates a missing database file but not a missing parent
# directory, so make sure 'loaded/' exists before connecting.
os.makedirs('loaded', exist_ok=True)

# One database file per dataset. if_exists='replace' drops and recreates the
# table, so re-running this cell does not append duplicate rows; index=False
# keeps the DataFrame index out of the table.
conn_full = sqlite3.connect('loaded/full_data.db')
df_full.to_sql('full_data', conn_full, if_exists='replace', index=False)

conn_inc = sqlite3.connect('loaded/incremental_data.db')
df_inc.to_sql('incremental_data', conn_inc, if_exists='replace', index=False)

# Both connections are left open on purpose: the verification cell further
# down queries through them and then closes them.
print("✅ Data loaded into SQLite successfully!")

✅ Data loaded into SQLite successfully!


Verify the Load by:
- Running a simple SQL query to fetch the first 5 records.
- Displaying the results in a pandas DataFrame to check column names, data types, and values.

Finally, I close both database connections so that the database file handles are released.

In [None]:
# Verify SQLite load
# Preview the first five rows of each table through the connections opened in
# the SQLite load cell. The try/finally guarantees that both connections are
# closed even if a query or a display call raises.
# Note: the connections are closed once this cell finishes, so re-run the
# SQLite load cell before re-running this one.
try:
    preview_full = pd.read_sql('SELECT * FROM full_data LIMIT 5', conn_full)
    print("Full Data - SQL preview:")
    display(preview_full)

    preview_inc = pd.read_sql('SELECT * FROM incremental_data LIMIT 5', conn_inc)
    print("Incremental Data - SQL preview:")
    display(preview_inc)
finally:
    conn_full.close()
    conn_inc.close()

Full Data - SQL preview:


Unnamed: 0,Transaction_ID,Customer_ID,Name,Email,Phone,Address,City,State,Zipcode,Country,...,products,Age_filled_flag,Name_filled,Email_filled,Phone_filled,Address_filled,Ratings_missing,Sales_Tier,Age_Group,Is_High_Value
0,8691788,37249.0,Michelle Harrington,Ebony39@gmail.com,1414787000.0,3959 Amanda Burgs,Dortmund,Berlin,77985.0,Germany,...,Cycling shorts,0,Michelle Harrington,Ebony39@gmail.com,1414786801.0,3959 Amanda Burgs,0,Medium,YoungAdult,0
1,2174773,69749.0,Kelsey Hill,Mark36@gmail.com,6852900000.0,82072 Dawn Centers,Nottingham,England,99071.0,Uk,...,Lenovo Tab,0,Kelsey Hill,Mark36@gmail.com,6852899987.0,82072 Dawn Centers,0,High,YoungAdult,0
2,6679610,30192.0,Scott Jensen,Shane85@gmail.com,8362160000.0,4133 Young Canyon,Geelong,New South Wales,75929.0,Australia,...,Sports equipment,0,Scott Jensen,Shane85@gmail.com,8362160449.0,4133 Young Canyon,0,Premium,MidAge,0
3,7232460,62101.0,Joseph Miller,Mary34@gmail.com,2776752000.0,8148 Thomas Creek Suite 100,Edmonton,Ontario,88420.0,Canada,...,Utility knife,0,Joseph Miller,Mary34@gmail.com,2776751724.0,8148 Thomas Creek Suite 100,0,Premium,MidAge,0
4,6095326,41289.0,Ryan Johnson,Haley12@gmail.com,3292677000.0,532 Ashley Crest Suite 014,Brisbane,New South Wales,74430.0,Australia,...,Lenovo Tab,0,Ryan Johnson,Haley12@gmail.com,3292677006.0,532 Ashley Crest Suite 014,0,Premium,MidAge,0


Incremental Data - SQL preview:


Unnamed: 0,Transaction_ID,Customer_ID,Name,Email,Phone,Address,City,State,Zipcode,Country,...,products,Age_filled_flag,Name_filled,Email_filled,Phone_filled,Address_filled,Ratings_missing,Sales_Tier,Age_Group,Is_High_Value
0,4983775,27901.0,Debra Coleman,Charles30@gmail.com,9098268000.0,5813 Lori Ports Suite 269,Bristol,England,48704.0,Uk,...,Chocolate cookies,0,Debra Coleman,Charles30@gmail.com,9098267635.0,5813 Lori Ports Suite 269,0,Medium,YoungAdult,0
1,4155845,80175.0,Diane Clark,Martin39@gmail.com,6219780000.0,8823 Mariah Heights Apt. 263,Wollongong,New South Wales,39820.0,Australia,...,Dark chocolate,0,Diane Clark,Martin39@gmail.com,6219779557.0,8823 Mariah Heights Apt. 263,0,Premium,MidAge,0
2,2846832,31930.0,Kristine Williams,Charles29@gmail.com,1822768000.0,60629 Jones Villages,Portsmouth,England,38173.0,Uk,...,Bottled water,0,Kristine Williams,Charles29@gmail.com,1822767586.0,60629 Jones Villages,0,Premium,MidAge,0
3,7728099,78376.0,Ronald Chen,Nicole72@gmail.com,1796133000.0,6048 Charles Lake,Portsmouth,England,82975.0,Uk,...,Orange juice,0,Ronald Chen,Nicole72@gmail.com,1796133281.0,6048 Charles Lake,0,Medium,MidAge,0
4,9476855,42410.0,Anthony Rodriguez,Paula49@gmail.com,3745118000.0,585 Malik Lodge Suite 186,Portsmouth,England,83903.0,Uk,...,Self-help,0,Anthony Rodriguez,Paula49@gmail.com,3745118170.0,585 Malik Lodge Suite 186,0,Medium,YoungAdult,0


Save Outputs:

 - Write both datasets as Parquet files to the `loaded/` folder, alongside the SQLite database files.

In [7]:
#  Save to Parquet
# Make sure the output folder exists so this cell does not depend on an
# earlier cell (or a manual step) having created 'loaded/'.
os.makedirs('loaded', exist_ok=True)

# index=False keeps the DataFrame index out of the Parquet files.
df_full.to_parquet('loaded/full_data.parquet', index=False)
df_inc.to_parquet('loaded/incremental_data.parquet', index=False)

print("✅ Data saved as Parquet successfully!")

✅ Data saved as Parquet successfully!


Verifying Parquet Load

To confirm that the Parquet files were written correctly, I use pandas’ `read_parquet()` function.

I then display the first 5 rows of the full dataset to check that the columns and data match the transformed input file.


In [8]:
# Read the full-dataset Parquet file back and keep its first five rows
# (.iloc[:5] selects the same rows as .head()).
parquet_preview = pd.read_parquet('loaded/full_data.parquet').iloc[:5]
print("Parquet preview:")
display(parquet_preview)

Parquet preview:


Unnamed: 0,Transaction_ID,Customer_ID,Name,Email,Phone,Address,City,State,Zipcode,Country,...,products,Age_filled_flag,Name_filled,Email_filled,Phone_filled,Address_filled,Ratings_missing,Sales_Tier,Age_Group,Is_High_Value
0,8691788,37249.0,Michelle Harrington,Ebony39@gmail.com,1414787000.0,3959 Amanda Burgs,Dortmund,Berlin,77985.0,Germany,...,Cycling shorts,False,Michelle Harrington,Ebony39@gmail.com,1414786801.0,3959 Amanda Burgs,False,Medium,YoungAdult,False
1,2174773,69749.0,Kelsey Hill,Mark36@gmail.com,6852900000.0,82072 Dawn Centers,Nottingham,England,99071.0,Uk,...,Lenovo Tab,False,Kelsey Hill,Mark36@gmail.com,6852899987.0,82072 Dawn Centers,False,High,YoungAdult,False
2,6679610,30192.0,Scott Jensen,Shane85@gmail.com,8362160000.0,4133 Young Canyon,Geelong,New South Wales,75929.0,Australia,...,Sports equipment,False,Scott Jensen,Shane85@gmail.com,8362160449.0,4133 Young Canyon,False,Premium,MidAge,False
3,7232460,62101.0,Joseph Miller,Mary34@gmail.com,2776752000.0,8148 Thomas Creek Suite 100,Edmonton,Ontario,88420.0,Canada,...,Utility knife,False,Joseph Miller,Mary34@gmail.com,2776751724.0,8148 Thomas Creek Suite 100,False,Premium,MidAge,False
4,6095326,41289.0,Ryan Johnson,Haley12@gmail.com,3292677000.0,532 Ashley Crest Suite 014,Brisbane,New South Wales,74430.0,Australia,...,Lenovo Tab,False,Ryan Johnson,Haley12@gmail.com,3292677006.0,532 Ashley Crest Suite 014,False,Premium,MidAge,False
