In [9]:
# ----------------------------------
# Import required packages
# ----------------------------------
import pandas as pd
import os

# ----------------------------------
# Define file path
# ----------------------------------
# NOTE(review): absolute, machine-specific path; this cell only finds the
# file on a machine with the same directory layout.
file_path = r"C:\Repos\smart-store-agnesmrutu\data\raw\customers_data.csv"

# ----------------------------------
# Check if file exists before loading
# ----------------------------------
# isfile (rather than exists) so that a directory at this path is reported
# as missing instead of failing later inside read_csv.
if not os.path.isfile(file_path):
    print("⚠️ File not found! Please check the file path:", file_path)
else:
    # ----------------------------------
    # Load dataset
    # ----------------------------------
    customers_df = pd.read_csv(file_path)

    # ----------------------------------
    # Basic Data Overview
    # ----------------------------------
    print("✅ File successfully loaded!")
    print("\n--- Dataset Info ---")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; wrapping it in print() emitted a stray "None".
    customers_df.info()

    print("\n--- First 5 Rows ---")
    print(customers_df.head())

    print("\n--- Missing Values ---")
    print(customers_df.isna().sum())

    # ----------------------------------
    # Descriptive Statistics
    # ----------------------------------
    print("\n--- Descriptive Statistics (Numeric Columns) ---")
    print(customers_df.describe())

    # ----------------------------------
    # For categorical columns
    # ----------------------------------
    # Show the five most frequent values of every object-dtype column.
    print("\n--- Categorical Column Value Counts ---")
    for col in customers_df.select_dtypes(include=['object']).columns:
        print(f"\n{col}:")
        print(customers_df[col].value_counts().head())

✅ File successfully loaded!

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   CustomerID  201 non-null    int64 
 1   Name        201 non-null    object
 2   Region      201 non-null    object
 3   JoinDate    201 non-null    object
dtypes: int64(1), object(3)
memory usage: 6.4+ KB
None

--- First 5 Rows ---
   CustomerID           Name   Region    JoinDate
0        1000   Robert Gomez     West  2024-02-25
1        1001     John Silva     East  2020-12-01
2        1002  Mark Marshall  Central  2020-08-08
3        1003  David Brennan    North  2020-05-21
4        1004  Kerry Collins    North  2023-09-12

--- Missing Values ---
CustomerID    0
Name          0
Region        0
JoinDate      0
dtype: int64

--- Descriptive Statistics (Numeric Columns) ---
        CustomerID
count   201.000000
mean   1099.029851
std      58.117804
mi

Products Data

In [10]:
# ----------------------------------
# Import required packages
# ----------------------------------
import pandas as pd
import os

# ----------------------------------
# Define file path
# ----------------------------------
# NOTE(review): absolute, machine-specific path; this cell only finds the
# file on a machine with the same directory layout.
file_path = r"C:\Repos\smart-store-agnesmrutu\data\raw\products_data.csv"

# ----------------------------------
# Check if file exists before loading
# ----------------------------------
# isfile (rather than exists) so that a directory at this path is reported
# as missing instead of failing later inside read_csv.
if not os.path.isfile(file_path):
    print("⚠️ File not found! Please check the file path:", file_path)
else:
    # ----------------------------------
    # Load dataset
    # ----------------------------------
    # Stored under its own name: this cell previously assigned the products
    # data to `customers_df`, overwriting the customers frame.
    products_df = pd.read_csv(file_path)

    # ----------------------------------
    # Basic Data Overview
    # ----------------------------------
    print("✅ File successfully loaded!")
    print("\n--- Dataset Info ---")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; wrapping it in print() emitted a stray "None".
    products_df.info()

    print("\n--- First 5 Rows ---")
    print(products_df.head())

    print("\n--- Missing Values ---")
    print(products_df.isna().sum())

    # ----------------------------------
    # Descriptive Statistics
    # ----------------------------------
    print("\n--- Descriptive Statistics (Numeric Columns) ---")
    print(products_df.describe())

    # ----------------------------------
    # For categorical columns
    # ----------------------------------
    # Show the five most frequent values of every object-dtype column.
    print("\n--- Categorical Column Value Counts ---")
    for col in products_df.select_dtypes(include=['object']).columns:
        print(f"\n{col}:")
        print(products_df[col].value_counts().head())

✅ File successfully loaded!

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    int64  
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   UnitPrice    100 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 3.2+ KB
None

--- First 5 Rows ---
   ProductID           ProductName     Category  UnitPrice
0       2000        Electronics-Be  Electronics     969.31
1       2001        Electronics-Be     Clothing     412.35
2       2002         Office-Family     Clothing     866.00
3       2003  Electronics-Training     Clothing     385.18
4       2004          Office-Where         Home     927.25

--- Missing Values ---
ProductID      0
ProductName    0
Category       0
UnitPrice      0
dtype: int64

--- Descriptive Statistics (Numeric Columns) -

Sales Data

In [20]:
# ----------------------------------
# Import required packages
# ----------------------------------
import pandas as pd
import os
import re


# ----------------------------------
# Helper: convert an amount column to numbers
# ----------------------------------
def clean_amount_column(raw_amounts):
    """Return a numeric version of a column of sale amounts.

    Parameters
    ----------
    raw_amounts : pandas.Series
        The column to convert; may hold numbers or text such as "$1,200.50".

    Returns
    -------
    pandas.Series
        The input itself when it already has a numeric dtype; otherwise a new
        Series in which values that cannot be parsed are NaN.
    """
    # A column that is already numeric needs no text cleaning. Round-tripping
    # it through str would mangle values such as 1e-05 (-> "105").
    if pd.api.types.is_numeric_dtype(raw_amounts):
        return raw_amounts

    # Drop everything except digits, the decimal point and the minus sign.
    # The minus sign is kept so negative amounts are not turned positive.
    stripped = raw_amounts.astype(str).str.replace(r'[^\d.\-]', '', regex=True)
    return pd.to_numeric(stripped, errors='coerce')


# ----------------------------------
# Define file path
# ----------------------------------
# NOTE(review): absolute, machine-specific path; this cell only finds the
# file on a machine with the same directory layout.
file_path = r"C:\Repos\smart-store-agnesmrutu\data\raw\sales_data.csv"

# ----------------------------------
# Check if file exists before loading
# ----------------------------------
# isfile (rather than exists) so that a directory at this path is reported
# as missing instead of failing later inside read_csv.
if not os.path.isfile(file_path):
    print("⚠️ File not found! Please check the file path:", file_path)
else:
    # ----------------------------------
    # Load dataset
    # ----------------------------------
    sales_df = pd.read_csv(file_path)

    # ----------------------------------
    # Clean column names (strip spaces)
    # ----------------------------------
    sales_df.columns = sales_df.columns.str.strip()

    # ----------------------------------
    # Use the exact 'SaleAmount' column
    # ----------------------------------
    target_sales_col = 'SaleAmount'

    if target_sales_col not in sales_df.columns:
        raise ValueError(
            f"⚠️ Column '{target_sales_col}' not found in dataset. Available columns: {list(sales_df.columns)}"
        )

    # ----------------------------------
    # Clean sales column: remove commas, currency symbols, whitespace
    # ----------------------------------
    sales_df[target_sales_col] = clean_amount_column(sales_df[target_sales_col])

    # ----------------------------------
    # Calculate summary statistics
    # ----------------------------------
    # mean/min/max skip NaN, so unparseable amounts do not affect the summary.
    avg_sales = sales_df[target_sales_col].mean()
    min_sales = sales_df[target_sales_col].min()
    max_sales = sales_df[target_sales_col].max()

    print("\n--- Sales Amount Summary ---")
    print(f"Column used: {target_sales_col}")
    print(f"Estimated Average Sales Amount: ${avg_sales:,.2f}")
    print(f"Minimum Sales Amount: ${min_sales:,.2f}")
    print(f"Maximum Sales Amount: ${max_sales:,.2f}")


--- Sales Amount Summary ---
Column used: SaleAmount
Estimated Average Sales Amount: $1,014.27
Minimum Sales Amount: $0.00
Maximum Sales Amount: $6,619.20
