Data Cleaning 

In [4]:
import pandas as pd

# 1. Read the raw CSV (decoded as latin-1).
df = pd.read_csv("Sample-Superstore.csv", encoding="latin-1")

# 2. Inspect the structure.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()              # dtypes and non-null counts
print(df.head())       # first rows
print(df.columns)      # column names

# 3. Strip surrounding whitespace from column names.
df.columns = df.columns.str.strip()

# 4. Drop exact duplicate rows.
df = df.drop_duplicates()

# 5. Make sure the numeric columns really are numeric.
# This runs BEFORE the null handling below: errors="coerce" turns unparsable
# values into NaN, and those rows must be removed by step 6 rather than
# slipping into the cleaned data.
numeric_cols = ["Sales", "Quantity", "Discount", "Profit"]  # adjust to your dataset
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# 6. Handle missing values: drop every row that has at least one null.
df = df.dropna()

# 7. Check the result again.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

In [5]:
# Persist the cleaned frame next to the notebook; the row index is not written.
df.to_csv(path_or_buf="Sample-Superstore-clean.csv", index=False)


EDA

In [18]:
# Quick look at the first ten rows of the current frame.
# Other ad-hoc inspection calls that can be swapped in here:
# df.info(), df.shape, df.columns

df.head(10)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164
5,6,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,...,90032,West,FUR-FU-10001487,Furniture,Furnishings,Eldon Expressions Wood and Plastic Desk Access...,48.86,7,0.0,14.1694
6,7,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,...,90032,West,OFF-AR-10002833,Office Supplies,Art,Newell 322,7.28,4,0.0,1.9656
7,8,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,...,90032,West,TEC-PH-10002275,Technology,Phones,Mitel 5320 IP Phone VoIP phone,907.152,6,0.2,90.7152
8,9,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,...,90032,West,OFF-BI-10003910,Office Supplies,Binders,DXL Angle-View Binders with Locking Rings by S...,18.504,3,0.2,5.7825
9,10,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,...,90032,West,OFF-AP-10002892,Office Supplies,Appliances,Belkin F5C206VTEL 6 Outlet Surge,114.9,5,0.0,34.47


In [6]:
import pandas as pd
import sqlite3

# Reload the cleaned CSV. It was written with DataFrame.to_csv, whose default
# encoding is UTF-8, so it is read back as UTF-8; decoding it as latin-1 would
# garble any non-ASCII characters in the text columns.
df = pd.read_csv("Sample-Superstore-clean.csv", encoding="utf-8")

# Connect to SQLite (this creates the superstore.db file if it does not exist).
conn = sqlite3.connect("superstore.db")

# Store the data in a table named "Superstore", replacing any previous copy.
# The connection is left open because the following query cell reuses `conn`.
df.to_sql("Superstore", conn, if_exists="replace", index=False)



9994

In [7]:
# Two example queries against the Superstore table:
#   1. a preview of the first 5 rows
#   2. total sales per category, largest first
preview_sql = "SELECT * FROM Superstore LIMIT 5;"
category_sales_sql = """
SELECT Category, SUM(Sales) AS Total_Sales
FROM Superstore
GROUP BY Category
ORDER BY Total_Sales DESC;
"""

# Run each statement on the open connection and print the resulting frame.
for query in (preview_sql, category_sales_sql):
    result = pd.read_sql(query, conn)
    print(result)


   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
1       2  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
2       3  CA-2016-138688   6/12/2016   6/16/2016    Second Class    DV-13045   
3       4  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   
4       5  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   

     Customer Name    Segment        Country             City  ...  \
0      Claire Gute   Consumer  United States        Henderson  ...   
1      Claire Gute   Consumer  United States        Henderson  ...   
2  Darrin Van Huff  Corporate  United States      Los Angeles  ...   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   

  Postal Code  Region       Product ID         Category Sub-Category  \
0       42420   Sout

In [11]:

# Preview the first 10 rows of the Superstore table on a fresh connection.
query = """
SELECT *
FROM Superstore
LIMIT 10;
"""

conn = sqlite3.connect("superstore.db")
try:
    df_sql = pd.read_sql(query, conn)
    print(df_sql)
finally:
    # Close the connection even when the query raises, so the database file
    # handle is never left open by a failed run of this cell.
    conn.close()

   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
1       2  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
2       3  CA-2016-138688   6/12/2016   6/16/2016    Second Class    DV-13045   
3       4  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   
4       5  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   
5       6  CA-2014-115812    6/9/2014   6/14/2014  Standard Class    BH-11710   
6       7  CA-2014-115812    6/9/2014   6/14/2014  Standard Class    BH-11710   
7       8  CA-2014-115812    6/9/2014   6/14/2014  Standard Class    BH-11710   
8       9  CA-2014-115812    6/9/2014   6/14/2014  Standard Class    BH-11710   
9      10  CA-2014-115812    6/9/2014   6/14/2014  Standard Class    BH-11710   

     Customer Name    Segment        Country             City  ...  \
0      Claire Gute   Consumer  United 

SQLite

In [58]:
import sqlite3
from pathlib import Path

def get_base_dir():
    """Return the directory used to resolve the project's data files.

    When ``__file__`` is defined, the result is the resolved parent directory
    of that file. When the name is undefined (looking it up raises NameError),
    the current working directory is returned instead.
    """
    try:
        script_path = Path(__file__)
    except NameError:
        # No __file__ in this execution context: fall back to the CWD.
        base_dir = Path.cwd()
    else:
        base_dir = script_path.resolve().parent
    return base_dir
# Directory against which the database path is resolved.
BASE = get_base_dir()
# Location of the SQLite database file that holds the shop schema.
DB_PATH = BASE / "shop.db"

# DDL script for the shop database, executed as a single script.
# It enables foreign-key enforcement, drops the five tables if they exist
# (sales, product, address, orders, customer) and recreates them:
#   customer - one row per customer_id
#   orders   - one row per order_id, referencing customer
#   address  - keyed by (order_id, customer_id), referencing orders and customer
#   product  - one row per product_id, referencing orders
#   sales    - keyed by (order_id, product_id), referencing product and orders
# The script ends by creating indexes on the foreign-key columns.
# NOTE: because of the DROP statements, running this script discards any rows
# already stored in those tables.
SCHEMA_SQL = """
PRAGMA foreign_keys = ON;

DROP TABLE IF EXISTS sales;
DROP TABLE IF EXISTS product;
DROP TABLE IF EXISTS address;
DROP TABLE IF EXISTS orders;
DROP TABLE IF EXISTS customer;

CREATE TABLE customer (
  customer_id   INTEGER PRIMARY KEY,
  customername  TEXT NOT NULL,
  segment       TEXT,
  created_at    TEXT
);

CREATE TABLE orders (
  order_id    INTEGER PRIMARY KEY,
  customer_id INTEGER NOT NULL,
  ship_mode   TEXT,
  ship_date   TEXT,
  created_at  TEXT,
  FOREIGN KEY (customer_id) REFERENCES customer(customer_id)
);

CREATE TABLE address (
  order_id    INTEGER NOT NULL,
  customer_id INTEGER NOT NULL,
  country     TEXT,
  city        TEXT,
  postalcode  TEXT,
  region      TEXT,
  PRIMARY KEY (order_id, customer_id),
  FOREIGN KEY (customer_id) REFERENCES customer(customer_id),
  FOREIGN KEY (order_id)    REFERENCES orders(order_id)
);

CREATE TABLE product (
  product_id   INTEGER PRIMARY KEY,
  order_id     INTEGER NOT NULL,
  productname  TEXT,
  category     TEXT,
  subcategory  TEXT,
  FOREIGN KEY (order_id) REFERENCES orders(order_id)
);

CREATE TABLE sales (
  order_id    INTEGER NOT NULL,
  product_id  INTEGER NOT NULL,
  sales       NUMERIC,
  quantity    INTEGER,
  discount    NUMERIC,
  profit      NUMERIC,
  PRIMARY KEY (order_id, product_id),
  FOREIGN KEY (product_id) REFERENCES product(product_id),
  FOREIGN KEY (order_id)   REFERENCES orders(order_id)
);

CREATE INDEX idx_orders_customer ON orders(customer_id);
CREATE INDEX idx_addr_order     ON address(order_id);
CREATE INDEX idx_product_order  ON product(order_id);
CREATE INDEX idx_sales_order    ON sales(order_id);
CREATE INDEX idx_sales_product  ON sales(product_id);
"""

def create_tables(db_path=DB_PATH):
    """Create (or recreate) the shop schema in the SQLite file at ``db_path``.

    Executes ``SCHEMA_SQL`` as a script. That script drops the customer,
    orders, address, product and sales tables if they exist before creating
    them again, so any rows already stored in those tables are discarded.

    Args:
        db_path: Location of the SQLite database file, as a ``str`` or a
            ``pathlib.Path``. Defaults to ``DB_PATH``.
    """
    # Normalize so plain string paths work as well as Path objects; the
    # .as_posix() / .resolve() calls below need a Path.
    db_path = Path(db_path)
    conn = sqlite3.connect(db_path.as_posix())
    try:
        conn.execute("PRAGMA foreign_keys = ON;")
        conn.executescript(SCHEMA_SQL)
        conn.commit()
        print("Schema created in:", db_path.resolve())
    finally:
        # Always release the database handle, even if the script fails.
        conn.close()

# Build the schema when this code runs as the main module. Executing the cell
# in the notebook also satisfies this guard, so the tables are (re)created
# every time the cell is run.
if __name__ == "__main__":
    create_tables()


Schema created in: /Users/brunoaro/Desktop/CodeAcademy/GitHub_Repository/Project_5/shop.db


In [59]:
import sqlite3
import pandas as pd
from pathlib import Path

def get_base_dir():
    """Return the directory used to locate the database and the CSV file.

    If ``__file__`` is defined, its resolved parent directory is returned.
    If looking up ``__file__`` raises NameError, the current working
    directory is returned instead.
    """
    try:
        this_file = Path(__file__)
    except NameError:
        # __file__ is not defined here, so use the working directory.
        return Path.cwd()
    return this_file.resolve().parent
# Directory against which the database and CSV paths are resolved.
BASE = get_base_dir()

# SQLite database file to load into, and the cleaned CSV to load from.
DB_PATH  = BASE / "shop.db"
CSV_PATH = BASE / "Sample-Superstore-clean.csv"  # adjust if your filename differs

def load_simple(csv_path=CSV_PATH, db_path=DB_PATH):
    """Split the cleaned Superstore CSV into five tables and append them to SQLite.

    Steps: read the CSV, normalize the column names, convert the two date
    columns to ``YYYY-MM-DD`` strings, derive integer surrogate keys, slice
    the frame into customer / orders / address / product / sales tables, and
    write each one with ``to_sql(..., if_exists="append")``. A summary of the
    row counts is printed at the end.

    Args:
        csv_path: Path of the CSV to load (``str`` or ``pathlib.Path``).
            Defaults to ``CSV_PATH``.
        db_path: Path of the SQLite database file (``str`` or
            ``pathlib.Path``). Defaults to ``DB_PATH``.

    Note:
        Rows are appended, never replaced, so the function is not meant to be
        run twice against the same already-loaded database.
    """
    # 1) Read CSV: try the default decoding first, fall back to latin-1.
    try:
        df = pd.read_csv(csv_path)
    except UnicodeDecodeError:
        df = pd.read_csv(csv_path, encoding="latin-1")

    # 2) Normalize names: trimmed, lower-case, spaces and hyphens -> underscores,
    #    then collapse the few names that the target columns spell as one word.
    df.columns = [c.strip().lower().replace(" ", "_").replace("-", "_") for c in df.columns]
    df = df.rename(columns={"product_name":"productname", "sub_category":"subcategory", "postal_code":"postalcode"})

    # 3) Dates: unparsable values become NaT, everything else an ISO date string.
    for col in ["order_date", "ship_date"]:
        df[col] = pd.to_datetime(df[col], errors="coerce").dt.strftime("%Y-%m-%d")

    # 4) Surrogate keys: 1-based integer codes per distinct order / customer id.
    df["order_pk"]    = pd.factorize(df["order_id"].astype(str))[0] + 1
    df["customer_pk"] = pd.factorize(df["customer_id"].astype(str))[0] + 1
    df = df.reset_index(drop=True)
    # Every CSV row gets its own product key (row position + 1).
    df["product_pk"]  = df.index + 1

    # 5) Slice into tables
    # One row per customer: after sorting, the first row kept per customer_pk
    # carries that customer's smallest order_date, stored as created_at.
    customers = (df.sort_values(["customer_pk","order_date"])
                   .drop_duplicates("customer_pk")
                   .rename(columns={"customer_name":"customername"}))
    customer_tbl = customers[["customer_pk","customername","segment","order_date"]].rename(
        columns={"customer_pk":"customer_id","order_date":"created_at"})

    # One row per order: the first row of each order_pk after sorting.
    orders = df.sort_values(["order_pk","order_date"]).drop_duplicates("order_pk")
    orders_tbl = orders[["order_pk","customer_pk","ship_mode","ship_date","order_date"]].rename(
        columns={"order_pk":"order_id","customer_pk":"customer_id","order_date":"created_at"})

    # Address columns are taken from the same one-row-per-order frame.
    address_tbl = orders[["order_pk","customer_pk","country","city","postalcode","region"]].rename(
        columns={"order_pk":"order_id","customer_pk":"customer_id"})

    product_tbl = df[["product_pk","order_pk","productname","category","subcategory"]].rename(
        columns={"product_pk":"product_id","order_pk":"order_id"})

    sales_tbl = df[["order_pk","product_pk","sales","quantity","discount","profit"]].rename(
        columns={"order_pk":"order_id","product_pk":"product_id"})

    # 6) Write. Using a sqlite3 connection as a context manager only commits or
    #    rolls back the transaction; it does not close the connection, so the
    #    close is done explicitly in the finally clause.
    conn = sqlite3.connect(Path(db_path).as_posix())
    try:
        with conn:
            conn.execute("PRAGMA foreign_keys = ON;")
            # Parent tables first, so rows that reference them are written last.
            customer_tbl.to_sql("customer", conn, if_exists="append", index=False)
            orders_tbl.to_sql("orders",   conn, if_exists="append", index=False)
            address_tbl.to_sql("address", conn, if_exists="append", index=False)
            product_tbl.to_sql("product", conn, if_exists="append", index=False)
            sales_tbl.to_sql("sales",     conn, if_exists="append", index=False)
    finally:
        conn.close()

    print("Loaded:",
          len(customer_tbl), "customers,",
          len(orders_tbl), "orders,",
          len(address_tbl), "addresses,",
          len(product_tbl), "products,",
          len(sales_tbl), "sales rows")

# Entry point: load the data only when both the database file and the CSV are
# present; otherwise report which prerequisite is missing (database first).
if __name__ == "__main__":
    if DB_PATH.exists() and CSV_PATH.exists():
        print("Using DB:", DB_PATH.resolve())
        print("Using CSV:", CSV_PATH.resolve())
        load_simple()
    elif not DB_PATH.exists():
        print("Run: python create_schema.py  (to create shop.db first)")
    else:
        print("CSV not found at:", CSV_PATH)



Using DB: /Users/brunoaro/Desktop/CodeAcademy/GitHub_Repository/Project_5/shop.db
Using CSV: /Users/brunoaro/Desktop/CodeAcademy/GitHub_Repository/Project_5/Sample-Superstore-clean.csv
Loaded: 793 customers, 5009 orders, 5009 addresses, 9994 products, 9994 sales rows


In [61]:
import sqlite3, pandas as pd
from pathlib import Path

# Sanity-check the loaded database: where it lives, which tables it has,
# how many rows each table holds, and whether any order lacks a date.
DB = "shop.db"
print("CWD:", Path.cwd())
print("DB exists:", Path(DB).exists(), "→", Path(DB).resolve())

# A sqlite3 connection used as a context manager is not closed on exit, so the
# connection is opened explicitly and closed in the finally clause.
conn = sqlite3.connect(DB)
try:
    # What tables do we have?
    print(pd.read_sql("SELECT name FROM sqlite_master WHERE type='table' ORDER BY 1;", conn))
    # Row counts
    counts = pd.read_sql("""
        SELECT 'customer' AS table_name, COUNT(*) AS rows FROM customer
        UNION ALL SELECT 'orders', COUNT(*) FROM orders
        UNION ALL SELECT 'address', COUNT(*) FROM address
        UNION ALL SELECT 'product', COUNT(*) FROM product
        UNION ALL SELECT 'sales', COUNT(*) FROM sales;
    """, conn)
    print(counts)
    # Are order dates missing?
    print("orders.created_at NULLs:",
          pd.read_sql("SELECT COUNT(*) AS nulls FROM orders WHERE created_at IS NULL;", conn))
finally:
    conn.close()


CWD: /Users/brunoaro/Desktop/CodeAcademy/GitHub_Repository/Project_5
DB exists: True → /Users/brunoaro/Desktop/CodeAcademy/GitHub_Repository/Project_5/shop.db
       name
0   address
1  customer
2    orders
3   product
4     sales
  table_name  rows
0   customer   793
1     orders  5009
2    address  5009
3    product  9994
4      sales  9994
orders.created_at NULLs:    nulls
0      0
