# Project 6: **e-commerce** 

The project's initial objectives include working with SQL and RDBMS, from desiging a DB to establishing a python-sql connector to work with a DBs and python for more flexible analysis.  

In [63]:
import numpy as np
import pandas as pd
import sqlite3
from sqlite3 import Error

**Step 1 - Data import**

In [None]:
df_ec = pd.read_csv('Sample-Superstore.csv', sep=',', encoding='utf-8') # given it's not UTC-8 encoded it will return an error

In [65]:
# code to review the encoding of files  

import chardet

# look at the first 100k bytes to guess the character encoding
with open("Sample-Superstore.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))

# check what the character encoding might be
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [66]:

# code to overcome erorr, reads encoding until match is found   
try:
    df_ec = pd.read_csv('Sample-Superstore.csv', sep=',', encoding='utf-8')
except UnicodeDecodeError:
    try:
        df_ec = pd.read_csv('Sample-Superstore.csv', sep=',', encoding='latin1')
    except UnicodeDecodeError:
        try:
            df_ec = pd.read_csv('Sample-Superstore.csv', sep=',', encoding='ISO-8859-1')
        except UnicodeDecodeError:
            df_ec = pd.read_csv('Sample-Superstore.csv', sep=',', encoding='cp1252')

**Step 2 - Intial data overview**

In [67]:
# data frame inspection
df_ec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

In [68]:
# quick sample of data frame 
df_ec.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [69]:
df_ec['Category'].value_counts()

Category
Office Supplies    6026
Furniture          2121
Technology         1847
Name: count, dtype: int64

In [70]:
df_ec['Sub-Category'].value_counts()

Sub-Category
Binders        1523
Paper          1370
Furnishings     957
Phones          889
Storage         846
Art             796
Accessories     775
Chairs          617
Appliances      466
Labels          364
Tables          319
Envelopes       254
Bookcases       228
Fasteners       217
Supplies        190
Machines        115
Copiers          68
Name: count, dtype: int64

**Step 3 - Adjusting dtypes for specific columns (e.g., Date columns)**

In [71]:
df_ec['Order Date'] = pd.to_datetime(df_ec['Order Date'])
df_ec['Ship Date'] = pd.to_datetime(df_ec['Ship Date'])

**Step 4 - New Features - Numeric, uniqueIDs, etc** 

In [72]:
# creating a price column for products

df_ec['Price'] = df_ec['Sales'] / df_ec['Quantity']

In [73]:
# creating a cost per product column

df_ec['ProductCost'] = ((df_ec['Sales'] - df_ec['Profit']) / df_ec['Quantity'])

In [74]:
# creating a total cost product

df_ec['TotalCost'] = df_ec['Sales'] - df_ec['Profit']

In [75]:
# creating a discount value column to display it as number instead of decimal/rate

df_ec['DiscountValue'] = df_ec['Sales'] * df_ec['Discount']

In [76]:
# creating a OriginalSaleValue value column to display the value before applying the discount rate

df_ec['OriginalSaleValue'] = (df_ec['Sales'] * ( 1 + df_ec['Discount']))


In [77]:
# creation of a SaleID column for the new Sales data frame 

# addition of new Unique Identifier column 
df_ec.insert(0,'SaleID', None)


# flling column with a created unique identifier per row 
length = 9994
df_ec['SaleID'] = ['S' + str(i) for i in range (1, length+1)]

In [97]:
df_ec.head()


Unnamed: 0,SaleID,RowID,OrderID,OrderDate,ShipDate,ShipMode,CustomerID,CustomerName,Segment,Country,...,Profit,Price,ProductCost,TotalCost,DiscountValue,OriginalSaleValue,UniqueOrderID,UniqueProductID,Year,Month
0,S1,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,...,41.9136,130.98,110.0232,220.0464,0.0,261.96,CA-2016-152156-1,0,2016,11
1,S2,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,...,219.582,243.98,170.786,512.358,0.0,731.94,CA-2016-152156-2,1,2016,11
2,S3,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,...,6.8714,7.31,3.8743,7.7486,0.0,14.62,CA-2016-138688-1,2,2016,6
3,S4,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,...,-383.031,191.5155,268.1217,1340.6085,430.909875,1388.487375,US-2015-108966-1,3,2015,10
4,S5,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,...,2.5164,11.184,9.9258,19.8516,4.4736,26.8416,US-2015-108966-2,4,2015,10


In [78]:
# creation of UniqueOrderID column for the new Order data frame 
# new dictionary to store the count of occurrences for each OrderID
order_count = {}

# function to generate the unique order ID
def generate_unique_order_id(order_id):
    if order_id not in order_count:
        order_count[order_id] = 1
    else:
        order_count[order_id] += 1
    return f"{order_id}-{order_count[order_id]}"

# applying the function to create the UniqueOrderID column in df
df_ec['UniqueOrderID'] = df_ec['Order ID'].apply(generate_unique_order_id)

In [80]:
# creating a UniqueProductID given a ProductID can have multiple ProductNames

# inserting new column UniqueProductID
df_ec['UniqueProductID'] = df_ec['Product ID'].astype(str) + '_' + df_ec['Product Name']

# mapping unique combinations for a unique identifier (which could work as composite key in sql)
unique_ids = {val: idx for idx, val in enumerate(df_ec['UniqueProductID'].unique())}
df_ec['UniqueProductID'] = df_ec['UniqueProductID'].map(unique_ids)

print(df_ec)

     SaleID  Row ID        Order ID Order Date  Ship Date       Ship Mode  \
0        S1       1  CA-2016-152156 2016-11-08 2016-11-11    Second Class   
1        S2       2  CA-2016-152156 2016-11-08 2016-11-11    Second Class   
2        S3       3  CA-2016-138688 2016-06-12 2016-06-16    Second Class   
3        S4       4  US-2015-108966 2015-10-11 2015-10-18  Standard Class   
4        S5       5  US-2015-108966 2015-10-11 2015-10-18  Standard Class   
...     ...     ...             ...        ...        ...             ...   
9989  S9990    9990  CA-2014-110422 2014-01-21 2014-01-23    Second Class   
9990  S9991    9991  CA-2017-121258 2017-02-26 2017-03-03  Standard Class   
9991  S9992    9992  CA-2017-121258 2017-02-26 2017-03-03  Standard Class   
9992  S9993    9993  CA-2017-121258 2017-02-26 2017-03-03  Standard Class   
9993  S9994    9994  CA-2017-119914 2017-05-04 2017-05-09    Second Class   

     Customer ID     Customer Name    Segment        Country  ... Quantity 

In [81]:
# creating column with order month and year given SQLite does not suport YEAR() or MONTH()

df_ec['Year'], df_ec['Month'] = df_ec["Order Date"].dt.year, df_ec["Order Date"].dt.month

**Step 5 - Consistent Column labelling**

In [82]:
# adjusting column labels to PascalCase to comply with SQL convention
df_ec.rename(columns={'SaleID':'SaleID',
              'Row ID':'RowID',
              'Order ID': 'OrderID',
              'UniqueOrderID':'UniqueOrderID',
              'Order Date': 'OrderDate',
              'Ship Date': 'ShipDate',
              'Ship Mode': 'ShipMode',
              'Customer ID': 'CustomerID',
              'Customer Name': 'CustomerName',
              'Segment': 'Segment',
              'Country': 'Country',
              'City': 'City',
              'State': 'State',
              'Postal Code': 'PostalCode',
              'Region': 'Region',
              'Product ID': 'ProductID',
              'Category': 'Category',
              'Sub-Category': 'SubCategory',
              'Product Name': 'ProductName',
              'Sales': 'Sales',
              'Quantity': 'Quantity',
              'Discount': 'Discount',
              'Profit': 'Profit',  
              'Price': 'Price',
              'ProductCost':'ProductCost',
              'TotalCost': 'TotalCost',
              'DiscountValue': 'DiscountValue',
              'OriginalSaleValue': 'OriginalSaleValue'}, inplace=True)

In [83]:
df_ec.head()

Unnamed: 0,SaleID,RowID,OrderID,OrderDate,ShipDate,ShipMode,CustomerID,CustomerName,Segment,Country,...,Profit,Price,ProductCost,TotalCost,DiscountValue,OriginalSaleValue,UniqueOrderID,UniqueProductID,Year,Month
0,S1,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,...,41.9136,130.98,110.0232,220.0464,0.0,261.96,CA-2016-152156-1,0,2016,11
1,S2,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,...,219.582,243.98,170.786,512.358,0.0,731.94,CA-2016-152156-2,1,2016,11
2,S3,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,...,6.8714,7.31,3.8743,7.7486,0.0,14.62,CA-2016-138688-1,2,2016,6
3,S4,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,...,-383.031,191.5155,268.1217,1340.6085,430.909875,1388.487375,US-2015-108966-1,3,2015,10
4,S5,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,...,2.5164,11.184,9.9258,19.8516,4.4736,26.8416,US-2015-108966-2,4,2015,10


**Step 6 - SQL DB preparation - splitting tables**

In [84]:
# splitting the data frame into new tables for SQL DB design 

df_orders = df_ec[['UniqueOrderID','OrderID','OrderDate', 'ShipDate', 'ShipMode', 'PostalCode']].copy()

df_sales = df_ec[['SaleID', 'Sales', 'Quantity', 'Discount', 'Profit', 'UniqueOrderID']].copy()

df_product = df_ec[['UniqueProductID', 'ProductID','Category', 'SubCategory', 'ProductName', 'UniqueOrderID']].copy()

df_location = df_ec[['PostalCode', 'Country', 'City', 'State', 'Region', 'OrderID']].copy()

df_customer = df_ec[['CustomerID','CustomerName', 'Segment', 'SaleID']].copy()

In [85]:
df_customer['CustomerID'].value_counts()

CustomerID
WB-21850    37
JL-15835    34
MA-17560    34
PP-18955    34
CK-12205    32
            ..
LD-16855     1
AO-10810     1
CJ-11875     1
RE-19405     1
JR-15700     1
Name: count, Length: 793, dtype: int64

In [86]:
# Droping duplicates in dataframes where applicable

df_customer.drop_duplicates(subset=['CustomerID'], keep='first', inplace=True)
df_product.drop_duplicates(subset=['UniqueProductID'], keep='first', inplace=True)
df_location.drop_duplicates(subset=['PostalCode'], keep='first', inplace=True)


In [87]:
# checking the unique contraint for primary keys is met across the dataframes tables 

print('Duplicates in df_product:', df_product['UniqueProductID'].duplicated().any())
print('Duplicates in df_customer:', df_customer['CustomerID'].duplicated().any())
print('Duplicates in df_location:', df_location['PostalCode'].duplicated().any())
print('Duplicates in df_sales:', df_sales['SaleID'].duplicated().any())
print('Duplicates in df_orders:', df_orders['UniqueOrderID'].duplicated().any())

Duplicates in df_product: False
Duplicates in df_customer: False
Duplicates in df_location: False
Duplicates in df_sales: False
Duplicates in df_orders: False


In [88]:
# reseting the index of the new created dataframes

df_customer.reset_index(drop=True, inplace=True)

df_sales.reset_index(drop=True, inplace=True)

df_orders.reset_index(drop=True, inplace=True)

df_product.reset_index(drop=True, inplace=True)

df_location.reset_index(drop=True, inplace=True)

**Step 7 - Connecting Python and SQL**

In [89]:
# new sqldatabase file while assiging a connection object stored in the db_conn variable

db_conn = sqlite3.connect("Sample-Superstore.db")

# creation of the cursor serves to execute SQL statements and it is connected to the variable/database used in this particular case

c = db_conn.cursor()


**Step 8 - Creating the specific tables for the DB using the connector**

In [90]:
# new tables will be created empty and have to match the columns in their corresponding data frame


# LOCATION
c.execute(
    """
    CREATE TABLE Location (
        PostalCode INT NOT NULL,
        Country TEXT NOT NULL,
        City TEXT NOT NULL,
        State TEXT NOT NULL,
        Region TEXT NOT NULL,
        OrderID VARCHAR NOT NULL, 
        PRIMARY KEY(PostalCode)
        );
    """
)

# ORDERS
c.execute(
    """
    CREATE TABLE Orders (
        UniqueOrderID VARCHAR NOT NULL,
        OrderID VARCHAR NOT NULL,
        OrderDate DATE NOT NULL,
        ShipDate DATE NOT NULL,
        ShipMode TEXT NOT NULL,
        PostalCode INT NOT NULL,
        PRIMARY KEY(UniqueOrderID),
        FOREIGN KEY(PostalCode) REFERENCES Location(PostalCode)
        );
    """
)

# SALES
c.execute(
    """
    CREATE TABLE Sales (
        SaleID VARCHAR NOT NULL,
        Sales REAL NOT NULL,
        Quantity INTEGER NOT NULL,
        Discount REAL NOT NULL,
        Profit REAL NOT NULL,
        UniqueOrderID VARCHAR NOT NULL,
        PRIMARY KEY(SaleID),
        FOREIGN KEY(UniqueOrderID) REFERENCES Orders(UniqueOrderID)
        );
     """
)

# PRODUCTS
c.execute(
    """
    CREATE TABLE Products (
        UniqueProductID VARCHAR NOT NULL,
        ProductID VARCHAR NOT NULL,
        Category TEXT NOT NULL,
        SubCategory TEXT NOT NULL,
        ProductName TEXT NOT NULL,
        UniqueOrderID VARCHAR NOT NULL,
        PRIMARY KEY(UniqueProductID),
        FOREIGN KEY(UniqueOrderID) REFERENCES Orders(UniqueOrderID)
        );
    """
)

# CUSTOMER
c.execute(
    """
    CREATE TABLE Customers (
        CustomerID VARCHAR NOT NULL,
        CustomerName TEXT,
        Segment TEXT,
        SaleID VARCHAR NOT NULL,
        PRIMARY KEY(CustomerID),
        FOREIGN KEY(SaleID) REFERENCES Sales(SaleID)
        );
   """
)


OperationalError: table Location already exists

In [None]:
# if needed when restarting or running all cells again one should first delete the created tables, otherwise the cell above will return an error 

# List of tables to delete
#tables_to_delete = ['Location', 'Orders', 'Sales', 'Products', 'Customers']

# Loop through the list and delete each table
#for table_name in tables_to_delete:
#    c.execute(f"DROP TABLE IF EXISTS {table_name};")

# Commit the changes and close the connection
#db_conn.commit()

**Step 9 - Filling in SQL tables with the dataframes**

In [None]:
# SQL tables will be populated with the content of references dataframe

df_location.to_sql('Location', db_conn, if_exists='append', index=False)
df_orders.to_sql('Orders', db_conn, if_exists='append', index=False)
df_sales.to_sql('Sales', db_conn, if_exists='append', index=False)
df_product.to_sql('Products', db_conn, if_exists='append', index=False)
df_customer.to_sql('Customers', db_conn, if_exists='append', index=False)

793

**Step 10 - Query excution to retrieve for answers**

In [93]:
# simple query example to ensure it works   

c.execute("SELECT * FROM Location LIMIT 5")
rows = c.fetchall()


<sqlite3.Cursor at 0x23f3a9afbc0>