In [1]:
#Creating a copy of database file for backup
import shutil
shutil.copyfile('WalmartSalesData.db', 'WalmartSalesData_copy.db')

#Load the SQL Extension
%load_ext sql

#Connecting to SQLite database (use sqlalchemy-1.4.49)
%sql sqlite:///WalmartSalesData_copy.db

%sql SELECT * FROM sales LIMIT 5;

 * sqlite:///WalmartSalesData_copy.db
Done.


invoice_id,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_pct,total,date,time,payment,cogs,gross_margin_pct,gross_income,rating
750-67-8428,A,Yangon,Member,Female,,74.69,7,26.1415,548.9715,2019-01-05,13:08:00,Ewallet,522.83,4.761904762,26.1415,9.1
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2019-02-08,10:37:00,Ewallet,604.17,4.761904762,30.2085,5.3
699-14-3026,C,Naypyitaw,Normal,Male,Electronic accessories,85.39,7,29.8865,627.6165,2019-03-25,18:30:00,Ewallet,597.73,4.761904762,29.8865,4.1
355-53-5943,A,Yangon,Member,Female,Electronic accessories,68.84,6,20.652,433.692,2019-02-25,14:36:00,Ewallet,413.04,4.761904762,20.652,5.8
315-22-5665,C,Naypyitaw,Normal,Female,Home and lifestyle,73.56,10,36.78,772.38,2019-02-24,11:38:00,Ewallet,735.6,4.761904762,36.78,8.0


#### Checklist for data cleaning
Data type, Data range, Null, Unique data, Regex, Cross-field validation, Accurate, Complete, Consistent

#### Removing white spaces, deleting rows with NULL or empty strings

In [2]:
%%sql
UPDATE sales
SET
    invoice_id       = TRIM(invoice_id),
    branch           = TRIM(branch),
    city             = TRIM(city),
    customer_type    = TRIM(customer_type),
    gender           = TRIM(gender),
    product_line     = TRIM(product_line),
    date             = TRIM(date),
    time             = TRIM(time),
    payment          = TRIM(payment)
    ;

DELETE FROM sales
WHERE 
    invoice_id IS NULL OR invoice_id = ''
OR  branch IS NULL OR branch = ''
OR  city IS NULL OR city = ''
OR  customer_type IS NULL OR customer_type = ''
OR  gender IS NULL OR gender = ''
OR  product_line IS NULL OR product_line = ''
OR  date IS NULL OR date = ''
OR  time IS NULL OR time = ''
OR  payment IS NULL OR payment = ''
;

 * sqlite:///WalmartSalesData_copy.db
994 rows affected.
7 rows affected.


[]

In [3]:
%%sql 
SELECT invoice_id, branch, city, customer_type, product_line, date, time, payment
FROM sales 
LIMIT 5;

 * sqlite:///WalmartSalesData_copy.db
Done.


invoice_id,branch,city,customer_type,product_line,date,time,payment
373-73-7910,A,Yangon,Normal,Sports and travel,2019-02-08,10:37:00,Ewallet
699-14-3026,C,Naypyitaw,Normal,Electronic accessories,2019-03-25,18:30:00,Ewallet
355-53-5943,A,Yangon,Member,Electronic accessories,2019-02-25,14:36:00,Ewallet
315-22-5665,C,Naypyitaw,Normal,Home and lifestyle,2019-02-24,11:38:00,Ewallet
665-32-9167,A,Yangon,Member,Health and beauty,2019-01-10,17:15:00,Credit card


In [4]:
%%sql
SELECT invoice_id
FROM sales
WHERE 
    strftime('%Y-%m-%d', date) != date
OR  strftime('%H:%M:%S', time) != time;

 * sqlite:///WalmartSalesData_copy.db
Done.


invoice_id


In [5]:
%%sql
SELECT 
    DISTINCT branch
FROM sales;

 * sqlite:///WalmartSalesData_copy.db
Done.


branch
A
C
B


In [6]:
%%sql
SELECT 
    DISTINCT city
FROM sales

 * sqlite:///WalmartSalesData_copy.db
Done.


city
Yangon
Naypyitaw
Mandalay


In [7]:
%%sql
SELECT 
    DISTINCT customer_type
FROM sales

 * sqlite:///WalmartSalesData_copy.db
Done.


customer_type
Normal
Member


In [8]:
%%sql
SELECT 
    DISTINCT gender
FROM sales

 * sqlite:///WalmartSalesData_copy.db
Done.


gender
Male
Female


 * sqlite:///WalmartSalesData_copy.db
Done.


product_line
Sports and travel
Electronic accessories
Home and lifestyle
Health and beauty
Food and beverages
Fashion accessories


In [10]:
%%sql
SELECT 
    DISTINCT payment
FROM sales

 * sqlite:///WalmartSalesData_copy.db
Done.


payment
Ewallet
Credit card
Cash


#### Feature Engineering

In [11]:
%%sql
ALTER TABLE sales ADD COLUMN time_period TEXT;
ALTER TABLE sales ADD COLUMN day TEXT;
ALTER TABLE sales ADD COLUMN month TEXT;
ALTER TABLE sales ADD COLUMN weekday TEXT;

UPDATE sales
SET time_period =  
    CASE
        WHEN strftime('%H', time) BETWEEN '05' AND '08' THEN 'Early Morning'
        WHEN strftime('%H', time) BETWEEN '09' AND '11' THEN 'Morning'
        WHEN strftime('%H', time) BETWEEN '12' AND '16' THEN 'Afternoon'
        WHEN strftime('%H', time) BETWEEN '17' AND '20' THEN 'Evening'
        WHEN strftime('%H', time) BETWEEN '21' AND '23' THEN 'Night'
        WHEN strftime('%H', time) BETWEEN '00' AND '04' THEN 'Late Night'
    END;

UPDATE sales
SET day = strftime('%d', date);

UPDATE sales
SET month = strftime('%m', date);

UPDATE sales 
SET weekday = CASE strftime('%w', date)
                  WHEN '0' THEN 'Sunday'
                  WHEN '1' THEN 'Monday'
                  WHEN '2' THEN 'Tuesday'
                  WHEN '3' THEN 'Wednesday'
                  WHEN '4' THEN 'Thursday'
                  WHEN '5' THEN 'Friday'
                  WHEN '6' THEN 'Saturday'
              END;

 * sqlite:///WalmartSalesData_copy.db
Done.
Done.
Done.
Done.
987 rows affected.
987 rows affected.
987 rows affected.
987 rows affected.


[]

In [12]:
%%sql 
SELECT invoice_id, date, time, time_period, day, month, weekday 
FROM sales 
LIMIT 5;

 * sqlite:///WalmartSalesData_copy.db
Done.


invoice_id,date,time,time_period,day,month,weekday
373-73-7910,2019-02-08,10:37:00,Morning,8,2,Friday
699-14-3026,2019-03-25,18:30:00,Evening,25,3,Monday
355-53-5943,2019-02-25,14:36:00,Afternoon,25,2,Monday
315-22-5665,2019-02-24,11:38:00,Morning,24,2,Sunday
665-32-9167,2019-01-10,17:15:00,Evening,10,1,Thursday


#### Product Analysis

- How many unique product lines does the data have?

In [13]:
%%sql
SELECT 
    DISTINCT product_line
FROM sales

 * sqlite:///WalmartSalesData_copy.db
Done.


product_line
Sports and travel
Electronic accessories
Home and lifestyle
Health and beauty
Food and beverages
Fashion accessories


- What is the most selling product line?

In [19]:
%%sql
SELECT 
    product_line, 
    SUM(quantity) AS qty
FROM sales
GROUP BY product_line
ORDER BY qty DESC

 * sqlite:///WalmartSalesData_copy.db
Done.


product_line,qty
Electronic accessories,963
Food and beverages,942
Sports and travel,903
Fashion accessories,902
Home and lifestyle,885
Health and beauty,835


- What is the total revenue by month?

In [26]:
%%sql
SELECT 
    MIN(strftime('%Y-%m-%d', date)) as start_date,
    MAX(strftime('%Y-%m-%d', date)) as end_date
FROM sales

 * sqlite:///WalmartSalesData_copy.db
Done.


start_date,end_date
2019-01-01,2019-03-30
