## Setup

In [1]:
# imports
from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
import string
from datetime import datetime

#### Function for stripping leading/trailing spaces

In [2]:
# strip leading/trailing spaces for columns with string (object) dtype
def strip_trailing_spaces(df):
    """Return a copy of ``df`` with whitespace stripped from its object columns.

    Every column whose dtype is ``object`` is passed through ``.str.strip()``,
    which removes both leading and trailing whitespace. Columns of any other
    dtype are copied through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # work on a copy so the caller's frame is left untouched
    main_df = df.copy()

    for col in main_df.columns:
        # only object columns are treated as text
        if main_df[col].dtype == 'object':
            # write the stripped values into the copy that is returned
            # (writing them to `df` would mutate the input and return unstripped data)
            main_df[col] = main_df[col].str.strip()

    return main_df

#### Get sales df

In [3]:
# authenticate against the Kaggle API, then download and unzip the dataset into ./Resources
api = KaggleApi()
api.authenticate()

api.dataset_download_files(
    'kyanyoga/sample-sales-data',
    path='./Resources',
    unzip=True,
)

Dataset URL: https://www.kaggle.com/datasets/kyanyoga/sample-sales-data


In [4]:
# read the downloaded CSV (latin1 encoding) and pass it through the whitespace-stripping helper
sales_df = strip_trailing_spaces(
    pd.read_csv('Resources/sales_data_sample.csv', encoding='latin1')
)

# show the frame's dimensions, then preview the first rows
print(sales_df.shape)
sales_df.head()

(2823, 25)


Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


## Data Check
---

### Quick checks

In [5]:
# count missing (NaN) values in each column
sales_df.isna().sum()

ORDERNUMBER            0
QUANTITYORDERED        0
PRICEEACH              0
ORDERLINENUMBER        0
SALES                  0
ORDERDATE              0
STATUS                 0
QTR_ID                 0
MONTH_ID               0
YEAR_ID                0
PRODUCTLINE            0
MSRP                   0
PRODUCTCODE            0
CUSTOMERNAME           0
PHONE                  0
ADDRESSLINE1           0
ADDRESSLINE2        2521
CITY                   0
STATE               1486
POSTALCODE            76
COUNTRY                0
TERRITORY           1074
CONTACTLASTNAME        0
CONTACTFIRSTNAME       0
DEALSIZE               0
dtype: int64

In [6]:
# number of distinct values per column (nunique skips NaN by default)
sales_df.nunique()

ORDERNUMBER          307
QUANTITYORDERED       58
PRICEEACH           1016
ORDERLINENUMBER       18
SALES               2763
ORDERDATE            252
STATUS                 6
QTR_ID                 4
MONTH_ID              12
YEAR_ID                3
PRODUCTLINE            7
MSRP                  80
PRODUCTCODE          109
CUSTOMERNAME          92
PHONE                 91
ADDRESSLINE1          92
ADDRESSLINE2           9
CITY                  73
STATE                 16
POSTALCODE            73
COUNTRY               19
TERRITORY              3
CONTACTLASTNAME       77
CONTACTFIRSTNAME      72
DEALSIZE               3
dtype: int64

In [7]:
# inspect each column's dtype (ORDERDATE is still a plain object column at this point)
sales_df.dtypes

ORDERNUMBER           int64
QUANTITYORDERED       int64
PRICEEACH           float64
ORDERLINENUMBER       int64
SALES               float64
ORDERDATE            object
STATUS               object
QTR_ID                int64
MONTH_ID              int64
YEAR_ID               int64
PRODUCTLINE          object
MSRP                  int64
PRODUCTCODE          object
CUSTOMERNAME         object
PHONE                object
ADDRESSLINE1         object
ADDRESSLINE2         object
CITY                 object
STATE                object
POSTALCODE           object
COUNTRY              object
TERRITORY            object
CONTACTLASTNAME      object
CONTACTFIRSTNAME     object
DEALSIZE             object
dtype: object

### Convert ORDERDATE to datetime

In [8]:
# convert ORDERDATE strings (e.g. "2/24/2003 0:00") to datetime
# the format is given explicitly (month/day/year hour:minute) so the result does not
# depend on pandas' format inference or on day-first vs month-first guessing
sales_df['ORDERDATE'] = pd.to_datetime(sales_df['ORDERDATE'], format='%m/%d/%Y %H:%M')

# check dtypes - ORDERDATE should now be datetime64
sales_df.dtypes

ORDERNUMBER                  int64
QUANTITYORDERED              int64
PRICEEACH                  float64
ORDERLINENUMBER              int64
SALES                      float64
ORDERDATE           datetime64[ns]
STATUS                      object
QTR_ID                       int64
MONTH_ID                     int64
YEAR_ID                      int64
PRODUCTLINE                 object
MSRP                         int64
PRODUCTCODE                 object
CUSTOMERNAME                object
PHONE                       object
ADDRESSLINE1                object
ADDRESSLINE2                object
CITY                        object
STATE                       object
POSTALCODE                  object
COUNTRY                     object
TERRITORY                   object
CONTACTLASTNAME             object
CONTACTFIRSTNAME            object
DEALSIZE                    object
dtype: object

In [10]:
# sort rows by ORDERDATE, ascending (earliest order first)
sales_df = sales_df.sort_values('ORDERDATE')

## Create Relational Tables
---

### Check for duplicate naming issues (ex: "Company Inc." and "company inc")

In [15]:
# list of cols that could contain duplicate spellings (object type)
dupe_cols_check = list(sales_df.select_dtypes(include=['object']).columns)

# translation table that deletes every ASCII punctuation character (built once, reused for each column)
punctuation_table = str.maketrans('', '', string.punctuation)

# check for dupes
for col in dupe_cols_check:
    # normalized spelling: Title Case with punctuation removed, so values such as
    # "Company Inc." and "company inc" collapse to the same string
    normalized = sales_df[col].str.title().str.translate(punctuation_table)

    # same number of distinct values before and after -> no spelling variants to merge
    if normalized.nunique() == sales_df[col].nunique():
        print(f'{col}: no duplicate naming issues')
    else:
        # overwrite in place under the ORIGINAL column name, so later cells that
        # reference `col` keep working (no renamed `<col>_CHECK` column is left behind)
        sales_df[col] = normalized
        print(f'{col}: replaced column - found duplicate naming issues!!!')

STATUS: no duplicate naming issues
PRODUCTLINE: no duplicate naming issues
PRODUCTCODE: no duplicate naming issues
CUSTOMERNAME: no duplicate naming issues
PHONE: no duplicate naming issues
ADDRESSLINE1: no duplicate naming issues
ADDRESSLINE2: no duplicate naming issues
CITY: no duplicate naming issues
STATE: no duplicate naming issues
POSTALCODE: no duplicate naming issues
COUNTRY: no duplicate naming issues
TERRITORY: no duplicate naming issues
CONTACTLASTNAME: no duplicate naming issues
CONTACTFIRSTNAME: no duplicate naming issues
DEALSIZE: no duplicate naming issues


### Customer Table