### Index
1.  [Load & Discover Data](#1)<br>
#### Review Columns and Values
2.  [PurchaseId Column ](#2) <br>
3.  [PurchaseDate - ShipDate Columns](#3) <br>
4.  [ShipMode Column](#4) <br>
5.  [CustomerId Column](#5) <br>
6.  [CustomerName Column](#6) <br>
7.  [Segment Column](#7) <br>
8.  [Country Column](#8) <br>
9.  [City Column](#9) <br>
10. [FirstName-LastName & CustomerId Matches](#10) <br> 
11. [State Column](#11) <br> 
12. [PostalCode Column](#12) <br> 
13. [Region Column](#13) <br> 
14. [ProductId Column](#14) <br> 
15. [Category Column](#15) <br> 
16. [SubCategory Column](#16) <br> 
17. [ProductName Column](#17) <br> 
18. [Create SupplierPrice and CatalogPrice Columns](#18) <br> 
19. [ProductName-ProductId Pairs Review](#19) <br> 
20. [Sales, Quantity, Discount, Profit Columns](#20) <br> 
#### Create Data Frames for SQLite Tables
21. [PurchaseId-ProductId Composite PK for Sales Table](#21) <br> 
22. [Create Data Frames](#22) <br> 
23. [CustomerLocation  CustomerId Ratio](#23) <br> 
24. [Create and Populate Tables](#24) <br> 
25. [Run Queries to discover data](#25-run-some-queries-to-discover-data) <br> 
26. [More Insights](#26) <br> 

### 1: Load & Discover Data<a id="1"></a>

In [None]:
# import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sqlite3
from sqlite3 import Error

plt.style.use('ggplot')
pd.set_option('display.max_columns', 200) 

In [3]:
# Check encoding of dataset

import chardet

with open('Sample-Superstore.csv', 'rb') as file:
    rawdata = file.read(10000)
    result = chardet.detect(rawdata)
    
result

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

In [None]:
# Load the data 

df = pd.read_csv('Sample-Superstore.csv', sep=',', header=0, encoding='ISO-8859-1').copy()

In [5]:
df.shape

(9994, 21)

In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
Row ID,1,2,3,4,5
Order ID,CA-2016-152156,CA-2016-152156,CA-2016-138688,US-2015-108966,US-2015-108966
Order Date,11/8/2016,11/8/2016,6/12/2016,10/11/2015,10/11/2015
Ship Date,11/11/2016,11/11/2016,6/16/2016,10/18/2015,10/18/2015
Ship Mode,Second Class,Second Class,Second Class,Standard Class,Standard Class
Customer ID,CG-12520,CG-12520,DV-13045,SO-20335,SO-20335
Customer Name,Claire Gute,Claire Gute,Darrin Van Huff,Sean O'Donnell,Sean O'Donnell
Segment,Consumer,Consumer,Corporate,Consumer,Consumer
Country,United States,United States,United States,United States,United States
City,Henderson,Henderson,Los Angeles,Fort Lauderdale,Fort Lauderdale


In [7]:
df = df.drop(columns=['Row ID'])

In [8]:
# Rename columns 

df.rename(columns={
    'Order ID': 'PurchaseId',
    'Order Date': 'PurchaseDate',
    'Ship Date': 'ShipDate',
    'Ship Mode': 'ShipMode',  
    'Customer ID': 'CustomerId',
    'Customer Name': 'CustomerName',
    'Segment': 'Segment',
    'Country': 'Country',
    'City': 'City',
    'State': 'State',
    'Postal Code': 'PostalCode',
    'Region': 'Region',
    'Product ID': 'ProductId',
    'Category': 'Category',
    'Sub-Category': 'SubCategory',
    'Product Name': 'ProductName',
    'Sales': 'Sales',
    'Quantity': 'Quantity',
    'Discount': 'Discount',
    'Profit': 'Profit'
}, inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PurchaseId    9994 non-null   object 
 1   PurchaseDate  9994 non-null   object 
 2   ShipDate      9994 non-null   object 
 3   ShipMode      9994 non-null   object 
 4   CustomerId    9994 non-null   object 
 5   CustomerName  9994 non-null   object 
 6   Segment       9994 non-null   object 
 7   Country       9994 non-null   object 
 8   City          9994 non-null   object 
 9   State         9994 non-null   object 
 10  PostalCode    9994 non-null   int64  
 11  Region        9994 non-null   object 
 12  ProductId     9994 non-null   object 
 13  Category      9994 non-null   object 
 14  SubCategory   9994 non-null   object 
 15  ProductName   9994 non-null   object 
 16  Sales         9994 non-null   float64
 17  Quantity      9994 non-null   int64  
 18  Discount      9994 non-null 

In [10]:
df.dtypes

PurchaseId       object
PurchaseDate     object
ShipDate         object
ShipMode         object
CustomerId       object
CustomerName     object
Segment          object
Country          object
City             object
State            object
PostalCode        int64
Region           object
ProductId        object
Category         object
SubCategory      object
ProductName      object
Sales           float64
Quantity          int64
Discount        float64
Profit          float64
dtype: object

In [11]:
# Convert data types for columns 

df['PurchaseId'] = df['PurchaseId'].astype(str)

df['PurchaseDate'] = pd.to_datetime(df['PurchaseDate'])  

df['ShipDate'] = pd.to_datetime(df['ShipDate'])  

df['ShipMode'] = df['ShipMode'].astype(str)

df['CustomerId'] = df['CustomerId'].astype(str)

df['CustomerName'] = df['CustomerName'].astype(str)

df['Segment'] = df['Segment'].astype(str)

df['Country'] = df['Country'].astype(str)

df['City'] = df['City'].astype(str)

df['State'] = df['State'].astype(str)

df['PostalCode'] = df['PostalCode'].astype(str)  

df['Region'] = df['Region'].astype(str)

df['ProductId'] = df['ProductId'].astype(str)

df['Category'] = df['Category'].astype(str)

df['SubCategory'] = df['SubCategory'].astype(str)

df['ProductName'] = df['ProductName'].astype(str)

df['Sales'] = df['Sales'].astype(float)  

df['Quantity'] = df['Quantity'].astype(int)

df['Discount'] = df['Discount'].astype(float)

df['Profit'] = df['Profit'].astype(float)

In [12]:
df.isna().sum()

PurchaseId      0
PurchaseDate    0
ShipDate        0
ShipMode        0
CustomerId      0
CustomerName    0
Segment         0
Country         0
City            0
State           0
PostalCode      0
Region          0
ProductId       0
Category        0
SubCategory     0
ProductName     0
Sales           0
Quantity        0
Discount        0
Profit          0
dtype: int64

In [13]:
df.nunique()

PurchaseId      5009
PurchaseDate    1237
ShipDate        1334
ShipMode           4
CustomerId       793
CustomerName     793
Segment            3
Country            1
City             531
State             49
PostalCode       631
Region             4
ProductId       1862
Category           3
SubCategory       17
ProductName     1850
Sales           5825
Quantity          14
Discount          12
Profit          7287
dtype: int64

In [14]:
df.columns

Index(['PurchaseId', 'PurchaseDate', 'ShipDate', 'ShipMode', 'CustomerId',
       'CustomerName', 'Segment', 'Country', 'City', 'State', 'PostalCode',
       'Region', 'ProductId', 'Category', 'SubCategory', 'ProductName',
       'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

### 2: PurchaseId<a id="2"></a>

In [15]:
# check if entries have the same format like in dataframe

import re

pattern = r'^[A-Z]{2}-\d{4}-\d{6}$'

df[~df['PurchaseId'].str.match(pattern)]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit


In [16]:
# Trim whitespaces

df['PurchaseId'] = df['PurchaseId'].str.strip()

In [17]:
# Ensure that the case is uppercase across all entries.

df['PurchaseId'] = df['PurchaseId'].str.upper()

In [18]:
# Check if there is any unexpected characters or symbols 

df[df['PurchaseId'].str.contains(r'[^A-Z0-9-]')]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit


In [19]:
# Check if the year starts with 20 and followed with same pattern 

df[~df['PurchaseId'].str.contains(r'-20\d{2}-')]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit


### 3: PurchaseDate - ShipDate<a id="3"></a>

In [20]:
# ensure that the PurchaseDate is always earlier than the ShipDate 

df[df['PurchaseDate'] > df['ShipDate']]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit


In [21]:
# check entries where OrderDate and ShipDate is same

df_order_ship_same = df[df['PurchaseDate'] == df['ShipDate']]

df_order_ship_same.head()

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit
366,CA-2016-155516,2016-10-21,2016-10-21,Same Day,MK-17905,Michael Kennedy,Corporate,United States,Manchester,Connecticut,6040,East,OFF-BI-10002412,Office Supplies,Binders,Wilson Jones Snap Scratch Pad Binder Tool fo...,23.2,4,0.0,10.44
367,CA-2016-155516,2016-10-21,2016-10-21,Same Day,MK-17905,Michael Kennedy,Corporate,United States,Manchester,Connecticut,6040,East,OFF-SU-10001225,Office Supplies,Supplies,Staple remover,7.36,2,0.0,0.1472
368,CA-2016-155516,2016-10-21,2016-10-21,Same Day,MK-17905,Michael Kennedy,Corporate,United States,Manchester,Connecticut,6040,East,OFF-ST-10002406,Office Supplies,Storage,Pizazz Global Quick File,104.79,7,0.0,29.3412
369,CA-2016-155516,2016-10-21,2016-10-21,Same Day,MK-17905,Michael Kennedy,Corporate,United States,Manchester,Connecticut,6040,East,FUR-BO-10002545,Furniture,Bookcases,"Atlantic Metals Mobile 3-Shelf Bookcases, Cust...",1043.92,4,0.0,271.4192
657,US-2016-156097,2016-09-19,2016-09-19,Same Day,EH-14125,Eugene Hildebrand,Home Office,United States,Aurora,Illinois,60505,Central,FUR-CH-10001215,Furniture,Chairs,Global Troy Executive Leather Low-Back Tilter,701.372,2,0.3,-50.098


In [22]:
df_order_ship_same.info()

<class 'pandas.core.frame.DataFrame'>
Index: 519 entries, 366 to 9963
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   PurchaseId    519 non-null    object        
 1   PurchaseDate  519 non-null    datetime64[ns]
 2   ShipDate      519 non-null    datetime64[ns]
 3   ShipMode      519 non-null    object        
 4   CustomerId    519 non-null    object        
 5   CustomerName  519 non-null    object        
 6   Segment       519 non-null    object        
 7   Country       519 non-null    object        
 8   City          519 non-null    object        
 9   State         519 non-null    object        
 10  PostalCode    519 non-null    object        
 11  Region        519 non-null    object        
 12  ProductId     519 non-null    object        
 13  Category      519 non-null    object        
 14  SubCategory   519 non-null    object        
 15  ProductName   519 non-null    object      

#### 519 rows have the same Order Date and Ship Date which are the Same Day shipment. 
#### No cases of Ship Date occurring before Order Date.
#### All entries in these columns are consistent.

## 4: ShipMode<a id="4"></a>

In [23]:
# check unique values in ShipMode column

df['ShipMode'].unique()

array(['Second Class', 'Standard Class', 'First Class', 'Same Day'],
      dtype=object)

In [24]:
# Ensure that the values are consistently spelled and capitalized.

df['ShipMode'] = df['ShipMode'].str.strip().str.title()

In [25]:
# Review distribution of 'ShipMode' values

df['ShipMode'].value_counts()

ShipMode
Standard Class    5968
Second Class      1945
First Class       1538
Same Day           543
Name: count, dtype: int64

## 5: CustomerId<a id="5"></a>

In [26]:
# Make sure all CustomerId values follow a consistent format 

valid_id_pattern = r'^[A-Z]{2}-\d{5}$'

df[~df['CustomerId'].str.match(valid_id_pattern)].T

Unnamed: 0,261,701,715,1489,1900,2760,2877,2878,2879,2966,2967,3362,3363,3385,3386,3387,3388,3389,4000,4001,4084,4643,4644,4734,5006,5007,5283,5284,5316,5317,5318,5319,5320,5321,5322,5475,5476,5477,6104,7919,7920,8145,8146
PurchaseId,US-2017-155299,CA-2017-114552,CA-2014-153150,CA-2014-136280,CA-2016-140543,CA-2014-129574,CA-2016-152072,CA-2016-152072,CA-2016-152072,CA-2014-162866,CA-2014-162866,CA-2015-139962,CA-2015-139962,CA-2017-148404,CA-2017-148404,CA-2017-148404,CA-2017-148404,CA-2017-148404,CA-2014-116834,CA-2014-116834,CA-2017-163692,CA-2015-147501,CA-2015-147501,CA-2016-120530,CA-2015-169796,CA-2015-169796,CA-2014-133424,CA-2014-133424,US-2017-162558,US-2017-162558,US-2017-162558,US-2017-162558,US-2017-162558,US-2017-162558,US-2017-162558,CA-2017-169691,CA-2017-169691,CA-2017-169691,US-2017-132381,CA-2017-139822,CA-2017-139822,US-2014-112949,US-2014-112949
PurchaseDate,2017-06-08 00:00:00,2017-09-02 00:00:00,2014-07-01 00:00:00,2014-11-29 00:00:00,2016-06-29 00:00:00,2014-05-26 00:00:00,2016-01-15 00:00:00,2016-01-15 00:00:00,2016-01-15 00:00:00,2014-12-27 00:00:00,2014-12-27 00:00:00,2015-12-13 00:00:00,2015-12-13 00:00:00,2017-10-07 00:00:00,2017-10-07 00:00:00,2017-10-07 00:00:00,2017-10-07 00:00:00,2017-10-07 00:00:00,2014-10-11 00:00:00,2014-10-11 00:00:00,2017-09-07 00:00:00,2015-08-02 00:00:00,2015-08-02 00:00:00,2016-04-07 00:00:00,2015-11-09 00:00:00,2015-11-09 00:00:00,2014-03-30 00:00:00,2014-03-30 00:00:00,2017-10-02 00:00:00,2017-10-02 00:00:00,2017-10-02 00:00:00,2017-10-02 00:00:00,2017-10-02 00:00:00,2017-10-02 00:00:00,2017-10-02 00:00:00,2017-06-15 00:00:00,2017-06-15 00:00:00,2017-06-15 00:00:00,2017-08-22 00:00:00,2017-09-15 00:00:00,2017-09-15 00:00:00,2014-06-20 00:00:00,2014-06-20 00:00:00
ShipDate,2017-06-12 00:00:00,2017-09-08 00:00:00,2014-07-06 00:00:00,2014-12-06 00:00:00,2016-07-03 00:00:00,2014-05-29 00:00:00,2016-01-19 00:00:00,2016-01-19 00:00:00,2016-01-19 00:00:00,2014-12-31 00:00:00,2014-12-31 00:00:00,2015-12-20 00:00:00,2015-12-20 00:00:00,2017-10-11 00:00:00,2017-10-11 00:00:00,2017-10-11 00:00:00,2017-10-11 00:00:00,2017-10-11 00:00:00,2014-10-16 00:00:00,2014-10-16 00:00:00,2017-09-09 00:00:00,2015-08-06 00:00:00,2015-08-06 00:00:00,2016-04-12 00:00:00,2015-11-14 00:00:00,2015-11-14 00:00:00,2014-04-04 00:00:00,2014-04-04 00:00:00,2017-10-05 00:00:00,2017-10-05 00:00:00,2017-10-05 00:00:00,2017-10-05 00:00:00,2017-10-05 00:00:00,2017-10-05 00:00:00,2017-10-05 00:00:00,2017-06-18 00:00:00,2017-06-18 00:00:00,2017-06-18 00:00:00,2017-08-24 00:00:00,2017-09-21 00:00:00,2017-09-21 00:00:00,2014-06-27 00:00:00,2014-06-27 00:00:00
ShipMode,Standard Class,Standard Class,Second Class,Standard Class,Second Class,First Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,First Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,First Class,First Class,First Class,First Class,First Class,First Class,First Class,First Class,First Class,First Class,First Class,Standard Class,Standard Class,Standard Class,Standard Class
CustomerId,Dl-13600,Dl-13600,Dl-13600,Co-12640,Co-12640,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Co-12640,Co-12640,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Co-12640,Co-12640,Dl-13600,Dp-13240,Dp-13240,Dl-13600,Dl-13600,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Dp-13240,Co-12640,Co-12640
CustomerName,Dorris liebe,Dorris liebe,Dorris liebe,Corey-Lock,Corey-Lock,Dean percer,Dean percer,Dean percer,Dean percer,Corey-Lock,Corey-Lock,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Corey-Lock,Corey-Lock,Dorris liebe,Dean percer,Dean percer,Dorris liebe,Dorris liebe,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Dean percer,Corey-Lock,Corey-Lock
Segment,Corporate,Corporate,Corporate,Consumer,Consumer,Home Office,Home Office,Home Office,Home Office,Consumer,Consumer,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Consumer,Consumer,Corporate,Home Office,Home Office,Corporate,Corporate,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Consumer,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Pasadena,Cleveland,Seattle,Philadelphia,Florence,Murray,Westfield,Westfield,Westfield,Skokie,Skokie,Revere,Revere,Charlotte,Charlotte,Charlotte,Charlotte,Charlotte,Seattle,Seattle,Phoenix,Seattle,Seattle,New York City,New York City,New York City,Seattle,Seattle,Knoxville,Knoxville,Knoxville,Knoxville,Knoxville,Knoxville,Knoxville,Maple Grove,Maple Grove,Maple Grove,Philadelphia,Waterbury,Waterbury,Lawton,Lawton
State,Texas,Ohio,Washington,Pennsylvania,South Carolina,Utah,New Jersey,New Jersey,New Jersey,Illinois,Illinois,Massachusetts,Massachusetts,North Carolina,North Carolina,North Carolina,North Carolina,North Carolina,Washington,Washington,Arizona,Washington,Washington,New York,New York,New York,Washington,Washington,Tennessee,Tennessee,Tennessee,Tennessee,Tennessee,Tennessee,Tennessee,Minnesota,Minnesota,Minnesota,Pennsylvania,Connecticut,Connecticut,Oklahoma,Oklahoma


#### There seem to be upper-lower case issues for CustomerId.

In [27]:
# Convert all 'CustomerId' values to uppercase

df['CustomerId'] = df['CustomerId'].str.upper()

In [28]:
# Confirm Changes

df[~df['CustomerId'].str.match(valid_id_pattern)]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit


In [29]:
# Clean probable leading or trailing spaces.

df['CustomerId'] = df['CustomerId'].str.strip()

### All values in CustomerId column have the same format now

## 6: CustomerName<a id="6"></a>

In [30]:
# Trim whitespaces from 'CustomerName' values

df['CustomerName'] = df['CustomerName'].str.strip()

# Convert 'CustomerName' values to title case

df['CustomerName'] = df['CustomerName'].str.title()

#### Noticed some names which have hypnes in between such as: Corey-Lock

In [31]:
# Noticed some names which have hypnes in between such as: Corey-Lock
# Regular expression pattern to detect hyphenated names

hyphenated_pattern = r'\b\w+-\w+\b'

# Function to check if a name contains a hyphen

def contains_hyphen(name):
    return bool(re.search(hyphenated_pattern, name))

# Apply the function to get a boolean Series

mask1 = df['CustomerName'].apply(contains_hyphen)

In [32]:
df['CustomerName'][mask1]

1489    Corey-Lock
1900    Corey-Lock
2966    Corey-Lock
2967    Corey-Lock
4643    Corey-Lock
4644    Corey-Lock
8145    Corey-Lock
8146    Corey-Lock
Name: CustomerName, dtype: object

It is the only Name with hypnes between names. 

In [33]:
# Correct the specific entry "Corey-Lock" to "Corey Lock"

df.loc[df['CustomerName'] == 'Corey-Lock', 'CustomerName'] = 'Corey Lock'

In [34]:
# Also noticed some name with apostrophes, filter and review values to be sure that there is no issue 
# Regular expression pattern to detect names with apostrophes 

apostrophe_pattern = r"\b\w*'\w*\b"

# Function to check if a name contains an apostrophe

def contains_apostrophe(name):
    return bool(re.search(apostrophe_pattern, name))

# Apply the function to create a boolean mask

mask2 = df['CustomerName'].apply(contains_apostrophe)

df[mask2].T

Unnamed: 0,3,4,46,85,282,326,327,328,329,330,494,626,669,670,671,906,907,908,1008,1015,1041,1042,1241,1604,1605,1606,1607,1608,1609,1750,1895,2303,2304,3026,3373,3374,3375,3376,3444,3502,3503,3554,3555,3951,4023,4024,4297,4298,4299,4300,4301,4303,4329,4330,4355,4356,4622,4623,4624,4625,4663,4711,4827,4828,4842,4926,5271,5272,5278,5583,5837,6133,6227,6260,6347,6348,6349,6422,6424,6577,6640,6641,6642,6742,6850,6851,6943,6979,7121,7122,7180,7303,7304,7904,8044,8109,8110,8217,8218,8219,8220,8221,8293,8373,8518,8519,8520,8591,8592,8677,8958,8959,8960,8961,9027,9028,9252,9253,9352,9353,9376,9427,9450,9691
PurchaseId,US-2015-108966,US-2015-108966,CA-2014-146703,CA-2017-140088,CA-2015-130890,US-2016-141544,US-2016-141544,US-2016-141544,US-2016-141544,US-2016-141544,US-2016-120929,CA-2017-163020,US-2017-106663,US-2017-106663,US-2017-106663,CA-2017-143259,CA-2017-143259,CA-2017-143259,US-2017-106705,CA-2015-133025,CA-2016-102981,CA-2016-102981,CA-2016-128727,US-2016-115819,US-2016-115819,US-2016-115819,US-2016-115819,US-2016-115819,US-2016-115819,CA-2015-139094,CA-2015-109197,CA-2017-157931,CA-2017-157931,CA-2017-169054,CA-2015-161718,CA-2015-161718,CA-2015-161718,CA-2015-161718,CA-2016-116344,CA-2017-125115,CA-2017-125115,CA-2014-120838,CA-2014-120838,CA-2014-121167,CA-2017-130764,CA-2017-130764,CA-2017-129021,CA-2017-129021,CA-2017-129021,CA-2017-129021,CA-2017-129021,CA-2016-121601,CA-2017-149853,CA-2017-149853,CA-2015-155600,CA-2015-155600,CA-2017-147228,CA-2017-147228,CA-2017-147228,CA-2017-147228,CA-2016-111409,CA-2014-112403,CA-2014-152562,CA-2014-152562,CA-2017-118402,CA-2017-117653,CA-2017-125913,CA-2017-125913,CA-2014-159681,CA-2014-116673,CA-2017-133207,CA-2016-148096,CA-2014-148782,CA-2015-162607,CA-2015-104486,CA-2015-104486,CA-2015-104486,CA-2014-159121,CA-2017-115777,CA-2015-153752,CA-2017-128328,CA-2017-128328,CA-2017-128328,US-2017-101784,US-2016-100461,US-2016-100461,CA-2015-139248,CA-2017-149076,CA-2017-166926,CA-2017-166926,CA-2014-106054,US-2017-117450,US-2017-117450,CA-2015-126669,CA-2017-165008,CA-2017-160122,CA-2017-160122,CA-2014-120775,CA-2014-120775,CA-2014-120775,CA-2014-120775,CA-2014-120775,US-2017-168802,CA-2016-152940,CA-2017-118003,CA-2017-118003,CA-2017-118003,CA-2017-101700,CA-2017-101700,CA-2017-141705,CA-2017-150266,CA-2017-150266,CA-2017-150266,CA-2017-150266,US-2016-152415,US-2016-152415,CA-2017-102309,CA-2017-102309,CA-2017-148411,CA-2017-148411,CA-2017-108756,CA-2014-167486,CA-2017-145506,CA-2015-130183
PurchaseDate,2015-10-11 00:00:00,2015-10-11 00:00:00,2014-10-20 00:00:00,2017-05-28 00:00:00,2015-11-02 00:00:00,2016-08-30 00:00:00,2016-08-30 00:00:00,2016-08-30 00:00:00,2016-08-30 00:00:00,2016-08-30 00:00:00,2016-03-18 00:00:00,2017-09-15 00:00:00,2017-06-09 00:00:00,2017-06-09 00:00:00,2017-06-09 00:00:00,2017-12-30 00:00:00,2017-12-30 00:00:00,2017-12-30 00:00:00,2017-12-26 00:00:00,2015-09-17 00:00:00,2016-09-06 00:00:00,2016-09-06 00:00:00,2016-08-29 00:00:00,2016-04-19 00:00:00,2016-04-19 00:00:00,2016-04-19 00:00:00,2016-04-19 00:00:00,2016-04-19 00:00:00,2016-04-19 00:00:00,2015-11-22 00:00:00,2015-12-31 00:00:00,2017-09-17 00:00:00,2017-09-17 00:00:00,2017-04-22 00:00:00,2015-12-04 00:00:00,2015-12-04 00:00:00,2015-12-04 00:00:00,2015-12-04 00:00:00,2016-07-29 00:00:00,2017-04-10 00:00:00,2017-04-10 00:00:00,2014-03-23 00:00:00,2014-03-23 00:00:00,2014-11-28 00:00:00,2017-10-27 00:00:00,2017-10-27 00:00:00,2017-08-23 00:00:00,2017-08-23 00:00:00,2017-08-23 00:00:00,2017-08-23 00:00:00,2017-08-23 00:00:00,2016-10-04 00:00:00,2017-10-03 00:00:00,2017-10-03 00:00:00,2015-12-04 00:00:00,2015-12-04 00:00:00,2017-09-09 00:00:00,2017-09-09 00:00:00,2017-09-09 00:00:00,2017-09-09 00:00:00,2016-09-18 00:00:00,2014-03-31 00:00:00,2014-11-01 00:00:00,2014-11-01 00:00:00,2017-09-29 00:00:00,2017-10-19 00:00:00,2017-01-16 00:00:00,2017-01-16 00:00:00,2014-12-07 00:00:00,2014-12-15 00:00:00,2017-11-27 00:00:00,2016-08-16 00:00:00,2014-11-02 00:00:00,2015-05-12 00:00:00,2015-05-01 00:00:00,2015-05-01 00:00:00,2015-05-01 00:00:00,2014-07-26 00:00:00,2017-08-19 00:00:00,2015-12-06 00:00:00,2017-08-05 00:00:00,2017-08-05 00:00:00,2017-08-05 00:00:00,2017-07-06 00:00:00,2016-01-08 00:00:00,2016-01-08 00:00:00,2015-07-25 00:00:00,2017-01-14 00:00:00,2017-12-01 00:00:00,2017-12-01 00:00:00,2014-01-06 00:00:00,2017-09-04 00:00:00,2017-09-04 00:00:00,2015-11-07 00:00:00,2017-09-15 00:00:00,2017-11-18 00:00:00,2017-11-18 00:00:00,2014-10-03 00:00:00,2014-10-03 00:00:00,2014-10-03 00:00:00,2014-10-03 00:00:00,2014-10-03 00:00:00,2017-11-03 00:00:00,2016-11-10 00:00:00,2017-12-04 00:00:00,2017-12-04 00:00:00,2017-12-04 00:00:00,2017-04-23 00:00:00,2017-04-23 00:00:00,2017-10-24 00:00:00,2017-11-25 00:00:00,2017-11-25 00:00:00,2017-11-25 00:00:00,2017-11-25 00:00:00,2016-09-17 00:00:00,2016-09-17 00:00:00,2017-09-23 00:00:00,2017-09-23 00:00:00,2017-09-24 00:00:00,2017-09-24 00:00:00,2017-12-25 00:00:00,2014-11-27 00:00:00,2017-06-03 00:00:00,2015-11-13 00:00:00
ShipDate,2015-10-18 00:00:00,2015-10-18 00:00:00,2014-10-25 00:00:00,2017-05-30 00:00:00,2015-11-06 00:00:00,2016-09-01 00:00:00,2016-09-01 00:00:00,2016-09-01 00:00:00,2016-09-01 00:00:00,2016-09-01 00:00:00,2016-03-21 00:00:00,2017-09-19 00:00:00,2017-06-13 00:00:00,2017-06-13 00:00:00,2017-06-13 00:00:00,2018-01-03 00:00:00,2018-01-03 00:00:00,2018-01-03 00:00:00,2018-01-01 00:00:00,2015-09-19 00:00:00,2016-09-09 00:00:00,2016-09-09 00:00:00,2016-09-04 00:00:00,2016-04-24 00:00:00,2016-04-24 00:00:00,2016-04-24 00:00:00,2016-04-24 00:00:00,2016-04-24 00:00:00,2016-04-24 00:00:00,2015-11-27 00:00:00,2016-01-04 00:00:00,2017-09-22 00:00:00,2017-09-22 00:00:00,2017-04-26 00:00:00,2015-12-10 00:00:00,2015-12-10 00:00:00,2015-12-10 00:00:00,2015-12-10 00:00:00,2016-08-02 00:00:00,2017-04-10 00:00:00,2017-04-10 00:00:00,2014-03-26 00:00:00,2014-03-26 00:00:00,2014-11-30 00:00:00,2017-10-28 00:00:00,2017-10-28 00:00:00,2017-08-26 00:00:00,2017-08-26 00:00:00,2017-08-26 00:00:00,2017-08-26 00:00:00,2017-08-26 00:00:00,2016-10-04 00:00:00,2017-10-09 00:00:00,2017-10-09 00:00:00,2015-12-07 00:00:00,2015-12-07 00:00:00,2017-09-14 00:00:00,2017-09-14 00:00:00,2017-09-14 00:00:00,2017-09-14 00:00:00,2016-09-22 00:00:00,2014-03-31 00:00:00,2014-11-08 00:00:00,2014-11-08 00:00:00,2017-10-04 00:00:00,2017-10-23 00:00:00,2017-01-16 00:00:00,2017-01-16 00:00:00,2014-12-13 00:00:00,2014-12-19 00:00:00,2017-12-03 00:00:00,2016-08-19 00:00:00,2014-11-07 00:00:00,2015-05-18 00:00:00,2015-05-06 00:00:00,2015-05-06 00:00:00,2015-05-06 00:00:00,2014-08-01 00:00:00,2017-08-24 00:00:00,2015-12-11 00:00:00,2017-08-09 00:00:00,2017-08-09 00:00:00,2017-08-09 00:00:00,2017-07-11 00:00:00,2016-01-12 00:00:00,2016-01-12 00:00:00,2015-07-30 00:00:00,2017-01-19 00:00:00,2017-12-08 00:00:00,2017-12-08 00:00:00,2014-01-07 00:00:00,2017-09-08 00:00:00,2017-09-08 00:00:00,2015-11-13 00:00:00,2017-09-17 00:00:00,2017-11-23 00:00:00,2017-11-23 00:00:00,2014-10-07 00:00:00,2014-10-07 00:00:00,2014-10-07 00:00:00,2014-10-07 00:00:00,2014-10-07 00:00:00,2017-11-07 00:00:00,2016-11-13 00:00:00,2017-12-10 00:00:00,2017-12-10 00:00:00,2017-12-10 00:00:00,2017-04-26 00:00:00,2017-04-26 00:00:00,2017-10-26 00:00:00,2017-11-30 00:00:00,2017-11-30 00:00:00,2017-11-30 00:00:00,2017-11-30 00:00:00,2016-09-22 00:00:00,2016-09-22 00:00:00,2017-09-25 00:00:00,2017-09-25 00:00:00,2017-09-26 00:00:00,2017-09-26 00:00:00,2017-12-29 00:00:00,2014-12-01 00:00:00,2017-06-07 00:00:00,2015-11-17 00:00:00
ShipMode,Standard Class,Standard Class,Second Class,Second Class,Standard Class,First Class,First Class,First Class,First Class,First Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Second Class,Second Class,Second Class,Standard Class,Second Class,Second Class,Second Class,Second Class,Second Class,Second Class,Standard Class,Standard Class,Second Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Same Day,Same Day,Second Class,Second Class,Second Class,First Class,First Class,Second Class,Second Class,Second Class,Second Class,Second Class,Same Day,Standard Class,Standard Class,Second Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Same Day,Standard Class,Standard Class,Standard Class,Standard Class,Same Day,Same Day,Standard Class,Second Class,Standard Class,First Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,First Class,Standard Class,Standard Class,Standard Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,First Class,Standard Class,Standard Class,Standard Class,First Class,First Class,First Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Second Class,Second Class,First Class,First Class,Standard Class,Standard Class,Standard Class,Standard Class
CustomerId,SO-20335,SO-20335,PO-18865,PO-18865,JO-15280,PO-18850,PO-18850,PO-18850,PO-18850,PO-18850,RO-19780,MO-17800,MO-17800,MO-17800,MO-17800,PO-18865,PO-18865,PO-18865,PO-18850,MO-17800,MO-17500,MO-17500,MO-17800,JO-15280,JO-15280,JO-15280,JO-15280,JO-15280,JO-15280,MO-17800,JO-15280,MO-17800,MO-17800,MO-17800,SO-20335,SO-20335,SO-20335,SO-20335,JO-15145,RD-19930,RD-19930,PO-18865,PO-18865,MO-17500,JO-15145,JO-15145,PO-18850,PO-18850,PO-18850,PO-18850,PO-18850,MO-17500,PO-18850,PO-18850,RO-19780,RO-19780,SO-20335,SO-20335,SO-20335,SO-20335,PO-18850,JO-15280,JO-15145,JO-15145,JO-15280,MO-17500,JO-15145,JO-15145,PO-18850,JO-15280,DO-13645,AO-10810,PO-18850,RO-19780,PO-18850,PO-18850,PO-18850,JO-15145,DO-13645,RO-19780,PO-18865,PO-18865,PO-18865,PO-18850,JO-15145,JO-15145,RD-19930,SO-20335,SO-20335,SO-20335,JO-15145,DO-13645,DO-13645,DO-13645,DO-13645,RD-19930,RD-19930,RD-19930,RD-19930,RD-19930,RD-19930,RD-19930,JO-15145,RO-19780,DO-13645,DO-13645,DO-13645,SO-20335,SO-20335,PO-18850,RO-19780,RO-19780,RO-19780,RO-19780,PO-18865,PO-18865,DO-13645,DO-13645,RO-19780,RO-19780,PO-18865,JO-15145,MO-17800,PO-18850
CustomerName,Sean O'Donnell,Sean O'Donnell,Patrick O'Donnell,Patrick O'Donnell,Jas O'Carroll,Patrick O'Brill,Patrick O'Brill,Patrick O'Brill,Patrick O'Brill,Patrick O'Brill,Rose O'Brian,Meg O'Connel,Meg O'Connel,Meg O'Connel,Meg O'Connel,Patrick O'Donnell,Patrick O'Donnell,Patrick O'Donnell,Patrick O'Brill,Meg O'Connel,Mary O'Rourke,Mary O'Rourke,Meg O'Connel,Jas O'Carroll,Jas O'Carroll,Jas O'Carroll,Jas O'Carroll,Jas O'Carroll,Jas O'Carroll,Meg O'Connel,Jas O'Carroll,Meg O'Connel,Meg O'Connel,Meg O'Connel,Sean O'Donnell,Sean O'Donnell,Sean O'Donnell,Sean O'Donnell,Jack O'Briant,Russell D'Ascenzo,Russell D'Ascenzo,Patrick O'Donnell,Patrick O'Donnell,Mary O'Rourke,Jack O'Briant,Jack O'Briant,Patrick O'Brill,Patrick O'Brill,Patrick O'Brill,Patrick O'Brill,Patrick O'Brill,Mary O'Rourke,Patrick O'Brill,Patrick O'Brill,Rose O'Brian,Rose O'Brian,Sean O'Donnell,Sean O'Donnell,Sean O'Donnell,Sean O'Donnell,Patrick O'Brill,Jas O'Carroll,Jack O'Briant,Jack O'Briant,Jas O'Carroll,Mary O'Rourke,Jack O'Briant,Jack O'Briant,Patrick O'Brill,Jas O'Carroll,Doug O'Connell,Anthony O'Donnell,Patrick O'Brill,Rose O'Brian,Patrick O'Brill,Patrick O'Brill,Patrick O'Brill,Jack O'Briant,Doug O'Connell,Rose O'Brian,Patrick O'Donnell,Patrick O'Donnell,Patrick O'Donnell,Patrick O'Brill,Jack O'Briant,Jack O'Briant,Russell D'Ascenzo,Sean O'Donnell,Sean O'Donnell,Sean O'Donnell,Jack O'Briant,Doug O'Connell,Doug O'Connell,Doug O'Connell,Doug O'Connell,Russell D'Ascenzo,Russell D'Ascenzo,Russell D'Ascenzo,Russell D'Ascenzo,Russell D'Ascenzo,Russell D'Ascenzo,Russell D'Ascenzo,Jack O'Briant,Rose O'Brian,Doug O'Connell,Doug O'Connell,Doug O'Connell,Sean O'Donnell,Sean O'Donnell,Patrick O'Brill,Rose O'Brian,Rose O'Brian,Rose O'Brian,Rose O'Brian,Patrick O'Donnell,Patrick O'Donnell,Doug O'Connell,Doug O'Connell,Rose O'Brian,Rose O'Brian,Patrick O'Donnell,Jack O'Briant,Meg O'Connel,Patrick O'Brill
Segment,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Home Office,Home Office,Home Office,Home Office,Consumer,Consumer,Consumer,Consumer,Home Office,Consumer,Consumer,Home Office,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Home Office,Consumer,Home Office,Home Office,Home Office,Consumer,Consumer,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Consumer,Consumer,Corporate,Corporate,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Corporate,Corporate,Consumer,Consumer,Corporate,Corporate,Consumer,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Corporate,Corporate,Consumer,Consumer,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Corporate,Home Office,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Fort Lauderdale,Fort Lauderdale,Westland,Columbia,Los Angeles,Philadelphia,Philadelphia,Philadelphia,Philadelphia,Philadelphia,Memphis,New York City,Chicago,Chicago,Chicago,New York City,New York City,New York City,Burlington,Los Angeles,New York City,New York City,New York City,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles,San Antonio,Missoula,Roswell,Roswell,Philadelphia,Hempstead,Hempstead,Hempstead,Hempstead,Philadelphia,Austin,Austin,Los Angeles,Los Angeles,Freeport,San Francisco,San Francisco,Tallahassee,Tallahassee,Tallahassee,Tallahassee,Tallahassee,The Colony,Hialeah,Hialeah,Clarksville,Clarksville,Columbia,Columbia,Columbia,Columbia,Jacksonville,Philadelphia,Richmond,Richmond,Memphis,Chicago,Los Angeles,Los Angeles,Virginia Beach,San Diego,Los Angeles,Los Angeles,Irving,Seattle,San Francisco,San Francisco,San Francisco,Draper,Lawrence,Arlington,Indianapolis,Indianapolis,Indianapolis,Los Angeles,Franklin,Franklin,Los Angeles,Los Angeles,Seattle,Seattle,Athens,Boynton Beach,Boynton Beach,Houston,Salt Lake City,Chicago,Chicago,Dallas,Dallas,Dallas,Dallas,Dallas,Seattle,San Francisco,Paterson,Paterson,Paterson,Greeley,Greeley,Mansfield,Houston,Houston,Houston,Houston,Marlborough,Marlborough,Pine Bluff,Pine Bluff,Chicago,Chicago,East Orange,Buffalo,Spokane,Houston
State,Florida,Florida,Michigan,South Carolina,California,Pennsylvania,Pennsylvania,Pennsylvania,Pennsylvania,Pennsylvania,Tennessee,New York,Illinois,Illinois,Illinois,New York,New York,New York,Iowa,California,New York,New York,New York,California,California,California,California,California,California,Texas,Montana,Georgia,Georgia,Pennsylvania,New York,New York,New York,New York,Pennsylvania,Texas,Texas,California,California,New York,California,California,Florida,Florida,Florida,Florida,Florida,Texas,Florida,Florida,Tennessee,Tennessee,Tennessee,Tennessee,Tennessee,Tennessee,Florida,Pennsylvania,Kentucky,Kentucky,Tennessee,Illinois,California,California,Virginia,California,California,California,Texas,Washington,California,California,California,Utah,Massachusetts,Virginia,Indiana,Indiana,Indiana,California,Wisconsin,Wisconsin,California,California,Washington,Washington,Georgia,Florida,Florida,Texas,Utah,Illinois,Illinois,Texas,Texas,Texas,Texas,Texas,Washington,California,New Jersey,New Jersey,New Jersey,Colorado,Colorado,Texas,Texas,Texas,Texas,Texas,Massachusetts,Massachusetts,Arkansas,Arkansas,Illinois,Illinois,New Jersey,New York,Washington,Texas


In [35]:
# Ensure there is space between name and surname 

# Regular expression pattern to detect names with at least one space 
space_pattern = r'\s'

# Function to check if a name contains a space
def contains_space(name):
    return bool(re.search(space_pattern, name))

# Apply the function to create a boolean mask
mask3 = df['CustomerName'].apply(contains_space)

# Filter the DataFrame to find names without spaces
df[~mask3]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit


In [36]:
# Split the CustomerName column based on the last space and create 2 columns as: FirstName, LastName

df[['FirstName', 'LastName']] = df['CustomerName'].str.rsplit(n=1, expand=True)

In [37]:
# Check the frequency distribution of names to identify common and rare names.

# Get frequency count of each name

first_name_counts = df['FirstName'].value_counts()

 # Display most common names
 
first_name_counts.tail(20)

FirstName
Grant         6
Angele        5
Georgia       5
Claire        5
Aleksandra    5
Melanie       5
Astrea        5
Victor        5
Elpida        5
Jenna         4
Eileen        4
Jasper        4
Ionia         4
Ritsa         3
Thais         2
Patricia      2
Stefanie      2
Anemone       2
Jocasta       1
Lela          1
Name: count, dtype: int64

In [38]:
# Get frequency count of each lastname

last_name_counts = df['LastName'].value_counts()

 # Display least common lastnames

last_name_counts.tail(20)

LastName
Caffey       4
Taslimi      3
Weirich      3
Skaria       3
Hightower    3
Jenkins      3
Shami        3
Tron         3
Thomas       3
Hale         3
Blacks       3
Ratner       2
Holloman     2
Hirasaki     2
Sissman      2
Odegard      2
Breyer       2
Donovan      1
Rupert       1
Emerson      1
Name: count, dtype: int64

In [39]:
# Function to check for unusual characters and patterns

def is_unusual_name(name):
    # Check for names with digits
    if re.search(r'[0-9]', name):  # Names with digits
        return True
    # Check for names with invalid special characters, excluding apostrophes and 'ö' since there are real names includes both
    if re.search(r'[^a-zA-Z\s\'ö-]', name):  # Names with invalid special characters
        return True
    # Check for unusually short or long names
    if len(name) < 2 or len(name) > 50:  # Unusually short or long names
        return True
    return False

# Apply function to identify unusual names

df['UnusualName'] = df['FirstName'].apply(is_unusual_name)

# Filter names flagged as unusual

unusual_names_df = df[df['UnusualName']]

unusual_names_df

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,UnusualName


In [40]:
# Function to check for unusual characters and patterns
# included 'ö', 'ä', 'ü' as we have Eurepean names which have these characters 

def is_unusual_lastname(name):
    # Check for names with digits
    if re.search(r'[0-9]', name):
        return True
    # Check for names with invalid special characters, excluding apostrophes, 'ö', 'ä', 'ü', and hyphens
    if re.search(r'[^a-zA-Z\s\'öäü-]', name):  # Names with invalid special characters
        return True
    # Check for unusually short or long names
    if len(name) < 2 or len(name) > 50:  # Unusually short or long names
        return True
    return False

# Apply function to identify unusual last names

df['UnusualLastName'] = df['LastName'].apply(is_unusual_lastname)

# Filter names flagged as unusual

unusual_lastnames_df = df[df['UnusualLastName']]

# Display the DataFrame with unusual last names

unusual_lastnames_df.T

Unnamed: 0,912,2004,3661,4840,4998,4999,5635,5636,7327,7328,7329,9687,9769,9770,9771
PurchaseId,CA-2015-133627,US-2017-143028,CA-2016-155005,CA-2017-154123,CA-2016-129238,CA-2016-129238,CA-2017-123022,CA-2017-123022,US-2014-131275,US-2014-131275,US-2014-131275,US-2017-130603,CA-2016-123533,CA-2016-123533,CA-2016-123533
PurchaseDate,2015-05-31 00:00:00,2017-04-11 00:00:00,2016-06-13 00:00:00,2017-11-20 00:00:00,2016-01-31 00:00:00,2016-01-31 00:00:00,2017-09-03 00:00:00,2017-09-03 00:00:00,2014-03-18 00:00:00,2014-03-18 00:00:00,2014-03-18 00:00:00,2017-09-30 00:00:00,2016-11-24 00:00:00,2016-11-24 00:00:00,2016-11-24 00:00:00
ShipDate,2015-06-07 00:00:00,2017-04-18 00:00:00,2016-06-15 00:00:00,2017-11-25 00:00:00,2016-02-04 00:00:00,2016-02-04 00:00:00,2017-09-08 00:00:00,2017-09-08 00:00:00,2014-03-24 00:00:00,2014-03-24 00:00:00,2014-03-24 00:00:00,2017-10-06 00:00:00,2016-11-30 00:00:00,2016-11-30 00:00:00,2016-11-30 00:00:00
ShipMode,Standard Class,Standard Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class
CustomerId,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050,SC-20050
CustomerName,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A,Sample Company A
Segment,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office,Home Office
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Norwich,Lubbock,Jackson,Henderson,Los Angeles,Los Angeles,La Mesa,La Mesa,Burbank,Burbank,Burbank,Arlington,Hialeah,Hialeah,Hialeah
State,Connecticut,Texas,Michigan,Kentucky,California,California,California,California,California,California,California,Texas,Florida,Florida,Florida


In [41]:
unusual_lastnames_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, 912 to 9771
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   PurchaseId       15 non-null     object        
 1   PurchaseDate     15 non-null     datetime64[ns]
 2   ShipDate         15 non-null     datetime64[ns]
 3   ShipMode         15 non-null     object        
 4   CustomerId       15 non-null     object        
 5   CustomerName     15 non-null     object        
 6   Segment          15 non-null     object        
 7   Country          15 non-null     object        
 8   City             15 non-null     object        
 9   State            15 non-null     object        
 10  PostalCode       15 non-null     object        
 11  Region           15 non-null     object        
 12  ProductId        15 non-null     object        
 13  Category         15 non-null     object        
 14  SubCategory      15 non-null     object      

In [42]:
# Identify and correct Sample Company entry in FirstName', 'LastName columns

df.loc[(df['FirstName'] == 'Sample Company') & (df['LastName'] == 'A'), ['FirstName', 'LastName']] = ['Sample', 'Company']

In [43]:
# Ensure that values are updated in CustomerName, FirstName', 'LastName columns


# Apply function to identify unusual last names

df['UnusualLastName'] = df['LastName'].apply(is_unusual_lastname)

# Filter names flagged as unusual

unusual_lastnames_df = df[df['UnusualLastName']]

# Display the DataFrame with unusual last names

unusual_lastnames_df

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,UnusualName,UnusualLastName


In [44]:
# Sort the DataFrame alphabetically by 'FirstName' to check some values in FirstName column

sorted_df = df.sort_values(by='FirstName', ascending=True)

sorted_df['FirstName'].unique()

array(['Aaron', 'Adam', 'Adrian', 'Aimee', 'Alan', 'Alejandro',
       'Aleksandra', 'Alex', 'Alice', 'Allen', 'Alyssa', 'Amy', 'Andrew',
       'Andy', 'Anemone', 'Angele', 'Ann', 'Anna', 'Anne', 'Annie',
       'Anthony', 'Arianne', 'Art', 'Arthur', 'Ashley', 'Astrea',
       'Barbara', 'Barry', 'Bart', 'Becky', 'Ben', 'Benjamin', 'Berenike',
       'Beth', 'Bill', 'Bobby', 'Brad', 'Bradley', 'Brenda', 'Brendan',
       'Brian', 'Brooke', 'Brosina', 'Bruce', 'Bryan', 'Candace', 'Cari',
       'Carl', 'Carlos', 'Carol', 'Caroline', 'Cassandra', 'Catherine',
       'Cathy', 'Chad', 'Charles', 'Charlotte', 'Chloris', 'Chris',
       'Christina', 'Christine', 'Christopher', 'Christy', 'Chuck',
       'Cindy', 'Claire', 'Claudia', 'Clay', 'Clytie', 'Corey', 'Corinna',
       'Craig', 'Cyma', 'Cynthia', 'Cyra', 'Damala', 'Dan', 'Dana',
       'Daniel', 'Dario', 'Darren', 'Darrin', 'Darrin Van', 'Dave',
       'David', 'Dean', 'Deanra', 'Deborah', 'Debra', 'Deirdre',
       'Delfina', 'Deni

In [45]:
# Some Entries has Zuschuss as FirstName value which does not seem like a real name. 

# Step 1: Create a boolean mask where 'FirstName' is 'Zuschuss'
df_name_test = df['FirstName'] == 'Zuschuss'

# Step 2: Apply the mask to filter the DataFrame
filtered_df = df[df_name_test]

filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40 entries, 18 to 8923
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   PurchaseId       40 non-null     object        
 1   PurchaseDate     40 non-null     datetime64[ns]
 2   ShipDate         40 non-null     datetime64[ns]
 3   ShipMode         40 non-null     object        
 4   CustomerId       40 non-null     object        
 5   CustomerName     40 non-null     object        
 6   Segment          40 non-null     object        
 7   Country          40 non-null     object        
 8   City             40 non-null     object        
 9   State            40 non-null     object        
 10  PostalCode       40 non-null     object        
 11  Region           40 non-null     object        
 12  ProductId        40 non-null     object        
 13  Category         40 non-null     object        
 14  SubCategory      40 non-null     object       

#### Entries with 'Zuschuss' as FirstName:

    There are 40 records where the FirstName is 'Zuschuss'. These entries belong to 2 distinct customers.
    We will not be removing or editing these records at this time, as we can not communicate and solve it with business owner. However, this has been noted.

#### Entries with 'Sample Company' and 'A':

    There are 15 records where the CustomerName is 'Sample Company' and the LastName is 'A'.
    We have decided not to remove these records as they contain valuable information in other fields. This issue has also been noted for future correction.

#### The FirstName, LastName, and CustomerName columns have been reviewed to ensure they have the correct format and do not contain unusual values.
#### 8 Entry were corrected which include an invalid character(hypens) (1 Customer with 8 entries: Corey-Lock)

## 7: Segment<a id="7"></a>

In [46]:
df['Segment'].unique()

array(['Consumer', 'Corporate', 'Home Office'], dtype=object)

In [47]:
df['Segment'].value_counts()

Segment
Consumer       5191
Corporate      3020
Home Office    1783
Name: count, dtype: int64

## 8: Country<a id="8"></a>

In [48]:
df['Country'].nunique()

1

In [49]:
df['Country'].unique()

array(['United States'], dtype=object)

## 9: City<a id="9"></a>

In [50]:
df.nunique()

PurchaseId         5009
PurchaseDate       1237
ShipDate           1334
ShipMode              4
CustomerId          793
CustomerName        793
Segment               3
Country               1
City                531
State                49
PostalCode          631
Region                4
ProductId          1862
Category              3
SubCategory          17
ProductName        1850
Sales              5825
Quantity             14
Discount             12
Profit             7287
FirstName           340
LastName            588
UnusualName           1
UnusualLastName       1
dtype: int64

In [51]:
# Create a new DataFrame that shows the count of each unique city in the City column

city_counts = df['City'].value_counts()

city_counts_df = city_counts.reset_index()
city_counts_df.columns = ['City', 'Count']

city_counts_df = city_counts_df.sort_values(by='Count', ascending=False)

city_counts_df

Unnamed: 0,City,Count
0,New York City,915
1,Los Angeles,747
2,Philadelphia,537
3,San Francisco,510
4,Seattle,428
...,...,...
483,Hagerstown,1
482,Arlington Heights,1
481,Baytown,1
479,San Luis Obispo,1


In [52]:
city_counts_df.head(50).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,35,36,34,37,38,39,40,41,42,43,44,45,46,47,48,49
City,New York City,Los Angeles,Philadelphia,San Francisco,Seattle,Houston,Chicago,Columbus,San Diego,Springfield,Dallas,Jacksonville,Detroit,Newark,Richmond,Jackson,Columbia,Aurora,Phoenix,Long Beach,Arlington,San Antonio,Miami,Louisville,Rochester,Charlotte,Henderson,Lakewood,Lancaster,Fairfield,Milwaukee,Denver,Lawrence,Baltimore,Pasadena,San Jose,Cleveland,Fayetteville,Salem,Atlanta,Austin,Franklin,Tampa,Wilmington,Huntsville,Decatur,Toledo,Tucson,Providence,Lafayette
Count,915,747,537,510,428,377,314,222,170,163,157,125,115,95,90,82,81,68,63,61,60,59,57,57,53,52,51,49,46,45,45,44,44,43,42,42,42,41,40,39,39,37,36,36,36,35,32,32,31,31


In [53]:
city_counts_df.tail(50).T

Unnamed: 0,496,527,526,525,524,522,515,521,520,519,518,517,516,497,480,495,468,474,473,472,471,470,469,467,476,466,465,464,463,462,461,475,477,494,487,493,492,491,490,489,488,486,478,485,484,483,482,481,479,530
City,Redwood City,Missouri City,Glenview,San Mateo,Commerce City,Holyoke,Normal,Goldsboro,Montebello,Waukesha,Orland Park,Conroe,Abilene,Bartlett,Champaign,La Quinta,Tinley Park,Antioch,Ontario,Melbourne,Atlantic City,Portage,Davis,Littleton,Ormond Beach,Citrus Heights,Linden,Rogers,Danbury,Iowa City,Kissimmee,Deer Park,Jefferson City,Conway,Rock Hill,Port Orange,Margate,Missoula,Lake Elsinore,Romeoville,Murrieta,Saint Peters,Springdale,Aberdeen,Elyria,Hagerstown,Arlington Heights,Baytown,San Luis Obispo,Manhattan
Count,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [54]:
city_counts_df = city_counts_df.sort_values(by='City')

city_counts_df.head(50).T

Unnamed: 0,485,516,80,118,97,335,212,407,167,58,345,272,474,210,178,399,20,482,340,225,192,39,471,70,17,40,264,99,33,266,497,383,481,280,269,166,251,386,268,369,498,106,353,320,259,241,152,154,424,103
City,Aberdeen,Abilene,Akron,Albuquerque,Alexandria,Allen,Allentown,Altoona,Amarillo,Anaheim,Andover,Ann Arbor,Antioch,Apopka,Apple Valley,Appleton,Arlington,Arlington Heights,Arvada,Asheville,Athens,Atlanta,Atlantic City,Auburn,Aurora,Austin,Avondale,Bakersfield,Baltimore,Bangor,Bartlett,Bayonne,Baytown,Beaumont,Bedford,Belleville,Bellevue,Bellingham,Bethlehem,Beverly,Billings,Bloomington,Boca Raton,Boise,Bolingbrook,Bossier City,Bowling Green,Boynton Beach,Bozeman,Brentwood
Count,1,1,21,14,16,4,7,2,10,27,4,5,1,7,9,2,60,1,4,7,8,39,1,24,68,39,6,16,43,5,1,3,1,5,5,10,6,3,5,3,1,15,3,4,6,6,10,10,2,16


In [55]:
city_counts_df.tail(50).T

Unnamed: 0,158,278,206,468,46,315,125,55,47,63,451,432,284,357,195,509,252,293,203,98,321,257,440,294,153,147,514,162,519,309,164,444,300,356,238,122,93,215,513,218,43,233,358,382,275,305,108,296,523,311
City,Thornton,Thousand Oaks,Tigard,Tinley Park,Toledo,Torrance,Trenton,Troy,Tucson,Tulsa,Tuscaloosa,Twin Falls,Tyler,Urbandale,Utica,Vacaville,Vallejo,Vancouver,Vineland,Virginia Beach,Visalia,Waco,Warner Robins,Warwick,Washington,Waterbury,Waterloo,Watertown,Waukesha,Wausau,Waynesboro,West Allis,West Jordan,West Palm Beach,Westfield,Westland,Westminster,Wheeling,Whittier,Wichita,Wilmington,Wilson,Woodbury,Woodland,Woodstock,Woonsocket,Yonkers,York,Yucaipa,Yuma
Count,10,5,8,1,32,4,13,29,32,26,2,2,5,3,8,1,6,5,8,16,4,6,2,5,10,11,1,10,1,4,10,2,5,3,6,13,17,7,1,7,36,6,3,3,5,4,15,5,1,4


In [56]:
df['City'] = df['City'].str.strip()

df['City'] = df['City'].str.replace(r'\s+', ' ', regex=True)

In [57]:
df['City'].nunique()

531

## 10: FirstName-LastName & CustomerId Matches<a id="10"></a>

### First Letter of Customer Name and Surname should be the first 2 letters of CustomerId according to dataset 

In [58]:
# Define a function to validate the CustomerId

def validate_customer_id(row):
    expected_prefix = row['FirstName'][0] + row['LastName'][0]
    return row['CustomerId'].startswith(expected_prefix)

df['ValidCustomerId'] = df.apply(validate_customer_id, axis=1)

# Filter rows where CustomerId does not follow the rule
invalid_customer_ids_df = df[~df['ValidCustomerId']]

invalid_customer_ids_df.head()

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,UnusualName,UnusualLastName,ValidCustomerId
2,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714,Darrin Van,Huff,False,False,False
487,CA-2014-154627,2014-10-29,2014-10-31,First Class,SA-20830,Sue Ann Reed,Consumer,United States,Chicago,Illinois,60610,Central,TEC-PH-10001363,Technology,Phones,Apple iPhone 5S,2735.952,6,0.2,341.994,Sue Ann,Reed,False,False,False
788,CA-2015-115938,2015-06-26,2015-06-30,Standard Class,SA-20830,Sue Ann Reed,Consumer,United States,Richmond,Virginia,23223,South,OFF-BI-10001543,Office Supplies,Binders,GBC VeloBinder Manual Binding System,143.96,4,0.0,69.1008,Sue Ann,Reed,False,False,False
789,CA-2015-115938,2015-06-26,2015-06-30,Standard Class,SA-20830,Sue Ann Reed,Consumer,United States,Richmond,Virginia,23223,South,OFF-ST-10001321,Office Supplies,Storage,"Decoflex Hanging Personal Folder File, Blue",15.42,1,0.0,4.1634,Sue Ann,Reed,False,False,False
790,CA-2015-115938,2015-06-26,2015-06-30,Standard Class,SA-20830,Sue Ann Reed,Consumer,United States,Richmond,Virginia,23223,South,OFF-BI-10001132,Office Supplies,Binders,"Acco PRESSTEX Data Binder with Storage Hooks, ...",43.04,8,0.0,21.0896,Sue Ann,Reed,False,False,False


In [59]:
invalid_customer_ids_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 66 entries, 2 to 9018
Data columns (total 25 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   PurchaseId       66 non-null     object        
 1   PurchaseDate     66 non-null     datetime64[ns]
 2   ShipDate         66 non-null     datetime64[ns]
 3   ShipMode         66 non-null     object        
 4   CustomerId       66 non-null     object        
 5   CustomerName     66 non-null     object        
 6   Segment          66 non-null     object        
 7   Country          66 non-null     object        
 8   City             66 non-null     object        
 9   State            66 non-null     object        
 10  PostalCode       66 non-null     object        
 11  Region           66 non-null     object        
 12  ProductId        66 non-null     object        
 13  Category         66 non-null     object        
 14  SubCategory      66 non-null     object        

In [60]:
invalid_customer_ids_df['CustomerName'].value_counts()

CustomerName
Sue Ann Reed       21
Mark Van Huff      15
Paul Van Hugh      13
Darrin Van Huff     9
Corey Lock          8
Name: count, dtype: int64

In [61]:
# Correct the CustomerId values based on CustomerName

df.loc[df['CustomerName'] == 'Corey Lock', 'CustomerId'] = 'CL-12640'
df.loc[df['CustomerName'] == 'Sue Ann Reed', 'CustomerId'] = 'SR-20830'

In [62]:
# Darrin Van Huff
df.loc[df['CustomerName'] == 'Darrin Van Huff', ['FirstName', 'LastName']] = ['Darrin', 'Van Huff']

# Mark Van Huff
df.loc[df['CustomerName'] == 'Mark Van Huff', ['FirstName', 'LastName']] = ['Mark', 'Van Huff']

# Paul Van Hugh
df.loc[df['CustomerName'] == 'Paul Van Hugh', ['FirstName', 'LastName']] = ['Paul', 'Van Hugh']

In [63]:
# confirm changes

df['ValidCustomerId'] = df.apply(validate_customer_id, axis=1)

invalid_customer_ids_df = df[~df['ValidCustomerId']]

invalid_customer_ids_df

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,UnusualName,UnusualLastName,ValidCustomerId


### 66 records from 5 Customer were updated. Listed below: 
* Sue Ann Reed       21
* Mark Van Huff      15 
* Paul Van Hugh      13
* Darrin Van Huff     9
* Corey Lock          8

In [64]:
df.columns

Index(['PurchaseId', 'PurchaseDate', 'ShipDate', 'ShipMode', 'CustomerId',
       'CustomerName', 'Segment', 'Country', 'City', 'State', 'PostalCode',
       'Region', 'ProductId', 'Category', 'SubCategory', 'ProductName',
       'Sales', 'Quantity', 'Discount', 'Profit', 'FirstName', 'LastName',
       'UnusualName', 'UnusualLastName', 'ValidCustomerId'],
      dtype='object')

In [65]:
df = df.drop(columns=['UnusualName', 'UnusualLastName', 'ValidCustomerId'])

df = df.reset_index(drop=True).copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   PurchaseId    9994 non-null   object        
 1   PurchaseDate  9994 non-null   datetime64[ns]
 2   ShipDate      9994 non-null   datetime64[ns]
 3   ShipMode      9994 non-null   object        
 4   CustomerId    9994 non-null   object        
 5   CustomerName  9994 non-null   object        
 6   Segment       9994 non-null   object        
 7   Country       9994 non-null   object        
 8   City          9994 non-null   object        
 9   State         9994 non-null   object        
 10  PostalCode    9994 non-null   object        
 11  Region        9994 non-null   object        
 12  ProductId     9994 non-null   object        
 13  Category      9994 non-null   object        
 14  SubCategory   9994 non-null   object        
 15  ProductName   9994 non-null   object  

## 11: State<a id="11"></a>

In [66]:
df['State'].nunique()

49

In [67]:
df['State'].unique()

array(['Kentucky', 'California', 'Florida', 'North Carolina',
       'Washington', 'Texas', 'Wisconsin', 'Utah', 'Nebraska',
       'Pennsylvania', 'Illinois', 'Minnesota', 'Michigan', 'Delaware',
       'Indiana', 'New York', 'Arizona', 'Virginia', 'Tennessee',
       'Alabama', 'South Carolina', 'Oregon', 'Colorado', 'Iowa', 'Ohio',
       'Missouri', 'Oklahoma', 'New Mexico', 'Louisiana', 'Connecticut',
       'New Jersey', 'Massachusetts', 'Georgia', 'Nevada', 'Rhode Island',
       'Mississippi', 'Arkansas', 'Montana', 'New Hampshire', 'Maryland',
       'District of Columbia', 'Kansas', 'Vermont', 'Maine',
       'South Dakota', 'Idaho', 'North Dakota', 'Wyoming',
       'West Virginia'], dtype=object)

In [68]:
df['State'] = df['State'].str.strip()

## 12: PostalCode<a id="12"></a>

In [69]:
# Check if all PostalCode entries are numeric and have 5 digits

df['PostalCode_Check'] = df['PostalCode'].apply(lambda x: x.isdigit() and len(x) == 5)

# Display rows where PostalCode_Check is False

invalid_postal_codes_df = df[~df['PostalCode_Check']]

invalid_postal_codes_df.head(20).T

Unnamed: 0,185,197,267,298,299,300,301,302,306,307,313,346,347,348,366,367,368,369,377,395
PurchaseId,CA-2016-105018,CA-2017-107720,CA-2016-111010,CA-2016-142545,CA-2016-142545,CA-2016-142545,CA-2016-142545,CA-2016-142545,CA-2014-111003,CA-2014-111003,CA-2014-120887,CA-2017-134306,CA-2017-134306,CA-2017-134306,CA-2016-155516,CA-2016-155516,CA-2016-155516,CA-2016-155516,US-2017-134481,CA-2017-165603
PurchaseDate,2016-11-28 00:00:00,2017-11-06 00:00:00,2016-01-22 00:00:00,2016-10-28 00:00:00,2016-10-28 00:00:00,2016-10-28 00:00:00,2016-10-28 00:00:00,2016-10-28 00:00:00,2014-06-01 00:00:00,2014-06-01 00:00:00,2014-09-27 00:00:00,2017-07-08 00:00:00,2017-07-08 00:00:00,2017-07-08 00:00:00,2016-10-21 00:00:00,2016-10-21 00:00:00,2016-10-21 00:00:00,2016-10-21 00:00:00,2017-08-27 00:00:00,2017-10-17 00:00:00
ShipDate,2016-12-02 00:00:00,2017-11-13 00:00:00,2016-01-28 00:00:00,2016-11-03 00:00:00,2016-11-03 00:00:00,2016-11-03 00:00:00,2016-11-03 00:00:00,2016-11-03 00:00:00,2014-06-06 00:00:00,2014-06-06 00:00:00,2014-10-03 00:00:00,2017-07-12 00:00:00,2017-07-12 00:00:00,2017-07-12 00:00:00,2016-10-21 00:00:00,2016-10-21 00:00:00,2016-10-21 00:00:00,2016-10-21 00:00:00,2017-09-01 00:00:00,2017-10-19 00:00:00
ShipMode,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Same Day,Same Day,Same Day,Same Day,Standard Class,Second Class
CustomerId,SK-19990,VM-21685,PG-18895,JD-15895,JD-15895,JD-15895,JD-15895,JD-15895,CR-12625,CR-12625,TS-21205,TD-20995,TD-20995,TD-20995,MK-17905,MK-17905,MK-17905,MK-17905,AR-10405,SS-20140
CustomerName,Sally Knutson,Valerie Mitchum,Paul Gonzalez,Jonathan Doherty,Jonathan Doherty,Jonathan Doherty,Jonathan Doherty,Jonathan Doherty,Corey Roper,Corey Roper,Thomas Seio,Tamara Dahlen,Tamara Dahlen,Tamara Dahlen,Michael Kennedy,Michael Kennedy,Michael Kennedy,Michael Kennedy,Allen Rosenblatt,Saphhira Shifley
Segment,Consumer,Home Office,Consumer,Corporate,Corporate,Corporate,Corporate,Corporate,Home Office,Home Office,Corporate,Consumer,Consumer,Consumer,Corporate,Corporate,Corporate,Corporate,Corporate,Corporate
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Fairfield,Westfield,Morristown,Belleville,Belleville,Belleville,Belleville,Belleville,Lakewood,Lakewood,Hackensack,Lowell,Lowell,Lowell,Manchester,Manchester,Manchester,Manchester,Franklin,Warwick
State,Connecticut,New Jersey,New Jersey,New Jersey,New Jersey,New Jersey,New Jersey,New Jersey,New Jersey,New Jersey,New Jersey,Massachusetts,Massachusetts,Massachusetts,Connecticut,Connecticut,Connecticut,Connecticut,Massachusetts,Rhode Island


In [70]:
invalid_postal_codes_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 449 entries, 185 to 9969
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   PurchaseId        449 non-null    object        
 1   PurchaseDate      449 non-null    datetime64[ns]
 2   ShipDate          449 non-null    datetime64[ns]
 3   ShipMode          449 non-null    object        
 4   CustomerId        449 non-null    object        
 5   CustomerName      449 non-null    object        
 6   Segment           449 non-null    object        
 7   Country           449 non-null    object        
 8   City              449 non-null    object        
 9   State             449 non-null    object        
 10  PostalCode        449 non-null    object        
 11  Region            449 non-null    object        
 12  ProductId         449 non-null    object        
 13  Category          449 non-null    object        
 14  SubCategory       449 non-nu

#### There are 449 entries which have 4 digit Postalcode in USA, 
#### standard ZIP codes typically have 5 digits. However, leading zeros are common in certain regions, meaning that a ZIP code like "04356" is indeed valid
#### we will keep ZIP codes as strings in DataFrame to preserve any leading zeros and add 0 to the begining of the codes with 4 digits

In [71]:
# Make all PostalCodes have 5 digits

df['PostalCode'] = df['PostalCode'].str.zfill(5)

In [72]:
# Confirm Changes

df['PostalCode_Check'] = df['PostalCode'].apply(lambda x: x.isdigit() and len(x) == 5)

# Display rows where PostalCode_Check is False

invalid_postal_codes_df = df[~df['PostalCode_Check']]

invalid_postal_codes_df

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,PostalCode_Check


## 13: Region<a id="13"></a>

In [73]:
df['Region'].unique()

array(['South', 'West', 'Central', 'East'], dtype=object)

In [74]:
df['Region'].value_counts()

Region
West       3203
East       2848
Central    2323
South      1620
Name: count, dtype: int64

## 14: ProductId<a id="14"></a>

In [75]:
# ensure that the ProductId column starts with the same three letters of the Category column

df['ProductIdLower'] = df['ProductId'].str.lower()
df['CategoryLower'] = df['Category'].str.lower()

# Extract the first 3 characters from the lowercase versions

df['ProductIdPrefix'] = df['ProductIdLower'].str[:3]
df['CategoryPrefix'] = df['CategoryLower'].str[:3]

# Check if the prefixes match

df['Match'] = df['ProductIdPrefix'] == df['CategoryPrefix']

# Filter rows where ProductId and Category prefixes do not match

mismatched_entries = df[~df['Match']]

mismatched_entries

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,PostalCode_Check,ProductIdLower,CategoryLower,ProductIdPrefix,CategoryPrefix,Match


In [76]:
# Drop unnecessary columns
df = df.drop(columns=['PostalCode_Check', 'ProductIdLower', 'CategoryLower',
                      'ProductIdPrefix', 'CategoryPrefix', 'Match'])



In [77]:
# ensure that the two-letter code after the first hyphen in ProductId matches the first two letters of SubCategory

df['ProductIdLower'] = df['ProductId'].str.lower()
df['SubCategoryLower'] = df['SubCategory'].str.lower()

# Extract the two-letter code after the first hyphen in ProductId

df['ProductIdCode'] = df['ProductIdLower'].apply(lambda x: x.split('-')[1])

# Extract the first two characters of SubCategory

df['SubCategoryCode'] = df['SubCategoryLower'].str[:2]

# Check if the codes match

df['Match'] = df['ProductIdCode'] == df['SubCategoryCode']

# Filter rows where the codes do not match

mismatched_entries = df[~df['Match']]

mismatched_entries

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,ProductIdLower,SubCategoryLower,ProductIdCode,SubCategoryCode,Match


In [78]:

df = df.drop(columns=['ProductIdLower', 'SubCategoryLower', 'ProductIdCode',
                      'SubCategoryCode', 'Match'])

df = df.reset_index(drop=True)

## 15: Category<a id="15"></a>

In [79]:
df['Category'].nunique()

3

In [80]:
df['Category'].value_counts()

Category
Office Supplies    6026
Furniture          2121
Technology         1847
Name: count, dtype: int64

## 16: SubCategory<a id="16"></a>

In [81]:
df['SubCategory'].nunique()

17

In [82]:
df['SubCategory'].unique()

array(['Bookcases', 'Chairs', 'Labels', 'Tables', 'Storage',
       'Furnishings', 'Art', 'Phones', 'Binders', 'Appliances', 'Paper',
       'Accessories', 'Envelopes', 'Fasteners', 'Supplies', 'Machines',
       'Copiers'], dtype=object)

In [83]:
df['SubCategory'].value_counts()

SubCategory
Binders        1523
Paper          1370
Furnishings     957
Phones          889
Storage         846
Art             796
Accessories     775
Chairs          617
Appliances      466
Labels          364
Tables          319
Envelopes       254
Bookcases       228
Fasteners       217
Supplies        190
Machines        115
Copiers          68
Name: count, dtype: int64

## 17: ProductName<a id="17"></a>

In [84]:
df['ProductName'].nunique()

1850

In [85]:
# Get the counts of each unique ProductName
product_name_counts = df['ProductName'].value_counts()

# Create a new DataFrame from the counts
ProductName_df = product_name_counts.reset_index()
ProductName_df.columns = ['ProductName', 'Count']

# Sort the DataFrame alphabetically by ProductName
ProductName_df = ProductName_df.sort_values(by='ProductName').reset_index(drop=True)

ProductName_df.head(50).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
ProductName,"""While you Were Out"" Message Book, One Form pe...","#10 Gummed Flap White Envelopes, 100/Box",#10 Self-Seal White Envelopes,"#10 White Business Envelopes,4 1/8 x 9 1/2","#10- 4 1/8"" x 9 1/2"" Recycled Envelopes","#10- 4 1/8"" x 9 1/2"" Security-Tint Envelopes","#10-4 1/8"" x 9 1/2"" Premium Diagonal Seam Enve...",#6 3/4 Gummed Flap White Envelopes,"1.7 Cubic Foot Compact ""Cube"" Office Refrigera...",1/4 Fold Party Design Invitations & White Enve...,12 Colored Short Pencils,12-1/2 Diameter Round Wall Clock,14-7/8 x 11 Blue Bar Computer Printout Paper,2300 Heavy-Duty Transfer File Systems by Perma,"24 Capacity Maxi Data Binder Racks, Pearl",24-Hour Round Wall Clock,3-ring staple pack,3.6 Cubic Foot Counter Height Office Refrigerator,36X48 HARDFLOOR CHAIRMAT,"3D Systems Cube Printer, 2nd Generation, Magenta","3D Systems Cube Printer, 2nd Generation, White",3M Hangers With Command Adhesive,3M Office Air Cleaner,3M Organizer Strips,3M Polarizing Light Filter Sleeves,"3M Polarizing Task Lamp with Clamp Arm, Light ...",3M Replacement Filter for Office Air Cleaner f...,4009 Highlighters,4009 Highlighters by Sanford,50 Colored Long Pencils,"6"" Cubicle Wall Clock, Black",9-3/4 Diameter Round Wall Clock,"ACCOHIDE 3-Ring Binder, Blue, 1""",ACCOHIDE Binder by Acco,APC 7 Outlet Network SurgeArrest Surge Protector,ARKON Windshield Dashboard Air Vent Car Mount ...,AT&T 1070 Corded Phone,AT&T 1080 Corded phone,AT&T 1080 Phone,AT&T 17929 Lendline Telephone,AT&T 841000 Phone,AT&T CL2909,AT&T CL82213,AT&T CL83451 4-Handset Telephone,AT&T EL51110 DECT,AT&T SB67148 SynJ,AT&T TR1909W,Aastra 57i VoIP phone,Aastra 6757i CT Wireless VoIP phone,Acco 3-Hole Punch
Count,3,4,4,7,10,8,2,4,6,2,3,8,4,4,4,6,6,5,6,2,2,8,2,7,4,4,4,1,8,5,4,8,13,4,8,4,3,7,3,10,7,3,5,4,1,3,4,7,6,9


In [86]:
ProductName_df.tail(50).T

Unnamed: 0,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815,1816,1817,1818,1819,1820,1821,1822,1823,1824,1825,1826,1827,1828,1829,1830,1831,1832,1833,1834,1835,1836,1837,1838,1839,1840,1841,1842,1843,1844,1845,1846,1847,1848,1849
ProductName,Xerox 205,Xerox 206,Xerox 207,Xerox 208,Xerox 209,Xerox 21,Xerox 210,Xerox 211,Xerox 212,Xerox 213,Xerox 214,Xerox 215,Xerox 216,Xerox 217,Xerox 218,Xerox 219,Xerox 22,Xerox 220,Xerox 221,Xerox 222,Xerox 223,Xerox 224,Xerox 225,Xerox 226,Xerox 227,Xerox 228,Xerox 229,Xerox 23,Xerox 230,Xerox 231,Xerox 232,Xerox 4200 Series MultiUse Premium Copy Paper ...,Xerox Blank Computer Paper,"Xerox Color Copier Paper, 11"" x 17"", Ream",Xerox WorkCentre 6505DN Laser Multifunction Pr...,Xiaomi Mi3,"XtraLife ClearVue Slant-D Ring Binder, White, 3""",XtraLife ClearVue Slant-D Ring Binders by Card...,Zebra GK420t Direct Thermal/Thermal Transfer P...,Zebra GX420t Direct Thermal/Thermal Transfer P...,Zebra ZM400 Thermal Label Printer,Zebra Zazzle Fluorescent Highlighters,Zipper Ring Binder Pockets,i.Sound Portable Power - 8000 mAh,iHome FM Clock Radio with Lightning Dock,iKross Bluetooth Portable Keyboard + Cell Phon...,iOttie HLCRIO102 Car Mount,iOttie XL Car Mount,invisibleSHIELD by ZAGG Smudge-Free Screen Pro...,netTALK DUO VoIP Telephone Service
Count,4,5,2,2,5,7,6,4,7,6,9,2,8,8,5,4,3,8,4,7,3,6,9,11,8,4,3,6,5,5,4,8,1,6,1,1,10,5,1,3,2,6,13,5,5,5,5,2,7,7


#### There are entries which start with # symbol, we will erase this from the beginning of the product name. 
#### There are also entries with i(small) letter which locate themself after letter Z (capital), i should be capitol letter to be able to sort ProductNames alphabetically.  

In [87]:
df['ProductName'] = df['ProductName'].str.lstrip('#')

In [88]:
df['ProductName'] = df['ProductName'].str.strip()

In [89]:
df['ProductName'] = df['ProductName'].str.title()

In [90]:
# Confirm Changes 

product_name_counts = df['ProductName'].value_counts()

ProductName_df = product_name_counts.reset_index()
ProductName_df.columns = ['ProductName', 'Count']

ProductName_df = ProductName_df.sort_values(by='ProductName').reset_index(drop=True)

ProductName_df.head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
ProductName,"""While You Were Out"" Message Book, One Form Pe...","1.7 Cubic Foot Compact ""Cube"" Office Refrigera...",1/4 Fold Party Design Invitations & White Enve...,"10 Gummed Flap White Envelopes, 100/Box",10 Self-Seal White Envelopes,"10 White Business Envelopes,4 1/8 X 9 1/2","10- 4 1/8"" X 9 1/2"" Recycled Envelopes","10- 4 1/8"" X 9 1/2"" Security-Tint Envelopes","10-4 1/8"" X 9 1/2"" Premium Diagonal Seam Envel...",12 Colored Short Pencils,12-1/2 Diameter Round Wall Clock,14-7/8 X 11 Blue Bar Computer Printout Paper,2300 Heavy-Duty Transfer File Systems By Perma,"24 Capacity Maxi Data Binder Racks, Pearl",24-Hour Round Wall Clock,3-Ring Staple Pack,3.6 Cubic Foot Counter Height Office Refrigerator,36X48 Hardfloor Chairmat,"3D Systems Cube Printer, 2Nd Generation, Magenta","3D Systems Cube Printer, 2Nd Generation, White"
Count,3,6,2,4,4,7,10,8,2,3,8,4,4,4,6,6,5,6,2,2


In [91]:
ProductName_df.tail(20).T

Unnamed: 0,1830,1831,1832,1833,1834,1835,1836,1837,1838,1839,1840,1841,1842,1843,1844,1845,1846,1847,1848,1849
ProductName,Xerox 226,Xerox 227,Xerox 228,Xerox 229,Xerox 23,Xerox 230,Xerox 231,Xerox 232,Xerox 4200 Series Multiuse Premium Copy Paper ...,Xerox Blank Computer Paper,"Xerox Color Copier Paper, 11"" X 17"", Ream",Xerox Workcentre 6505Dn Laser Multifunction Pr...,Xiaomi Mi3,"Xtralife Clearvue Slant-D Ring Binder, White, 3""",Xtralife Clearvue Slant-D Ring Binders By Card...,Zebra Gk420T Direct Thermal/Thermal Transfer P...,Zebra Gx420T Direct Thermal/Thermal Transfer P...,Zebra Zazzle Fluorescent Highlighters,Zebra Zm400 Thermal Label Printer,Zipper Ring Binder Pockets
Count,11,8,4,3,6,5,5,4,8,1,6,1,1,10,5,1,3,6,2,13


In [92]:
## 1 entry left which does not start with a number or letter, ("While You Were Out"). 
## Edit entries with "While You Were Out"  manually

# Filtering the DataFrame to find the row(s) containing the While you Were Out in product name

filtered_df= df[df['ProductName'].str.contains("While you Were Out", case=False, na=False)]

filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21 entries, 220 to 8323
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   PurchaseId    21 non-null     object        
 1   PurchaseDate  21 non-null     datetime64[ns]
 2   ShipDate      21 non-null     datetime64[ns]
 3   ShipMode      21 non-null     object        
 4   CustomerId    21 non-null     object        
 5   CustomerName  21 non-null     object        
 6   Segment       21 non-null     object        
 7   Country       21 non-null     object        
 8   City          21 non-null     object        
 9   State         21 non-null     object        
 10  PostalCode    21 non-null     object        
 11  Region        21 non-null     object        
 12  ProductId     21 non-null     object        
 13  Category      21 non-null     object        
 14  SubCategory   21 non-null     object        
 15  ProductName   21 non-null     object       

In [93]:
filtered_df['ProductName'].unique()

array(['While You Were Out Pads, 50 Per Pad, 4 X 5 1/4, Green Cycle',
       'Recycled Desk Saver Line "While You Were Out" Book, 5 1/2" X 4"',
       'Message Book, Standard Line "While You Were Out", 5 1/2" X 4", 200 Sets/Book',
       '"While You Were Out" Message Book, One Form Per Page',
       'Adams "While You Were Out" Message Pads',
       'Standard Line \x93While You Were Out\x94 Hardbound Telephone Message Book'],
      dtype=object)

In [94]:
# Update specific entries in the ProductName column

df.loc[df['ProductName'] == 'Recycled Desk Saver Line "While You Were Out" Book, 5 1/2" X 4"', 'ProductName'] = 'Recycled Desk Saver Line, While You Were Out, Book, 5 1/2" X 4"'

df.loc[df['ProductName'] == '"While You Were Out" Message Book, One Form Per Page', 'ProductName'] = 'While You Were Out, Message Book, One Form Per Page'

df.loc[df['ProductName'] == 'Adams "While You Were Out" Message Pads', 'ProductName'] = 'Adams, While You Were Out, Message Pads'

df.loc[df['ProductName'] == 'Standard Line \x93While You Were Out\x94 Hardbound Telephone Message Book', 'ProductName'] = 'Standard Line, While You Were Out, Hardbound Telephone Message Book'

In [95]:
# Confirm Changes 

product_name_counts = df['ProductName'].value_counts()

ProductName_df = product_name_counts.reset_index()
ProductName_df.columns = ['ProductName', 'Count']

ProductName_df = ProductName_df.sort_values(by='ProductName').reset_index(drop=True)

ProductName_df

Unnamed: 0,ProductName,Count
0,"1.7 Cubic Foot Compact ""Cube"" Office Refrigera...",6
1,1/4 Fold Party Design Invitations & White Enve...,2
2,"10 Gummed Flap White Envelopes, 100/Box",4
3,10 Self-Seal White Envelopes,4
4,"10 White Business Envelopes,4 1/8 X 9 1/2",7
...,...,...
1845,Zebra Gk420T Direct Thermal/Thermal Transfer P...,1
1846,Zebra Gx420T Direct Thermal/Thermal Transfer P...,3
1847,Zebra Zazzle Fluorescent Highlighters,6
1848,Zebra Zm400 Thermal Label Printer,2


In [96]:
df.nunique()

PurchaseId      5009
PurchaseDate    1237
ShipDate        1334
ShipMode           4
CustomerId       793
CustomerName     793
Segment            3
Country            1
City             531
State             49
PostalCode       631
Region             4
ProductId       1862
Category           3
SubCategory       17
ProductName     1850
Sales           5825
Quantity          14
Discount          12
Profit          7287
FirstName        337
LastName         588
dtype: int64

## 18: Create CatalogPrice and SupplierPrice Columns<a id="18"></a>

In [97]:
# CatalogPrice
df['CatalogPrice'] = (df['Sales'] / (df['Quantity'] * (1 - df['Discount']))).round(2)

# SupplierPrice
df['SupplierPrice'] = (df['CatalogPrice'] - (df['Profit'] / df['Quantity'])).round(2)

# Ensure no negative SupplierPrice values
df.loc[df['SupplierPrice'] < 0, 'SupplierPrice'] = 0

In [98]:
df = df.copy().reset_index(drop=True)

In [99]:
df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
PurchaseId,CA-2016-152156,CA-2016-152156,CA-2016-138688,US-2015-108966,US-2015-108966,CA-2014-115812,CA-2014-115812,CA-2014-115812,CA-2014-115812,CA-2014-115812
PurchaseDate,2016-11-08 00:00:00,2016-11-08 00:00:00,2016-06-12 00:00:00,2015-10-11 00:00:00,2015-10-11 00:00:00,2014-06-09 00:00:00,2014-06-09 00:00:00,2014-06-09 00:00:00,2014-06-09 00:00:00,2014-06-09 00:00:00
ShipDate,2016-11-11 00:00:00,2016-11-11 00:00:00,2016-06-16 00:00:00,2015-10-18 00:00:00,2015-10-18 00:00:00,2014-06-14 00:00:00,2014-06-14 00:00:00,2014-06-14 00:00:00,2014-06-14 00:00:00,2014-06-14 00:00:00
ShipMode,Second Class,Second Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class
CustomerId,CG-12520,CG-12520,DV-13045,SO-20335,SO-20335,BH-11710,BH-11710,BH-11710,BH-11710,BH-11710
CustomerName,Claire Gute,Claire Gute,Darrin Van Huff,Sean O'Donnell,Sean O'Donnell,Brosina Hoffman,Brosina Hoffman,Brosina Hoffman,Brosina Hoffman,Brosina Hoffman
Segment,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Henderson,Henderson,Los Angeles,Fort Lauderdale,Fort Lauderdale,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles
State,Kentucky,Kentucky,California,Florida,Florida,California,California,California,California,California


In [100]:
df.head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
PurchaseId,CA-2016-152156,CA-2016-152156,CA-2016-138688,US-2015-108966,US-2015-108966,CA-2014-115812,CA-2014-115812,CA-2014-115812,CA-2014-115812,CA-2014-115812,CA-2014-115812,CA-2014-115812,CA-2017-114412,CA-2016-161389,US-2015-118983,US-2015-118983,CA-2014-105893,CA-2014-167164,CA-2014-143336,CA-2014-143336
PurchaseDate,2016-11-08 00:00:00,2016-11-08 00:00:00,2016-06-12 00:00:00,2015-10-11 00:00:00,2015-10-11 00:00:00,2014-06-09 00:00:00,2014-06-09 00:00:00,2014-06-09 00:00:00,2014-06-09 00:00:00,2014-06-09 00:00:00,2014-06-09 00:00:00,2014-06-09 00:00:00,2017-04-15 00:00:00,2016-12-05 00:00:00,2015-11-22 00:00:00,2015-11-22 00:00:00,2014-11-11 00:00:00,2014-05-13 00:00:00,2014-08-27 00:00:00,2014-08-27 00:00:00
ShipDate,2016-11-11 00:00:00,2016-11-11 00:00:00,2016-06-16 00:00:00,2015-10-18 00:00:00,2015-10-18 00:00:00,2014-06-14 00:00:00,2014-06-14 00:00:00,2014-06-14 00:00:00,2014-06-14 00:00:00,2014-06-14 00:00:00,2014-06-14 00:00:00,2014-06-14 00:00:00,2017-04-20 00:00:00,2016-12-10 00:00:00,2015-11-26 00:00:00,2015-11-26 00:00:00,2014-11-18 00:00:00,2014-05-15 00:00:00,2014-09-01 00:00:00,2014-09-01 00:00:00
ShipMode,Second Class,Second Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Second Class,Second Class,Second Class
CustomerId,CG-12520,CG-12520,DV-13045,SO-20335,SO-20335,BH-11710,BH-11710,BH-11710,BH-11710,BH-11710,BH-11710,BH-11710,AA-10480,IM-15070,HP-14815,HP-14815,PK-19075,AG-10270,ZD-21925,ZD-21925
CustomerName,Claire Gute,Claire Gute,Darrin Van Huff,Sean O'Donnell,Sean O'Donnell,Brosina Hoffman,Brosina Hoffman,Brosina Hoffman,Brosina Hoffman,Brosina Hoffman,Brosina Hoffman,Brosina Hoffman,Andrew Allen,Irene Maddox,Harold Pawlan,Harold Pawlan,Pete Kriz,Alejandro Grove,Zuschuss Donatelli,Zuschuss Donatelli
Segment,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Home Office,Home Office,Consumer,Consumer,Consumer,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Henderson,Henderson,Los Angeles,Fort Lauderdale,Fort Lauderdale,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Concord,Seattle,Fort Worth,Fort Worth,Madison,West Jordan,San Francisco,San Francisco
State,Kentucky,Kentucky,California,Florida,Florida,California,California,California,California,California,California,California,North Carolina,Washington,Texas,Texas,Wisconsin,Utah,California,California


## 19: ProductName-ProductId Pairs<a id="19"></a>

### Review the unmatch of number of unique values between ProductId (1862) and ProductName (1850) columns   

In [101]:
# check if there are duplicated ProductId values in unique ProductName values

# Count unique ProductId for each ProductName

product_name_counts = df.groupby('ProductName')['ProductId'].nunique().reset_index()

product_name_counts.columns = ['ProductName', 'UniqueProductIdCount']

# Find ProductNames which have more than one ProductId

non_unique_products = product_name_counts[product_name_counts['UniqueProductIdCount'] > 1]


if not non_unique_products.empty:
    print("Non-unique ProductNames:")
    print(non_unique_products)
else:
    print("All ProductNames are unique for ProductIds.")

Non-unique ProductNames:
                                        ProductName  UniqueProductIdCount
5            10- 4 1/8" X 9 1/2" Recycled Envelopes                     2
258                         Avery Non-Stick Binders                     2
536                               Easy-Staple Paper                     8
590                     Eldon Wave Desk Accessories                     2
962                      Ki Adjustable-Height Table                     2
1223                          Okidata C610N Printer                     2
1254  Peel & Seel Recycled Catalog Envelopes, Brown                     2
1303                       Prang Drawing Pencil Set                     2
1499                                Staple Envelope                     9
1500                                  Staple Holder                     3
1501                                  Staple Magnet                     2
1502                                 Staple Remover                     3
1503         

In [102]:
non_unique_products.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 5 to 1517
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ProductName           16 non-null     object
 1   UniqueProductIdCount  16 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 384.0+ bytes


In [103]:
# Get the list of ProductNames with more than one unique ProductId

affected_product_names = non_unique_products['ProductName']

# Filter the main DataFrame to include only rows with these ProductNames

affected_entries = df[df['ProductName'].isin(affected_product_names)]

affected_entries.shape[0]

314

#### There are 16 ProductName which have multiple ProductId values
#### Total number of entries with these names is 314 of all entries
#### We will check ProductName values if they are same product or not

In [104]:
# check some examples to understand the issue

df_id_name = df[df['ProductName'] == '10- 4 1/8" X 9 1/2" Recycled Envelopes']

df_id_name = df_id_name.sort_values(by='SupplierPrice')

df_id_name.head(20).T

Unnamed: 0,1989,2446,2948,4378,5216,7720,985,4992,5688,6301
PurchaseId,CA-2015-127509,CA-2015-100573,CA-2017-169859,CA-2015-140830,US-2017-159562,CA-2015-108588,CA-2017-100314,CA-2015-153038,CA-2014-131541,US-2014-161305
PurchaseDate,2015-11-09 00:00:00,2015-09-25 00:00:00,2017-12-14 00:00:00,2015-11-30 00:00:00,2017-09-09 00:00:00,2015-01-05 00:00:00,2017-09-29 00:00:00,2015-12-18 00:00:00,2014-07-28 00:00:00,2014-06-06 00:00:00
ShipDate,2015-11-13 00:00:00,2015-10-01 00:00:00,2017-12-18 00:00:00,2015-12-02 00:00:00,2017-09-15 00:00:00,2015-01-10 00:00:00,2017-10-05 00:00:00,2015-12-25 00:00:00,2014-07-28 00:00:00,2014-06-12 00:00:00
ShipMode,Standard Class,Standard Class,Standard Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Same Day,Standard Class
CustomerId,AS-10090,AM-10705,MP-18175,PS-18970,JB-16000,BG-11695,AS-10630,RB-19645,CK-12205,SB-20170
CustomerName,Adam Shillingsburg,Anne Mcfarland,Mike Pelletier,Paul Stevenson,Joy Bell-,Brooke Gillingham,Ann Steele,Robert Barroso,Chloris Kastensmidt,Sarah Bern
Segment,Consumer,Consumer,Home Office,Home Office,Consumer,Corporate,Home Office,Corporate,Consumer,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Springfield,Los Angeles,San Diego,Henderson,Roseville,New York City,Pasadena,Memphis,Apopka,Chicago
State,Missouri,California,California,Kentucky,Michigan,New York,Texas,Tennessee,Florida,Illinois


In [105]:
df_id_name[['PurchaseId', 'PurchaseDate', 'ProductId', 'ProductName', 'CatalogPrice', 'SupplierPrice']]

Unnamed: 0,PurchaseId,PurchaseDate,ProductId,ProductName,CatalogPrice,SupplierPrice
1989,CA-2015-127509,2015-11-09,OFF-EN-10000781,"10- 4 1/8"" X 9 1/2"" Recycled Envelopes",8.74,4.63
2446,CA-2015-100573,2015-09-25,OFF-EN-10000461,"10- 4 1/8"" X 9 1/2"" Recycled Envelopes",8.74,4.63
2948,CA-2017-169859,2017-12-14,OFF-EN-10000461,"10- 4 1/8"" X 9 1/2"" Recycled Envelopes",8.74,4.63
4378,CA-2015-140830,2015-11-30,OFF-EN-10000461,"10- 4 1/8"" X 9 1/2"" Recycled Envelopes",8.74,4.63
5216,US-2017-159562,2017-09-09,OFF-EN-10000461,"10- 4 1/8"" X 9 1/2"" Recycled Envelopes",8.74,4.63
7720,CA-2015-108588,2015-01-05,OFF-EN-10000461,"10- 4 1/8"" X 9 1/2"" Recycled Envelopes",8.74,4.63
985,CA-2017-100314,2017-09-29,OFF-EN-10000461,"10- 4 1/8"" X 9 1/2"" Recycled Envelopes",8.74,6.38
4992,CA-2015-153038,2015-12-18,OFF-EN-10000461,"10- 4 1/8"" X 9 1/2"" Recycled Envelopes",8.74,6.38
5688,CA-2014-131541,2014-07-28,OFF-EN-10000781,"10- 4 1/8"" X 9 1/2"" Recycled Envelopes",8.74,6.38
6301,US-2014-161305,2014-06-06,OFF-EN-10000461,"10- 4 1/8"" X 9 1/2"" Recycled Envelopes",8.74,6.38


In [106]:
df_id_name.value_counts('ProductId')

ProductId
OFF-EN-10000461    8
OFF-EN-10000781    2
Name: count, dtype: int64

In [107]:
df_id_name = df[df['ProductName'] == 'Avery Non-Stick Binders']

df_id_name = df_id_name.sort_values(by='SupplierPrice')

df_id_name.head(20).T

Unnamed: 0,6268,2531,5386,7010,5522,2651,3099,336,5143,5114,4274,3842,1848,4844,7191,3687,5486,2167,1102,7548
PurchaseId,CA-2015-115392,CA-2014-111500,CA-2017-152786,US-2014-135881,CA-2016-110982,CA-2017-112515,CA-2017-131233,CA-2015-137946,CA-2016-111213,CA-2016-150658,CA-2015-142692,CA-2014-101931,CA-2017-128370,US-2014-128685,CA-2016-116918,CA-2016-108567,CA-2015-156608,CA-2016-154018,US-2017-145863,CA-2014-103492
PurchaseDate,2015-10-01 00:00:00,2014-08-17 00:00:00,2017-03-12 00:00:00,2014-05-23 00:00:00,2016-06-05 00:00:00,2017-09-17 00:00:00,2017-04-14 00:00:00,2015-09-01 00:00:00,2016-04-01 00:00:00,2016-11-17 00:00:00,2015-10-23 00:00:00,2014-10-28 00:00:00,2017-09-10 00:00:00,2014-04-04 00:00:00,2016-10-01 00:00:00,2016-05-20 00:00:00,2015-10-24 00:00:00,2016-10-13 00:00:00,2017-04-21 00:00:00,2014-10-10 00:00:00
ShipDate,2015-10-04 00:00:00,2014-08-21 00:00:00,2017-03-17 00:00:00,2014-05-27 00:00:00,2016-06-07 00:00:00,2017-09-21 00:00:00,2017-04-19 00:00:00,2015-09-04 00:00:00,2016-04-05 00:00:00,2016-11-22 00:00:00,2015-10-28 00:00:00,2014-10-31 00:00:00,2017-09-10 00:00:00,2014-04-05 00:00:00,2016-10-06 00:00:00,2016-05-24 00:00:00,2015-10-29 00:00:00,2016-10-19 00:00:00,2017-04-27 00:00:00,2014-10-15 00:00:00
ShipMode,Second Class,Standard Class,Standard Class,Standard Class,First Class,Second Class,Standard Class,Second Class,Standard Class,Standard Class,Standard Class,First Class,Same Day,First Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class
CustomerId,RM-19675,DJ-13510,GA-14725,GT-14710,CK-12205,AS-10225,CS-12355,DB-13615,FP-14320,MS-17365,AG-10495,TS-21370,FH-14275,MZ-17515,JK-15205,DB-13210,MT-18070,HA-14920,RP-19390,CM-12715
CustomerName,Robert Marley,Don Jones,Guy Armstrong,Greg Tran,Chloris Kastensmidt,Alan Schoenberger,Christine Sundaresam,Doug Bickford,Frank Preis,Maribeth Schnelling,Andrew Gjertsen,Todd Sumrall,Frank Hawley,Mary Zewe,Jamie Kunitz,Dean Braden,Michelle Tran,Helen Andreada,Resi Pölking,Craig Molinari
Segment,Home Office,Corporate,Consumer,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Consumer,Corporate,Corporate,Corporate,Corporate,Consumer,Consumer,Home Office,Consumer,Consumer,Corporate
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Cambridge,Little Rock,Rogers,New York City,Santa Clara,Provo,New York City,Los Angeles,New York City,Carlsbad,Seattle,Los Angeles,Los Angeles,Los Angeles,Hialeah,Port Saint Lucie,San Antonio,Laredo,Houston,Huntsville
State,Massachusetts,Arkansas,Arkansas,New York,California,Utah,New York,California,New York,New Mexico,Washington,California,California,California,Florida,Florida,Texas,Texas,Texas,Texas


### Both ProductId values have same CatalogPrice but different SupplierPrices. Hard to detect if It is same product or 2 different products.

* Despite having the same ProductId, CatalogPrice and ProductName, there are differences at SupplierPrices which shows they may be different products. 
* they may also be the same product which is coming with different prices in different times.  

In [108]:
# Filter the DataFrame to include only rows where ProductName is 'Easy-Staple Paper'
df_id_name = df[df['ProductName'] == 'Easy-Staple Paper']

# Sort by SupplierPrice (ascending order by default)
df_id_name = df_id_name.sort_values(by='SupplierPrice')

# Display the first 50 rows transposed for better readability
df_id_name.head(10).T

Unnamed: 0,5896,8669,9965,1080,1702,3653,4930,2601,7468,1747
PurchaseId,CA-2014-168305,CA-2015-158918,CA-2016-146374,CA-2015-110016,CA-2015-114069,CA-2017-109960,US-2014-138828,CA-2016-165848,CA-2014-138100,CA-2016-155670
PurchaseDate,2014-11-01 00:00:00,2015-12-20 00:00:00,2016-12-05 00:00:00,2015-11-29 00:00:00,2015-07-13 00:00:00,2017-12-09 00:00:00,2014-09-02 00:00:00,2016-06-04 00:00:00,2014-09-15 00:00:00,2016-08-13 00:00:00
ShipDate,2014-11-08 00:00:00,2015-12-25 00:00:00,2016-12-10 00:00:00,2015-12-04 00:00:00,2015-07-15 00:00:00,2017-12-11 00:00:00,2014-09-03 00:00:00,2016-06-04 00:00:00,2014-09-20 00:00:00,2016-08-17 00:00:00
ShipMode,Standard Class,Second Class,Second Class,Standard Class,Second Class,Second Class,First Class,Same Day,Standard Class,Second Class
CustomerId,PL-18925,AI-10855,HE-14800,BT-11395,ND-18370,DB-13210,KD-16345,EN-13780,AA-10315,EM-14065
CustomerName,Paul Lucas,Arianne Irving,Harold Engle,Bill Tyler,Natalie Decherney,Dean Braden,Katherine Ducich,Edward Nazzal,Alex Avila,Erin Mull
Segment,Home Office,Consumer,Corporate,Corporate,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Los Angeles,Los Angeles,Newark,Detroit,New York City,Detroit,New York City,New York City,New York City,Jacksonville
State,California,California,Delaware,Michigan,New York,Michigan,New York,New York,New York,North Carolina


In [109]:
df_id_name.value_counts('ProductId')

ProductId
OFF-PA-10000474    12
OFF-PA-10000349     9
OFF-PA-10000249     6
OFF-PA-10003127     6
OFF-PA-10001685     5
OFF-PA-10002764     3
OFF-PA-10004947     3
OFF-PA-10000565     2
Name: count, dtype: int64

In [110]:
df_id_name[['PurchaseId', 'PurchaseDate', 'ProductId', 'ProductName', 'CatalogPrice', 'SupplierPrice']].sort_values(by=['ProductId','PurchaseDate'])

Unnamed: 0,PurchaseId,PurchaseDate,ProductId,ProductName,CatalogPrice,SupplierPrice
2734,CA-2015-129770,2015-02-21,OFF-PA-10000249,Easy-Staple Paper,12.28,6.51
8573,CA-2015-164623,2015-10-20,OFF-PA-10000249,Easy-Staple Paper,12.28,6.51
7101,CA-2016-144337,2016-08-01,OFF-PA-10000249,Easy-Staple Paper,12.28,8.96
8147,US-2017-146822,2017-06-09,OFF-PA-10000249,Easy-Staple Paper,12.28,6.51
347,CA-2017-134306,2017-07-08,OFF-PA-10000249,Easy-Staple Paper,12.28,6.51
34,CA-2017-107727,2017-10-19,OFF-PA-10000249,Easy-Staple Paper,12.28,8.96
4930,US-2014-138828,2014-09-02,OFF-PA-10000349,Easy-Staple Paper,4.98,2.64
7468,CA-2014-138100,2014-09-15,OFF-PA-10000349,Easy-Staple Paper,4.98,2.64
9418,CA-2014-148285,2014-10-27,OFF-PA-10000349,Easy-Staple Paper,4.98,3.64
4131,CA-2015-111703,2015-07-02,OFF-PA-10000349,Easy-Staple Paper,4.98,3.64


#### 8 different ProductId with a big Price difference(from 2.35 to 28.55) sharing same name. 

In [111]:
# Run same function to see ProductId values which have multiple ProductNames

# Count unique ProductName per ProductId
product_id_counts = df.groupby('ProductId')['ProductName'].nunique().reset_index()

# Rename columns for clarity
product_id_counts.columns = ['ProductId', 'UniqueProductNameCount']

# Find ProductNames associated with more than one ProductId
non_unique_products = product_id_counts[product_id_counts['UniqueProductNameCount'] > 1]

# Check if there are any such products
if not non_unique_products.empty:
    print("Non-unique ProductNames:")
    print(non_unique_products)
else:
    print("All ProductNames are unique across ProductIds.")

Non-unique ProductNames:
            ProductId  UniqueProductNameCount
18    FUR-BO-10002213                       2
65    FUR-CH-10001146                       2
183   FUR-FU-10001473                       2
284   FUR-FU-10004017                       2
290   FUR-FU-10004091                       2
295   FUR-FU-10004270                       2
310   FUR-FU-10004848                       2
311   FUR-FU-10004864                       2
387   OFF-AP-10000576                       2
506   OFF-AR-10001149                       2
721   OFF-BI-10002026                       2
831   OFF-BI-10004632                       2
832   OFF-BI-10004654                       2
1044  OFF-PA-10000357                       2
1049  OFF-PA-10000477                       2
1064  OFF-PA-10000659                       2
1086  OFF-PA-10001166                       2
1144  OFF-PA-10001970                       2
1156  OFF-PA-10002195                       2
1168  OFF-PA-10002377                       2
1198  OFF

In [112]:
non_unique_products.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, 18 to 1843
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ProductId               32 non-null     object
 1   UniqueProductNameCount  32 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 768.0+ bytes


In [113]:
# Get unique ProductId values from non_unique_products

non_unique_product_ids = non_unique_products['ProductId'].unique()  

# Filter the main DataFrame where ProductId is in non_unique_products

matching_rows = df[df['ProductId'].isin(non_unique_product_ids)]

matching_rows.shape[0]

337

In [114]:
df_id_name = df[df['ProductId'] == 'FUR-BO-10002213']

df_id_name = df_id_name.sort_values(by='SupplierPrice')

df_id_name.head(20).T

Unnamed: 0,2808,5079,8712,2471,2115,9649,5918,6535,9395,9583
PurchaseId,CA-2015-148635,US-2017-133312,US-2015-145422,US-2016-135923,CA-2015-164882,CA-2016-107104,US-2015-126977,CA-2014-128209,CA-2014-125997,CA-2017-116127
PurchaseDate,2015-07-25 00:00:00,2017-11-25 00:00:00,2015-12-03 00:00:00,2016-01-22 00:00:00,2015-10-31 00:00:00,2016-11-26 00:00:00,2015-09-17 00:00:00,2014-11-17 00:00:00,2014-09-20 00:00:00,2017-06-25 00:00:00
ShipDate,2015-07-27 00:00:00,2017-11-29 00:00:00,2015-12-07 00:00:00,2016-01-28 00:00:00,2015-10-31 00:00:00,2016-11-30 00:00:00,2015-09-23 00:00:00,2014-11-22 00:00:00,2014-09-23 00:00:00,2017-06-27 00:00:00
ShipMode,Second Class,Standard Class,Standard Class,Standard Class,Same Day,Standard Class,Standard Class,Standard Class,First Class,Second Class
CustomerId,MH-18025,BD-11500,PW-19240,CM-11935,SG-20080,MS-17365,PF-19120,GT-14710,MW-18220,SB-20185
CustomerName,Michelle Huthwaite,Bradley Drucker,Pierre Wener,Carlos Meador,Sandra Glassco,Maribeth Schnelling,Peter Fuller,Greg Tran,Mitch Webber,Sarah Brown
Segment,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Seattle,San Francisco,San Francisco,Fayetteville,Redlands,Los Angeles,New York City,Buffalo,New York City,New York City
State,Washington,California,California,North Carolina,California,California,New York,New York,New York,New York


#### They are 2 different products with different names and price difference shows it. SupplierPrices has changed within the time for both products too. 

In [115]:
unique_pairs = df[['ProductId', 'ProductName']].drop_duplicates()

unique_pairs.shape[0]

1894

### Despite SupplierPrice may vary, CatalogPrice is always same. So we will take this as reference to standardize ProductId values. 

In [116]:
# Group by ProductId and check unique CatalogPrice values for each group

catalog_price_counts = df.groupby('ProductId')['CatalogPrice'].nunique().reset_index()

# Filter to find ProductIds with more than 1 unique CatalogPrice

non_standard_product_ids = catalog_price_counts[catalog_price_counts['CatalogPrice'] > 1]

# Merge back with the original DataFrame to see the details of these ProductId-CatalogPrice pairs

non_standard_products = df[df['ProductId'].isin(non_standard_product_ids['ProductId'])]

non_standard_products = non_standard_products.sort_values(by='ProductId')

# Display the non-standard ProductId entries

non_standard_products.head(20).T

Unnamed: 0,5079,2471,9649,9583,9395,5918,2808,6535,8712,2115,8456,7940,1267,4559,1067,6742,8891,1881,2003,9254
PurchaseId,US-2017-133312,US-2016-135923,CA-2016-107104,CA-2017-116127,CA-2014-125997,US-2015-126977,CA-2015-148635,CA-2014-128209,US-2015-145422,CA-2015-164882,US-2017-118556,CA-2016-146325,US-2014-167738,CA-2014-110219,CA-2016-157686,US-2017-101784,CA-2016-162159,CA-2015-109512,CA-2017-163510,CA-2014-168368
PurchaseDate,2017-11-25 00:00:00,2016-01-22 00:00:00,2016-11-26 00:00:00,2017-06-25 00:00:00,2014-09-20 00:00:00,2015-09-17 00:00:00,2015-07-25 00:00:00,2014-11-17 00:00:00,2015-12-03 00:00:00,2015-10-31 00:00:00,2017-05-28 00:00:00,2016-12-14 00:00:00,2014-12-24 00:00:00,2014-05-05 00:00:00,2016-10-01 00:00:00,2017-07-06 00:00:00,2016-09-16 00:00:00,2015-03-05 00:00:00,2017-12-25 00:00:00,2014-02-11 00:00:00
ShipDate,2017-11-29 00:00:00,2016-01-28 00:00:00,2016-11-30 00:00:00,2017-06-27 00:00:00,2014-09-23 00:00:00,2015-09-23 00:00:00,2015-07-27 00:00:00,2014-11-22 00:00:00,2015-12-07 00:00:00,2015-10-31 00:00:00,2017-06-02 00:00:00,2016-12-17 00:00:00,2014-12-29 00:00:00,2014-05-08 00:00:00,2016-10-02 00:00:00,2017-07-11 00:00:00,2016-09-18 00:00:00,2015-03-05 00:00:00,2017-12-28 00:00:00,2014-02-15 00:00:00
ShipMode,Standard Class,Standard Class,Standard Class,Second Class,First Class,Standard Class,Second Class,Standard Class,Standard Class,Same Day,Second Class,First Class,Standard Class,First Class,First Class,Standard Class,First Class,Same Day,Second Class,Second Class
CustomerId,BD-11500,CM-11935,MS-17365,SB-20185,MW-18220,PF-19120,MH-18025,GT-14710,PW-19240,SG-20080,TH-21235,DS-13180,JC-16105,EB-13870,BD-11620,PO-18850,CR-12625,LF-17185,JW-15955,GA-14725
CustomerName,Bradley Drucker,Carlos Meador,Maribeth Schnelling,Sarah Brown,Mitch Webber,Peter Fuller,Michelle Huthwaite,Greg Tran,Pierre Wener,Sandra Glassco,Tiffany House,David Smith,Julie Creighton,Emily Burns,Brian Decherney,Patrick O'Brill,Corey Roper,Luke Foster,Joni Wasserman,Guy Armstrong
Segment,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Corporate,Corporate,Corporate,Consumer,Consumer,Consumer,Home Office,Consumer,Consumer,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,San Francisco,Fayetteville,Los Angeles,New York City,New York City,New York City,Seattle,Buffalo,San Francisco,Redlands,Chicago,San Diego,Los Angeles,San Antonio,San Francisco,Los Angeles,Columbus,New York City,Louisville,Columbia
State,California,North Carolina,California,New York,New York,New York,Washington,New York,California,California,Illinois,California,California,Texas,California,California,Georgia,New York,Kentucky,Missouri


In [117]:
non_standard_products.info()

<class 'pandas.core.frame.DataFrame'>
Index: 337 entries, 5079 to 8970
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PurchaseId     337 non-null    object        
 1   PurchaseDate   337 non-null    datetime64[ns]
 2   ShipDate       337 non-null    datetime64[ns]
 3   ShipMode       337 non-null    object        
 4   CustomerId     337 non-null    object        
 5   CustomerName   337 non-null    object        
 6   Segment        337 non-null    object        
 7   Country        337 non-null    object        
 8   City           337 non-null    object        
 9   State          337 non-null    object        
 10  PostalCode     337 non-null    object        
 11  Region         337 non-null    object        
 12  ProductId      337 non-null    object        
 13  Category       337 non-null    object        
 14  SubCategory    337 non-null    object        
 15  ProductName    337 non-n

In [118]:
non_standard_products.nunique()

PurchaseId       324
PurchaseDate     281
ShipDate         283
ShipMode           4
CustomerId       273
CustomerName     273
Segment            3
Country            1
City             136
State             39
PostalCode       176
Region             4
ProductId         32
Category           3
SubCategory       11
ProductName       64
Sales            265
Quantity          13
Discount           9
Profit           273
FirstName        175
LastName         236
CatalogPrice      62
SupplierPrice    143
dtype: int64

## 32 ProductId values has different CatalogPrices which effects 337 entries in total 
## We will create new ProductId values if a ProductId has different CatalogPrice.

In [119]:
# find the biggest value in productids columns last 6 digits 

def get_last_6_digits(product_id):
    return int(product_id[-6:])

# Extract the last 6 digits for each ProductId

df['Last6Digits'] = df['ProductId'].apply(get_last_6_digits)

df['Last6Digits'].max()

4999

#### 4999 is the max value for the last digits of ProductId values, we will start from 5000 to create new ones so there will not be duplicates as old values are smaller than 5000. 
#### Last 4 digits will start from 5000 and increase 1 by 1. 

In [120]:
# Function to generate new ProductId by incrementing last 4 digits

def generate_new_product_id(base_product_id, increment):
    prefix = base_product_id[:-4]
    new_suffix = f'{increment:04d}'
    return prefix + new_suffix

# Create a unique identifier for each combination of ProductId and CatalogPrice

df['UniqueID'] = df.groupby(['ProductId', 'CatalogPrice']).ngroup()

# Initialize the new ProductId column with original values

df['NewProductId'] = df['ProductId']

# Start incrementing from 5000

start_increment = 5000

# Update ProductId values with incremented suffix

df['NewProductId'] = df.apply(lambda row: generate_new_product_id(row['ProductId'], start_increment + row['UniqueID']), axis=1)

# Drop the unnecessary columns

df.drop(columns=['UniqueID'], inplace=True)

In [121]:
df_id_name = df[df['ProductId'] == 'FUR-BO-10002213']

df_id_name = df_id_name.sort_values(by='SupplierPrice')

df_id_name.head(20).T

Unnamed: 0,2808,5079,8712,2471,2115,9649,5918,6535,9395,9583
PurchaseId,CA-2015-148635,US-2017-133312,US-2015-145422,US-2016-135923,CA-2015-164882,CA-2016-107104,US-2015-126977,CA-2014-128209,CA-2014-125997,CA-2017-116127
PurchaseDate,2015-07-25 00:00:00,2017-11-25 00:00:00,2015-12-03 00:00:00,2016-01-22 00:00:00,2015-10-31 00:00:00,2016-11-26 00:00:00,2015-09-17 00:00:00,2014-11-17 00:00:00,2014-09-20 00:00:00,2017-06-25 00:00:00
ShipDate,2015-07-27 00:00:00,2017-11-29 00:00:00,2015-12-07 00:00:00,2016-01-28 00:00:00,2015-10-31 00:00:00,2016-11-30 00:00:00,2015-09-23 00:00:00,2014-11-22 00:00:00,2014-09-23 00:00:00,2017-06-27 00:00:00
ShipMode,Second Class,Standard Class,Standard Class,Standard Class,Same Day,Standard Class,Standard Class,Standard Class,First Class,Second Class
CustomerId,MH-18025,BD-11500,PW-19240,CM-11935,SG-20080,MS-17365,PF-19120,GT-14710,MW-18220,SB-20185
CustomerName,Michelle Huthwaite,Bradley Drucker,Pierre Wener,Carlos Meador,Sandra Glassco,Maribeth Schnelling,Peter Fuller,Greg Tran,Mitch Webber,Sarah Brown
Segment,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Seattle,San Francisco,San Francisco,Fayetteville,Redlands,Los Angeles,New York City,Buffalo,New York City,New York City
State,Washington,California,California,North Carolina,California,California,New York,New York,New York,New York


In [122]:
df.nunique()

PurchaseId       5009
PurchaseDate     1237
ShipDate         1334
ShipMode            4
CustomerId        793
CustomerName      793
Segment             3
Country             1
City              531
State              49
PostalCode        631
Region              4
ProductId        1862
Category            3
SubCategory        17
ProductName      1850
Sales            5825
Quantity           14
Discount           12
Profit           7287
FirstName         337
LastName          588
CatalogPrice     1095
SupplierPrice    2851
Last6Digits      1595
NewProductId     1894
dtype: int64

In [123]:
# Check the the max value in New ProductId column
# Extract the last 6 digits for each ProductId
df['New_Last6Digits'] = df['NewProductId'].apply(get_last_6_digits)

df['New_Last6Digits'].max()

6893

In [124]:
# Confirm changes by checking NewProductId and CatalogPrice Pairs

# Group by ProductId and check unique CatalogPrice values for each group

catalog_price_counts = df.groupby('NewProductId')['CatalogPrice'].nunique().reset_index()

# Filter to find ProductIds with more than 1 unique CatalogPrice

non_standard_product_ids = catalog_price_counts[catalog_price_counts['CatalogPrice'] > 1]

# Merge back with the original DataFrame to see the details of these ProductId-CatalogPrice pairs

non_standard_products = df[df['NewProductId'].isin(non_standard_product_ids['NewProductId'])]

non_standard_products = non_standard_products.sort_values(by='NewProductId')

non_standard_products.head(20)

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,CatalogPrice,SupplierPrice,Last6Digits,NewProductId,New_Last6Digits


In [125]:
df.drop(columns=['New_Last6Digits', 'Last6Digits'], inplace=True)

#### 32 New ProductId values have been created which shared same ID but had different CatalogPrices and ProductNames. 
#### We have 1894 Unique ProductId, it was 32 less before 
#### Some entries have different ProductId and Pricing but have the same name but we dont have chance to check it or correct it. 
#### So we will not edit Productnames additionally. 

In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PurchaseId     9994 non-null   object        
 1   PurchaseDate   9994 non-null   datetime64[ns]
 2   ShipDate       9994 non-null   datetime64[ns]
 3   ShipMode       9994 non-null   object        
 4   CustomerId     9994 non-null   object        
 5   CustomerName   9994 non-null   object        
 6   Segment        9994 non-null   object        
 7   Country        9994 non-null   object        
 8   City           9994 non-null   object        
 9   State          9994 non-null   object        
 10  PostalCode     9994 non-null   object        
 11  Region         9994 non-null   object        
 12  ProductId      9994 non-null   object        
 13  Category       9994 non-null   object        
 14  SubCategory    9994 non-null   object        
 15  ProductName    9994 n

In [127]:
df.isna().sum()

PurchaseId       0
PurchaseDate     0
ShipDate         0
ShipMode         0
CustomerId       0
CustomerName     0
Segment          0
Country          0
City             0
State            0
PostalCode       0
Region           0
ProductId        0
Category         0
SubCategory      0
ProductName      0
Sales            0
Quantity         0
Discount         0
Profit           0
FirstName        0
LastName         0
CatalogPrice     0
SupplierPrice    0
NewProductId     0
dtype: int64

## 20: Sales, Quantity, Discount, Profit Columns  <a id="20"></a>

In [128]:
df['Sales'] = df['Sales'].round(2)

df['Profit'] = df['Profit'].round(2)

In [129]:
df.describe()

Unnamed: 0,PurchaseDate,ShipDate,Sales,Quantity,Discount,Profit,CatalogPrice,SupplierPrice
count,9994,9994,9994.0,9994.0,9994.0,9994.0,9994.0,9994.0
mean,2016-04-30 00:07:12.259355648,2016-05-03 23:06:58.571142912,229.85798,3.789574,0.156203,28.656848,75.571127,67.771666
min,2014-01-03 00:00:00,2014-01-07 00:00:00,0.44,1.0,0.0,-6599.98,0.99,0.54
25%,2015-05-23 00:00:00,2015-05-27 00:00:00,17.28,2.0,0.0,1.73,6.48,4.67
50%,2016-06-26 00:00:00,2016-06-29 00:00:00,54.49,3.0,0.2,8.665,19.98,15.67
75%,2017-05-14 00:00:00,2017-05-18 00:00:00,209.94,5.0,0.2,29.36,76.98,67.29
max,2017-12-30 00:00:00,2018-01-05 00:00:00,22638.48,14.0,0.8,8399.98,7546.16,7848.01
std,,,623.245104,2.22511,0.206452,234.260149,188.961461,182.4282


In [130]:
# view the 20 highest values in the Payment column

df.nlargest(20, 'Sales').T

Unnamed: 0,2697,6826,8153,2623,4190,9039,4098,4277,8488,6425,2505,165,683,6626,509,6520,7666,6340,8858,5884
PurchaseId,CA-2014-145317,CA-2016-118689,CA-2017-140151,CA-2017-127180,CA-2017-166709,CA-2016-117121,CA-2014-116904,US-2016-107440,CA-2016-158841,CA-2016-143714,CA-2014-143917,CA-2014-139892,US-2017-168116,CA-2014-145541,CA-2015-145352,CA-2017-138289,US-2016-140158,CA-2017-143112,CA-2017-135909,CA-2016-136301
PurchaseDate,2014-03-18 00:00:00,2016-10-02 00:00:00,2017-03-23 00:00:00,2017-10-22 00:00:00,2017-11-17 00:00:00,2016-12-17 00:00:00,2014-09-23 00:00:00,2016-04-16 00:00:00,2016-02-02 00:00:00,2016-05-23 00:00:00,2014-07-25 00:00:00,2014-09-08 00:00:00,2017-11-04 00:00:00,2014-12-14 00:00:00,2015-03-16 00:00:00,2017-01-16 00:00:00,2016-10-04 00:00:00,2017-10-05 00:00:00,2017-10-13 00:00:00,2016-03-13 00:00:00
ShipDate,2014-03-23 00:00:00,2016-10-09 00:00:00,2017-03-25 00:00:00,2017-10-24 00:00:00,2017-11-22 00:00:00,2016-12-21 00:00:00,2014-09-28 00:00:00,2016-04-20 00:00:00,2016-02-04 00:00:00,2016-05-27 00:00:00,2014-07-27 00:00:00,2014-09-12 00:00:00,2017-11-04 00:00:00,2014-12-21 00:00:00,2015-03-22 00:00:00,2017-01-18 00:00:00,2016-10-08 00:00:00,2017-10-09 00:00:00,2017-10-20 00:00:00,2016-03-15 00:00:00
ShipMode,Standard Class,Standard Class,First Class,First Class,Standard Class,Standard Class,Standard Class,Standard Class,Second Class,Standard Class,Second Class,Standard Class,Same Day,Standard Class,Standard Class,Second Class,Standard Class,Standard Class,Standard Class,Second Class
CustomerId,SM-20320,TC-20980,RB-19360,TA-21385,HL-15040,AB-10105,SC-20095,BS-11365,SE-20110,CC-12370,KL-16645,BM-11140,GT-14635,TB-21400,CM-12385,AR-10540,DR-12940,TS-21370,JW-15220,EH-13765
CustomerName,Sean Miller,Tamara Chand,Raymond Buch,Tom Ashbrook,Hunter Lopez,Adrian Barton,Sanjit Chand,Bill Shonely,Sanjit Engle,Christopher Conant,Ken Lonsdale,Becky Martin,Grant Thornton,Tom Boeckenhauer,Christopher Martinez,Andy Reiter,Daniel Raglin,Todd Sumrall,Jane Waco,Edward Hooks
Segment,Home Office,Corporate,Consumer,Home Office,Consumer,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Home Office,Corporate,Corporate,Corporate
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Jacksonville,Lafayette,Seattle,New York City,Newark,Detroit,Minneapolis,Lakewood,Arlington,Philadelphia,San Francisco,San Antonio,Burlington,New York City,Atlanta,Jackson,Providence,New York City,Sacramento,San Francisco
State,Florida,Indiana,Washington,New York,Delaware,Michigan,Minnesota,New Jersey,Virginia,Pennsylvania,California,Texas,North Carolina,New York,Georgia,Michigan,Rhode Island,New York,California,California


In [131]:
# view the 20 lowest values in the Payment column

df.nsmallest(20, 'Sales').T

Unnamed: 0,4101,9292,8658,4711,2106,7548,8033,2761,8024,976,1332,4933,987,2605,1112,1685,4874,6089,3325,5206
PurchaseId,US-2017-102288,CA-2017-124114,CA-2016-168361,CA-2014-112403,US-2014-152723,CA-2014-103492,CA-2015-119690,CA-2017-126536,CA-2014-129189,US-2017-100209,CA-2014-122567,CA-2015-106978,CA-2015-146829,CA-2014-112718,US-2016-110156,CA-2017-149489,CA-2017-164042,US-2017-162068,CA-2014-165309,CA-2015-102015
PurchaseDate,2017-06-19 00:00:00,2017-03-02 00:00:00,2016-06-21 00:00:00,2014-03-31 00:00:00,2014-09-26 00:00:00,2014-10-10 00:00:00,2015-06-25 00:00:00,2017-10-12 00:00:00,2014-07-21 00:00:00,2017-07-09 00:00:00,2014-02-16 00:00:00,2015-09-28 00:00:00,2015-03-10 00:00:00,2014-12-16 00:00:00,2016-11-19 00:00:00,2017-04-24 00:00:00,2017-05-23 00:00:00,2017-12-28 00:00:00,2014-11-11 00:00:00,2015-09-12 00:00:00
ShipDate,2017-06-23 00:00:00,2017-03-02 00:00:00,2016-06-25 00:00:00,2014-03-31 00:00:00,2014-09-26 00:00:00,2014-10-15 00:00:00,2015-06-28 00:00:00,2017-10-14 00:00:00,2014-07-25 00:00:00,2017-07-15 00:00:00,2014-02-21 00:00:00,2015-10-04 00:00:00,2015-03-10 00:00:00,2014-12-21 00:00:00,2016-11-24 00:00:00,2017-04-27 00:00:00,2017-05-27 00:00:00,2017-12-31 00:00:00,2014-11-15 00:00:00,2015-09-18 00:00:00
ShipMode,Standard Class,Same Day,Standard Class,Same Day,Same Day,Standard Class,First Class,First Class,Standard Class,Standard Class,Standard Class,Standard Class,Same Day,Standard Class,Standard Class,First Class,Standard Class,Second Class,Standard Class,Standard Class
CustomerId,ZC-21910,RS-19765,KB-16600,JO-15280,HG-14965,CM-12715,MV-17485,NK-18490,HM-14860,TD-20995,MN-17935,ZC-21910,TS-21340,KN-16450,EH-13945,DK-12835,KL-16645,PC-18745,KD-16270,TM-21010
CustomerName,Zuschuss Carroll,Roland Schwarz,Ken Brennan,Jas O'Carroll,Henry Goldwyn,Craig Molinari,Mark Van Huff,Neil Knudson,Harry Marie,Tamara Dahlen,Michael Nguyen,Zuschuss Carroll,Toby Swindell,Kean Nguyen,Eric Hoffmann,Damala Kotsonis,Ken Lonsdale,Pamela Coakley,Karen Daniels,Tamara Manning
Segment,Consumer,Corporate,Corporate,Consumer,Corporate,Corporate,Consumer,Home Office,Corporate,Consumer,Consumer,Consumer,Consumer,Corporate,Consumer,Corporate,Consumer,Corporate,Consumer,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Houston,Waco,Chicago,Philadelphia,Mesquite,Huntsville,Houston,San Francisco,Dallas,Portland,Dallas,Aurora,Houston,Jacksonville,Houston,Philadelphia,Houston,Loveland,Houston,Auburn
State,Texas,Texas,Illinois,Pennsylvania,Texas,Texas,Texas,California,Texas,Oregon,Texas,Colorado,Texas,Florida,Texas,Pennsylvania,Texas,Colorado,Texas,New York


In [132]:
# view the 20 highest values in the Discount column

df.nlargest(20, 'Discount').T

Unnamed: 0,14,15,75,101,169,174,176,203,261,280,378,380,393,469,521,536,549,621,658,662
PurchaseId,US-2015-118983,US-2015-118983,US-2017-118038,CA-2016-158568,CA-2014-139892,US-2014-100853,US-2017-152366,US-2017-116701,US-2017-155299,US-2015-161991,CA-2015-130792,CA-2015-130792,US-2014-134971,US-2016-100419,CA-2015-157812,US-2017-122637,CA-2015-113173,US-2014-111171,US-2016-156097,CA-2015-146563
PurchaseDate,2015-11-22 00:00:00,2015-11-22 00:00:00,2017-12-09 00:00:00,2016-08-29 00:00:00,2014-09-08 00:00:00,2014-09-14 00:00:00,2017-04-21 00:00:00,2017-12-17 00:00:00,2017-06-08 00:00:00,2015-09-26 00:00:00,2015-04-28 00:00:00,2015-04-28 00:00:00,2014-06-07 00:00:00,2016-12-16 00:00:00,2015-03-22 00:00:00,2017-09-03 00:00:00,2015-11-15 00:00:00,2014-12-26 00:00:00,2016-09-19 00:00:00,2015-08-24 00:00:00
ShipDate,2015-11-26 00:00:00,2015-11-26 00:00:00,2017-12-11 00:00:00,2016-09-02 00:00:00,2014-09-12 00:00:00,2014-09-19 00:00:00,2017-04-25 00:00:00,2017-12-21 00:00:00,2017-06-12 00:00:00,2015-09-28 00:00:00,2015-05-05 00:00:00,2015-05-05 00:00:00,2014-06-10 00:00:00,2016-12-20 00:00:00,2015-03-26 00:00:00,2017-09-08 00:00:00,2015-11-17 00:00:00,2014-12-31 00:00:00,2016-09-19 00:00:00,2015-08-28 00:00:00
ShipMode,Standard Class,Standard Class,First Class,Standard Class,Standard Class,Standard Class,Second Class,Second Class,Standard Class,Second Class,Standard Class,Standard Class,Second Class,Second Class,Standard Class,Second Class,Second Class,Standard Class,Same Day,Standard Class
CustomerId,HP-14815,HP-14815,KB-16600,RB-19465,BM-11140,JB-15400,SJ-20500,LC-17140,DL-13600,SC-20725,RA-19915,RA-19915,BP-11095,CC-12670,DB-13210,EP-13915,DK-13225,CA-12265,EH-14125,CB-12025
CustomerName,Harold Pawlan,Harold Pawlan,Ken Brennan,Rick Bensley,Becky Martin,Jennifer Braxton,Shirley Jackson,Logan Currie,Dorris Liebe,Steven Cartwright,Russell Applegate,Russell Applegate,Bart Pistole,Craig Carreira,Dean Braden,Emily Phan,Dean Katz,Christina Anderson,Eugene Hildebrand,Cassandra Brandow
Segment,Home Office,Home Office,Corporate,Home Office,Consumer,Corporate,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Corporate,Consumer,Consumer,Consumer,Corporate,Consumer,Home Office,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Fort Worth,Fort Worth,Houston,Chicago,San Antonio,Chicago,Houston,Dallas,Pasadena,Houston,Houston,Houston,Peoria,Chicago,Houston,Chicago,Chicago,Chicago,Aurora,Arlington
State,Texas,Texas,Texas,Illinois,Texas,Illinois,Texas,Texas,Texas,Texas,Texas,Texas,Illinois,Illinois,Texas,Illinois,Illinois,Illinois,Illinois,Texas


In [133]:
# view the 20 highest values in the Profit column

df.nlargest(20, 'Profit').T

Unnamed: 0,6826,8153,4190,9039,4098,2623,509,8488,7666,6520,1085,4277,8990,6626,8204,318,7683,8858,7914,7818
PurchaseId,CA-2016-118689,CA-2017-140151,CA-2017-166709,CA-2016-117121,CA-2014-116904,CA-2017-127180,CA-2015-145352,CA-2016-158841,US-2016-140158,CA-2017-138289,US-2016-143819,US-2016-107440,US-2015-128587,CA-2014-145541,CA-2015-114811,CA-2014-164973,CA-2015-120782,CA-2017-135909,CA-2017-165323,CA-2016-138478
PurchaseDate,2016-10-02 00:00:00,2017-03-23 00:00:00,2017-11-17 00:00:00,2016-12-17 00:00:00,2014-09-23 00:00:00,2017-10-22 00:00:00,2015-03-16 00:00:00,2016-02-02 00:00:00,2016-10-04 00:00:00,2017-01-16 00:00:00,2016-03-01 00:00:00,2016-04-16 00:00:00,2015-12-24 00:00:00,2014-12-14 00:00:00,2015-11-08 00:00:00,2014-11-04 00:00:00,2015-04-28 00:00:00,2017-10-13 00:00:00,2017-06-17 00:00:00,2016-10-21 00:00:00
ShipDate,2016-10-09 00:00:00,2017-03-25 00:00:00,2017-11-22 00:00:00,2016-12-21 00:00:00,2014-09-28 00:00:00,2017-10-24 00:00:00,2015-03-22 00:00:00,2016-02-04 00:00:00,2016-10-08 00:00:00,2017-01-18 00:00:00,2016-03-05 00:00:00,2016-04-20 00:00:00,2015-12-30 00:00:00,2014-12-21 00:00:00,2015-11-08 00:00:00,2014-11-09 00:00:00,2015-05-01 00:00:00,2017-10-20 00:00:00,2017-06-21 00:00:00,2016-10-26 00:00:00
ShipMode,Standard Class,First Class,Standard Class,Standard Class,Standard Class,First Class,Standard Class,Second Class,Standard Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Same Day,Standard Class,First Class,Standard Class,Standard Class,Second Class
CustomerId,TC-20980,RB-19360,HL-15040,AB-10105,SC-20095,TA-21385,CM-12385,SE-20110,DR-12940,AR-10540,KD-16270,BS-11365,HM-14860,TB-21400,KD-16495,NM-18445,SD-20485,JW-15220,SR-20740,DP-13390
CustomerName,Tamara Chand,Raymond Buch,Hunter Lopez,Adrian Barton,Sanjit Chand,Tom Ashbrook,Christopher Martinez,Sanjit Engle,Daniel Raglin,Andy Reiter,Karen Daniels,Bill Shonely,Harry Marie,Tom Boeckenhauer,Keith Dawkins,Nathan Mautz,Shirley Daniels,Jane Waco,Steven Roelle,Dennis Pardue
Segment,Corporate,Consumer,Consumer,Consumer,Consumer,Home Office,Consumer,Consumer,Home Office,Consumer,Consumer,Corporate,Corporate,Consumer,Corporate,Home Office,Home Office,Corporate,Home Office,Home Office
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Lafayette,Seattle,Newark,Detroit,Minneapolis,New York City,Atlanta,Arlington,Providence,Jackson,Yonkers,Lakewood,Springfield,New York City,New York City,New York City,Midland,Sacramento,New York City,North Las Vegas
State,Indiana,Washington,Delaware,Michigan,Minnesota,New York,Georgia,Virginia,Rhode Island,Michigan,New York,New Jersey,Missouri,New York,New York,New York,Michigan,California,New York,Nevada


In [134]:
# view the 20 smallest values in the Profit column

df.nsmallest(20, 'Profit').T

Unnamed: 0,7772,683,9774,3011,4991,3151,5310,9639,1199,2697,27,3324,165,2928,7898,4820,2846,8640,4355,1803
PurchaseId,CA-2016-108196,US-2017-168116,CA-2014-169019,CA-2017-134845,US-2017-122714,CA-2015-147830,CA-2017-131254,CA-2015-116638,CA-2016-130946,CA-2014-145317,US-2015-150630,CA-2014-165309,CA-2014-139892,US-2017-120390,CA-2017-128363,CA-2015-140025,CA-2017-152093,US-2017-148551,CA-2015-155600,CA-2017-158379
PurchaseDate,2016-11-25 00:00:00,2017-11-04 00:00:00,2014-07-26 00:00:00,2017-04-17 00:00:00,2017-12-07 00:00:00,2015-12-15 00:00:00,2017-11-19 00:00:00,2015-01-28 00:00:00,2016-04-08 00:00:00,2014-03-18 00:00:00,2015-09-17 00:00:00,2014-11-11 00:00:00,2014-09-08 00:00:00,2017-10-19 00:00:00,2017-08-13 00:00:00,2015-04-07 00:00:00,2017-09-10 00:00:00,2017-01-12 00:00:00,2015-12-04 00:00:00,2017-09-22 00:00:00
ShipDate,2016-12-02 00:00:00,2017-11-04 00:00:00,2014-07-30 00:00:00,2017-04-23 00:00:00,2017-12-13 00:00:00,2015-12-18 00:00:00,2017-11-21 00:00:00,2015-01-31 00:00:00,2016-04-12 00:00:00,2014-03-23 00:00:00,2015-09-21 00:00:00,2014-11-15 00:00:00,2014-09-12 00:00:00,2017-10-26 00:00:00,2017-08-18 00:00:00,2015-04-11 00:00:00,2017-09-15 00:00:00,2017-01-16 00:00:00,2015-12-07 00:00:00,2017-09-26 00:00:00
ShipMode,Standard Class,Same Day,Standard Class,Standard Class,Standard Class,First Class,First Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Standard Class,Second Class,Second Class
CustomerId,CS-12505,GT-14635,LF-17185,SR-20425,HG-14965,NF-18385,NC-18415,JH-15985,ZC-21910,SM-20320,TB-21520,KD-16270,BM-11140,TH-21550,DC-12850,PF-19120,SN-20560,DB-13120,RO-19780,JA-15970
CustomerName,Cindy Stewart,Grant Thornton,Luke Foster,Sharelle Roach,Henry Goldwyn,Natalie Fritzler,Nathan Cano,Joseph Holt,Zuschuss Carroll,Sean Miller,Tracy Blumstein,Karen Daniels,Becky Martin,Tracy Hopkins,Dan Campbell,Peter Fuller,Skye Norling,David Bremer,Rose O'Brian,Joseph Airdo
Segment,Consumer,Corporate,Consumer,Home Office,Corporate,Consumer,Consumer,Consumer,Consumer,Home Office,Consumer,Consumer,Consumer,Home Office,Consumer,Consumer,Home Office,Corporate,Consumer,Consumer
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,Lancaster,Burlington,San Antonio,Louisville,Chicago,Newark,Houston,Concord,Houston,Jacksonville,Philadelphia,Houston,San Antonio,Burlington,Memphis,San Antonio,Chicago,Dallas,Clarksville,Philadelphia
State,Ohio,North Carolina,Texas,Colorado,Illinois,Ohio,Texas,North Carolina,Texas,Florida,Pennsylvania,Texas,Texas,North Carolina,Tennessee,Texas,Illinois,Texas,Tennessee,Pennsylvania


### The highest and lowest values in the Payment, Quantity, Discount, and Profit columns have been reviewed.
### No inconsistencies were detected in these columns

In [135]:
# Create the new DataFrame with SupplierPrice higher than CatalogPrice 

minus_sale_df = df[df['SupplierPrice'] > df['CatalogPrice']]
minus_sale_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1871 entries, 3 to 9962
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PurchaseId     1871 non-null   object        
 1   PurchaseDate   1871 non-null   datetime64[ns]
 2   ShipDate       1871 non-null   datetime64[ns]
 3   ShipMode       1871 non-null   object        
 4   CustomerId     1871 non-null   object        
 5   CustomerName   1871 non-null   object        
 6   Segment        1871 non-null   object        
 7   Country        1871 non-null   object        
 8   City           1871 non-null   object        
 9   State          1871 non-null   object        
 10  PostalCode     1871 non-null   object        
 11  Region         1871 non-null   object        
 12  ProductId      1871 non-null   object        
 13  Category       1871 non-null   object        
 14  SubCategory    1871 non-null   object        
 15  ProductName    1871 non-nu

In [136]:
minus_sale_df.describe()

Unnamed: 0,PurchaseDate,ShipDate,Sales,Quantity,Discount,Profit,CatalogPrice,SupplierPrice
count,1871,1871,1871.0,1871.0,1871.0,1871.0,1871.0,1871.0
mean,2016-04-27 03:02:24.307856896,2016-05-01 02:22:23.025120256,250.511513,3.762694,0.480887,-83.448231,116.87303,138.16922
min,2014-01-04 00:00:00,2014-01-08 00:00:00,0.44,1.0,0.1,-6599.98,1.26,1.46
25%,2015-05-10 00:00:00,2015-05-13 12:00:00,12.5,2.0,0.2,-58.66,11.49,14.38
50%,2016-06-12 00:00:00,2016-06-17 00:00:00,71.09,3.0,0.4,-18.09,44.43,54.34
75%,2017-05-05 12:00:00,2017-05-10 00:00:00,284.92,5.0,0.7,-6.265,135.65,157.29
max,2017-12-30 00:00:00,2018-01-03 00:00:00,22638.48,14.0,0.8,-0.09,7546.16,7848.01
std,,,715.067234,2.141347,0.23508,284.423355,279.409746,330.443249


In [137]:
minus_sale_df.head()

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,CatalogPrice,SupplierPrice,NewProductId
3,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford Cr4500 Series Slim Rectangular Table,957.58,5,0.45,-383.03,Sean,O'Donnell,348.21,424.82,FUR-TA-10005327
14,US-2015-118983,2015-11-22,2015-11-26,Standard Class,HP-14815,Harold Pawlan,Home Office,United States,Fort Worth,Texas,76106,Central,OFF-AP-10002311,Office Supplies,Appliances,Holmes Replacement Filter For Hepa Air Cleaner...,68.81,5,0.8,-123.86,Harold,Pawlan,68.81,93.58,OFF-AP-10005430
15,US-2015-118983,2015-11-22,2015-11-26,Standard Class,HP-14815,Harold Pawlan,Home Office,United States,Fort Worth,Texas,76106,Central,OFF-BI-10000756,Office Supplies,Binders,Storex Duratech Recycled Plastic Frosted Binders,2.54,3,0.8,-3.82,Harold,Pawlan,4.24,5.51,OFF-BI-10005673
23,US-2017-156909,2017-07-16,2017-07-18,Second Class,SF-20065,Sandra Flanagan,Consumer,United States,Philadelphia,Pennsylvania,19140,East,FUR-CH-10002774,Furniture,Chairs,"Global Deluxe Stacking Chair, Gray",71.37,2,0.3,-1.02,Sandra,Flanagan,50.98,51.49,FUR-CH-10005097
27,US-2015-150630,2015-09-17,2015-09-21,Standard Class,TB-21520,Tracy Blumstein,Consumer,United States,Philadelphia,Pennsylvania,19140,East,FUR-BO-10004834,Furniture,Bookcases,"Riverside Palais Royal Lawyers Bookcase, Royal...",3083.43,7,0.5,-1665.05,Tracy,Blumstein,880.98,1118.84,FUR-BO-10005049


### there are 1871 sales  in DF which causes loss, will be reviewed in next steps to find the reason of it 

## 21: Check if columns create unique pairs <a id="21"></a>

In [138]:
df.rename(columns={'ProductId': 'OldProductId'}, inplace=True)


df.rename(columns={'NewProductId': 'ProductId'}, inplace=True)

In [139]:
df.columns

Index(['PurchaseId', 'PurchaseDate', 'ShipDate', 'ShipMode', 'CustomerId',
       'CustomerName', 'Segment', 'Country', 'City', 'State', 'PostalCode',
       'Region', 'OldProductId', 'Category', 'SubCategory', 'ProductName',
       'Sales', 'Quantity', 'Discount', 'Profit', 'FirstName', 'LastName',
       'CatalogPrice', 'SupplierPrice', 'ProductId'],
      dtype='object')

In [140]:
# Identify rows where the combination of PurchaseId and ProductId is duplicated

duplicates = df[df.duplicated(subset=['PurchaseId', 'ProductId'], keep=False)]

duplicates.head(30).T

Unnamed: 0,350,352,430,431,1300,1301,3183,3184,3405,3406,6498,6500,7881,7882,9168,9169
PurchaseId,CA-2016-129714,CA-2016-129714,US-2016-123750,US-2016-123750,CA-2016-137043,CA-2016-137043,CA-2017-152912,CA-2017-152912,US-2014-150119,US-2014-150119,CA-2015-103135,CA-2015-103135,CA-2017-118017,CA-2017-118017,CA-2016-140571,CA-2016-140571
PurchaseDate,2016-09-01 00:00:00,2016-09-01 00:00:00,2016-04-15 00:00:00,2016-04-15 00:00:00,2016-12-23 00:00:00,2016-12-23 00:00:00,2017-11-09 00:00:00,2017-11-09 00:00:00,2014-04-23 00:00:00,2014-04-23 00:00:00,2015-07-24 00:00:00,2015-07-24 00:00:00,2017-12-03 00:00:00,2017-12-03 00:00:00,2016-03-15 00:00:00,2016-03-15 00:00:00
ShipDate,2016-09-03 00:00:00,2016-09-03 00:00:00,2016-04-21 00:00:00,2016-04-21 00:00:00,2016-12-25 00:00:00,2016-12-25 00:00:00,2017-11-12 00:00:00,2017-11-12 00:00:00,2014-04-27 00:00:00,2014-04-27 00:00:00,2015-07-28 00:00:00,2015-07-28 00:00:00,2017-12-06 00:00:00,2017-12-06 00:00:00,2016-03-19 00:00:00,2016-03-19 00:00:00
ShipMode,First Class,First Class,Standard Class,Standard Class,Second Class,Second Class,Second Class,Second Class,Standard Class,Standard Class,Standard Class,Standard Class,Second Class,Second Class,Standard Class,Standard Class
CustomerId,AB-10060,AB-10060,RB-19795,RB-19795,LC-17140,LC-17140,BM-11650,BM-11650,LB-16795,LB-16795,SS-20515,SS-20515,LC-16870,LC-16870,SJ-20125,SJ-20125
CustomerName,Adam Bellavance,Adam Bellavance,Ross Baird,Ross Baird,Logan Currie,Logan Currie,Brian Moss,Brian Moss,Laurel Beltran,Laurel Beltran,Shirley Schmidt,Shirley Schmidt,Lena Cacioppo,Lena Cacioppo,Sanjit Jacobs,Sanjit Jacobs
Segment,Home Office,Home Office,Home Office,Home Office,Consumer,Consumer,Corporate,Corporate,Home Office,Home Office,Home Office,Home Office,Consumer,Consumer,Home Office,Home Office
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
City,New York City,New York City,Gastonia,Gastonia,Springfield,Springfield,Columbia,Columbia,Columbus,Columbus,Louisville,Louisville,Thornton,Thornton,Jackson,Jackson
State,New York,New York,North Carolina,North Carolina,Virginia,Virginia,Maryland,Maryland,Ohio,Ohio,Kentucky,Kentucky,Colorado,Colorado,Mississippi,Mississippi


In [141]:
duplicates.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 350 to 9169
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PurchaseId     16 non-null     object        
 1   PurchaseDate   16 non-null     datetime64[ns]
 2   ShipDate       16 non-null     datetime64[ns]
 3   ShipMode       16 non-null     object        
 4   CustomerId     16 non-null     object        
 5   CustomerName   16 non-null     object        
 6   Segment        16 non-null     object        
 7   Country        16 non-null     object        
 8   City           16 non-null     object        
 9   State          16 non-null     object        
 10  PostalCode     16 non-null     object        
 11  Region         16 non-null     object        
 12  OldProductId   16 non-null     object        
 13  Category       16 non-null     object        
 14  SubCategory    16 non-null     object        
 15  ProductName    16 non-null

In [142]:
df[(df['PurchaseId'] == 'CA-2016-137043') & (df['ProductId'] == 'FUR-FU-10005268')]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,OldProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,CatalogPrice,SupplierPrice,ProductId
1300,CA-2016-137043,2016-12-23,2016-12-25,Second Class,LC-17140,Logan Currie,Consumer,United States,Springfield,Virginia,22153,South,FUR-FU-10003664,Furniture,Furnishings,"Electrix Architect'S Clamp-On Swing Arm Lamp, ...",572.76,6,0.0,166.1,Logan,Currie,95.46,67.78,FUR-FU-10005268
1301,CA-2016-137043,2016-12-23,2016-12-25,Second Class,LC-17140,Logan Currie,Consumer,United States,Springfield,Virginia,22153,South,FUR-FU-10003664,Furniture,Furnishings,"Electrix Architect'S Clamp-On Swing Arm Lamp, ...",286.38,3,0.0,83.05,Logan,Currie,95.46,67.78,FUR-FU-10005268


#### we have 16 duplicated entries for PurchaseId - ProductId pairs in df
#### Duplicate pairs have same PurchaseId, ProductId, PurchaseDate, Discount and almost all other columns except Payment and Quantity columns.
#### They may have been calculated or added to Order after main order but these values share same PurchaseId-ProductId pair, which may cause error to use PurchaseId-ProductId as composite PK
#### We will sum the Payment and Quantity columns values in 1 row and delete the other to avoid duplicated PK values for PruchaseDetail table 

In [143]:
# List of index pairs to process
index_pairs = [
    (350, 352),
    (430, 431),
    (1300, 1301),
    (3183, 3184),
    (3405, 3406),
    (6498, 6500),
    (7881, 7882),
    (9168, 9169)
]

# Ensure indices are in descending order for proper processing
index_pairs = sorted(index_pairs, reverse=True)

# Process each pair of indices
for idx1, idx2 in index_pairs:
    if idx1 in df.index and idx2 in df.index:
        # Extract values
        row_1 = df.loc[idx1]
        row_2 = df.loc[idx2]
        
        # Update the second row with the values from the first row
        df.at[idx2, 'Quantity'] += row_1['Quantity']
        df.at[idx2, 'Sales'] += row_1['Sales']
        
        # Drop the first row
        df = df.drop(idx1)

In [144]:
df[(df['PurchaseId'] == 'CA-2016-137043') & (df['ProductId'] == 'FUR-FU-10005268')]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,OldProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,CatalogPrice,SupplierPrice,ProductId
1301,CA-2016-137043,2016-12-23,2016-12-25,Second Class,LC-17140,Logan Currie,Consumer,United States,Springfield,Virginia,22153,South,FUR-FU-10003664,Furniture,Furnishings,"Electrix Architect'S Clamp-On Swing Arm Lamp, ...",859.14,9,0.0,83.05,Logan,Currie,95.46,67.78,FUR-FU-10005268


In [145]:
# Confirm changes 

# Identify rows where the combination of PurchaseId and ProductId is duplicated

duplicates = df[df.duplicated(subset=['PurchaseId', 'ProductId'], keep=False)]

duplicates.head()

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,OldProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,CatalogPrice,SupplierPrice,ProductId


## 22: Create Data Frames from SQLite tables <a id="22"></a>

In [146]:
# PurchaseDetail DataFrame

df_PurchaseDetail = df[['PurchaseId', 'ProductId', 'Quantity', 'Sales', 'Discount', 'Profit', 'ShipDate', 'ShipMode']]
df_PurchaseDetail = df_PurchaseDetail.sort_values(by=['PurchaseId', 'ProductId'], ascending=[False, True])

# df_Purchase DataFrame - Group by PurchaseId
df_Purchase = df.groupby('PurchaseId').first().reset_index()[['PurchaseId', 'CustomerId', 'PurchaseDate']]

# Sort by PurchaseDate (latest first) and CustomerId
df_Purchase = df_Purchase.sort_values(by=['PurchaseDate', 'CustomerId'], ascending=[False, True])

# df_Customer DataFrame - Group by CustomerId

df_Customer = df.groupby('CustomerId').first().reset_index()[['CustomerId', 'FirstName', 'LastName', 'Segment']]
df_Customer = df_Customer.sort_values(by=['CustomerId', 'LastName', 'FirstName'])

# Drop duplicates to get each unique pair of CustomerId and PostalCode

df_CustomerLocation = df.drop_duplicates(subset=['CustomerId', 'PostalCode'], keep='first')[['CustomerId', 'PostalCode', 'City', 'Region', 'State', 'Country']]
df_CustomerLocation = df_CustomerLocation.sort_values(by=['CustomerId', 'PostalCode'])

# Drop duplicates to get the unique pair of ProductId and SubCategory

df_Category = df.drop_duplicates(subset=['ProductId', 'SubCategory'], keep='first')[['ProductId', 'Category', 'SubCategory']]
df_Category = df_Category.sort_values(by=['ProductId', 'SubCategory'])

# df_Product DataFrame - Use most recent values for CatalogPrice and SupplierPrice
# Group by ProductId and PurchaseDate to get the latest rows based on PurchaseDate

df_Product = df.groupby('ProductId').apply(lambda x: x.sort_values('PurchaseDate').iloc[-1]).reset_index(drop=True)[['ProductId', 'ProductName', 'CatalogPrice', 'SupplierPrice']]
df_Product = df_Product.sort_values(by=['ProductId', 'ProductName'])

In [147]:
df_PurchaseDetail.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9986 entries, 5929 to 2717
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   PurchaseId  9986 non-null   object        
 1   ProductId   9986 non-null   object        
 2   Quantity    9986 non-null   int32         
 3   Sales       9986 non-null   float64       
 4   Discount    9986 non-null   float64       
 5   Profit      9986 non-null   float64       
 6   ShipDate    9986 non-null   datetime64[ns]
 7   ShipMode    9986 non-null   object        
dtypes: datetime64[ns](1), float64(3), int32(1), object(3)
memory usage: 663.1+ KB


In [148]:
df_Purchase.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5009 entries, 3328 to 56
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   PurchaseId    5009 non-null   object        
 1   CustomerId    5009 non-null   object        
 2   PurchaseDate  5009 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 156.5+ KB


In [149]:
df_Customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 793 entries, 0 to 792
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   CustomerId  793 non-null    object
 1   FirstName   793 non-null    object
 2   LastName    793 non-null    object
 3   Segment     793 non-null    object
dtypes: object(4)
memory usage: 24.9+ KB


In [150]:
df_CustomerLocation.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4910 entries, 7468 to 18
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   CustomerId  4910 non-null   object
 1   PostalCode  4910 non-null   object
 2   City        4910 non-null   object
 3   Region      4910 non-null   object
 4   State       4910 non-null   object
 5   Country     4910 non-null   object
dtypes: object(6)
memory usage: 268.5+ KB


In [151]:
df_Product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1894 entries, 0 to 1893
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ProductId      1894 non-null   object 
 1   ProductName    1894 non-null   object 
 2   CatalogPrice   1894 non-null   float64
 3   SupplierPrice  1894 non-null   float64
dtypes: float64(2), object(2)
memory usage: 59.3+ KB


In [152]:
df_Category.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1894 entries, 3512 to 35
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ProductId    1894 non-null   object
 1   Category     1894 non-null   object
 2   SubCategory  1894 non-null   object
dtypes: object(3)
memory usage: 59.2+ KB


## 23: CustomerLocation and CustomerId Distribution <a id="23"></a>

#### check the ratio between CustomerLocation and CustomerId counts. It shows that each customer has averagly almost 7 different locations. 

In [153]:
unique_pairs = df[['PostalCode', 'CustomerId']].drop_duplicates()

unique_pairs = unique_pairs.sort_values(by='CustomerId')

In [154]:
unique_pairs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4910 entries, 1159 to 8341
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PostalCode  4910 non-null   object
 1   CustomerId  4910 non-null   object
dtypes: object(2)
memory usage: 115.1+ KB


In [155]:
unique_pairs.value_counts('CustomerId')

CustomerId
EP-13915    17
JE-15745    13
ZC-21910    13
EA-14035    13
SH-19975    13
            ..
RM-19750     1
JR-15700     1
HH-15010     1
TC-21145     1
MG-18205     1
Name: count, Length: 793, dtype: int64

### It may be about the business model, it is true that each Customer has averagely 7 different Postcodes, which means minimum 7 different locations. 

## 24: Create and Populate Tables <a id="24"></a>

In [156]:
# before populating data to tables, create a new df to see total reveneu of each customer which we will use for ML Clustering with K-means 

df_monetary = df.groupby('CustomerId', as_index=False)['Sales'].sum()
df_monetary.head()

Unnamed: 0,CustomerId,Sales
0,AA-10315,5563.56
1,AA-10375,1056.39
2,AA-10480,1790.51
3,AA-10645,5086.93
4,AB-10015,886.15


In [157]:
# Create new attribute: Recency
# Compute Max Date to detect last transaction 

max_date = max(df['PurchaseDate'])
max_date

Timestamp('2017-12-30 00:00:00')

In [158]:
df['Diff'] = max_date - df['PurchaseDate']

df.head()

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,OldProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,CatalogPrice,SupplierPrice,ProductId,Diff
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.91,Claire,Gute,130.98,110.02,FUR-BO-10005012,417 days
1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.58,Claire,Gute,243.98,170.79,FUR-CH-10005056,417 days
2,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels For Typewriters B...,14.62,2,0.0,6.87,Darrin,Van Huff,7.31,3.87,OFF-LA-10005959,566 days
3,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford Cr4500 Series Slim Rectangular Table,957.58,5,0.45,-383.03,Sean,O'Donnell,348.21,424.82,FUR-TA-10005327,811 days
4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.37,2,0.2,2.52,Sean,O'Donnell,13.98,12.72,OFF-ST-10006337,811 days


In [159]:
# Convert 'Diff' column to numeric days

df['Recency'] = df['Diff'].dt.days
df.head(1)

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,OldProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit,FirstName,LastName,CatalogPrice,SupplierPrice,ProductId,Diff,Recency
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.91,Claire,Gute,130.98,110.02,FUR-BO-10005012,417 days,417


In [160]:
# Group by 'CustomerId' and get the minimum 'Recency' value

df_Recency = df.groupby('CustomerId')['Recency'].min().reset_index()
df_Recency.head()

Unnamed: 0,CustomerId,Recency
0,AA-10315,184
1,AA-10375,19
2,AA-10480,259
3,AA-10645,55
4,AB-10015,415


In [161]:
# Step 1: Calculate total sales (sum of 'Sales') per CustomerId

df_monetary = df.groupby('CustomerId', as_index=False)['Sales'].sum()

# Merge df_Recency into df_monetary based on 'CustomerId'

df_monetary = df_monetary.merge(df_Recency, on='CustomerId', how='left')

# Display the updated df_monetary DataFrame

df_monetary.head()

Unnamed: 0,CustomerId,Sales,Recency
0,AA-10315,5563.56,184
1,AA-10375,1056.39,19
2,AA-10480,1790.51,259
3,AA-10645,5086.93,55
4,AB-10015,886.15,415


In [162]:
# Count the number of PurchaseId values (including duplicates) for each CustomerId in the main DataFrame (df)
df_purchase_frequency = df.groupby('CustomerId')['PurchaseId'].size().reset_index(name='Frequency')

# Merge the frequency into df_monetary
df_monetary = df_monetary.merge(df_purchase_frequency, on='CustomerId', how='left')

# Display the updated df_monetary DataFrame
df_monetary.head()

Unnamed: 0,CustomerId,Sales,Recency,Frequency
0,AA-10315,5563.56,184,11
1,AA-10375,1056.39,19,15
2,AA-10480,1790.51,259,12
3,AA-10645,5086.93,55,18
4,AB-10015,886.15,415,6


In [163]:
df = df.drop(columns=['Diff'])

In [164]:
import sqlite3

#### Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('database.db')
cursor = conn.cursor()

#### SQL commands to create tables 
create_tables_sql = ''' 
-- Create the Customer table 
CREATE TABLE IF NOT EXISTS Customer (
    CustomerId TEXT PRIMARY KEY NOT NULL,                
    FirstName TEXT NOT NULL CHECK(length(FirstName) <= 100), 
    LastName TEXT NOT NULL CHECK(length(LastName) <= 100),   
    Segment TEXT NOT NULL CHECK(length(Segment) <= 50)
);

-- Create the CustomerLocation table
CREATE TABLE IF NOT EXISTS CustomerLocation (
    CustomerId TEXT NOT NULL, 
    PostalCode TEXT NOT NULL,
    City TEXT NOT NULL,
    Region TEXT,
    State TEXT NOT NULL,
    Country TEXT NOT NULL,
    PRIMARY KEY (PostalCode, CustomerId),
    FOREIGN KEY (CustomerId) REFERENCES Customer(CustomerId)
);

-- Create the Purchase table
CREATE TABLE IF NOT EXISTS Purchase (
    PurchaseId TEXT PRIMARY KEY NOT NULL,                
    CustomerId TEXT NOT NULL, 
    PurchaseDate DATE NOT NULL,
    FOREIGN KEY (CustomerId) REFERENCES Customer(CustomerId)
);

-- Create the PurchaseDetail table
CREATE TABLE IF NOT EXISTS PurchaseDetail (
    PurchaseId TEXT NOT NULL,
    ProductId TEXT NOT NULL,
    Quantity INTEGER NOT NULL,
    Sales DECIMAL NOT NULL,
    Discount DECIMAL NOT NULL,
    Profit DECIMAL NOT NULL,
    ShipDate DATE,
    ShipMode TEXT,
    PRIMARY KEY (PurchaseId, ProductId),
    FOREIGN KEY (PurchaseId) REFERENCES Purchase(PurchaseId),
    FOREIGN KEY (ProductId) REFERENCES Product(ProductId)
);

-- Create the Product table
CREATE TABLE IF NOT EXISTS Product (
    ProductId TEXT PRIMARY KEY NOT NULL,
    ProductName TEXT NOT NULL,
    CatalogPrice DECIMAL NOT NULL,
    SupplierPrice DECIMAL NOT NULL
);

-- Create the Category table
CREATE TABLE IF NOT EXISTS Category (
    ProductId TEXT NOT NULL,
    SubCategory TEXT NOT NULL,
    Category TEXT NOT NULL,
    PRIMARY KEY (ProductId, SubCategory),
    FOREIGN KEY (ProductId) REFERENCES Product(ProductId)
);
'''

#### Execute the SQL commands to create tables
cursor.executescript(create_tables_sql)

#### Commit changes and close the connection
conn.commit()

print("Database and tables created successfully.")

Database and tables created successfully.


### Populate data to Tables

In [165]:
df_Customer.to_sql('Customer', conn, if_exists='append', index=False)

df_CustomerLocation.to_sql('CustomerLocation', conn, if_exists='append', index=False)

df_Purchase.to_sql('Purchase', conn, if_exists='append', index=False)

df_PurchaseDetail.to_sql('PurchaseDetail', conn, if_exists='append', index=False)

df_Product.to_sql('Product', conn, if_exists='append', index=False)

df_Category.to_sql('Category', conn, if_exists='append', index=False)

1894

## 25: Run Some Queries to discover data

In [166]:
# Total Reveneu per Category

conn = sqlite3.connect('database.db')
cursor = conn.cursor()

# Define the path to your SQLite database
db_path = 'database.db'

# Connect to the SQLite database
conn = sqlite3.connect(db_path)


query = """
SELECT Category.Category, 
       SUM(PurchaseDetail.Sales) AS TotalRevenue
FROM PurchaseDetail
JOIN Product ON PurchaseDetail.ProductId = Product.ProductId
JOIN Category ON Product.ProductId = Category.ProductId
GROUP BY Category.Category
ORDER BY TotalRevenue DESC
LIMIT 5;
"""

# Execute the query and fetch the results into a DataFrame
pd.read_sql_query(query, conn)

Unnamed: 0,Category,TotalRevenue
0,Technology,836154.02
1,Furniture,741999.73
2,Office Supplies,719046.9
