### Index
1.  [Load & Discover Data](#1)<br>
#### Review Columns and Values
2.  [PurchaseId Column](#2) <br>
3.  [PurchaseDate - ShipDate Columns](#3) <br>

### 1: Load & Discover Data<a id="1"></a>

In [1]:
# Import required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sqlite3
from sqlite3 import Error

# Notebook-wide plotting theme and pandas display settings
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200) # show up to 200 columns before pandas truncates wide frames

In [2]:
# Sniff the text encoding of the dataset before handing it to pandas

import chardet

# Only the first 10,000 bytes are sampled; the guess is made from that prefix alone
with open('Sample-Superstore.csv', 'rb') as file:
    rawdata = file.read(10000)

# The file handle is no longer needed once the sample has been read
result = chardet.detect(rawdata)

result

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

In [3]:
# Load the data with the encoding reported by the detection step (ISO-8859-1).
# The encoding is pinned explicitly so the load does not depend on re-running the detection.
# read_csv already returns a brand-new frame, so no extra .copy() is needed;
# a comma separator and a header on the first line are the pandas defaults.

df = pd.read_csv('Sample-Superstore.csv', encoding='ISO-8859-1')

In [4]:
# (rows, columns) of the freshly loaded frame
df.shape

(9994, 21)

In [6]:
# Transpose the first five rows so each column reads as a row (easier to scan a wide frame)
df.head().T

Unnamed: 0,0,1,2,3,4
Row ID,1,2,3,4,5
Order ID,CA-2016-152156,CA-2016-152156,CA-2016-138688,US-2015-108966,US-2015-108966
Order Date,11/8/2016,11/8/2016,6/12/2016,10/11/2015,10/11/2015
Ship Date,11/11/2016,11/11/2016,6/16/2016,10/18/2015,10/18/2015
Ship Mode,Second Class,Second Class,Second Class,Standard Class,Standard Class
Customer ID,CG-12520,CG-12520,DV-13045,SO-20335,SO-20335
Customer Name,Claire Gute,Claire Gute,Darrin Van Huff,Sean O'Donnell,Sean O'Donnell
Segment,Consumer,Consumer,Corporate,Consumer,Consumer
Country,United States,United States,United States,United States,United States
City,Henderson,Henderson,Los Angeles,Fort Lauderdale,Fort Lauderdale


In [7]:
# Remove the 'Row ID' column (in the preview it appears to be a 1-based running row counter).
# errors='ignore' keeps this cell re-runnable: without it, a second run raises KeyError
# because the column is already gone.
df = df.drop(columns=['Row ID'], errors='ignore')

In [8]:
# Rename columns to PascalCase identifiers without spaces or hyphens.
# Only columns whose label actually changes are listed; Segment, Country, City, State,
# Region, Category, Sales, Quantity, Discount and Profit already have their final names.

COLUMN_RENAMES = {
    'Order ID': 'PurchaseId',
    'Order Date': 'PurchaseDate',
    'Ship Date': 'ShipDate',
    'Ship Mode': 'ShipMode',
    'Customer ID': 'CustomerId',
    'Customer Name': 'CustomerName',
    'Postal Code': 'PostalCode',
    'Product ID': 'ProductId',
    'Sub-Category': 'SubCategory',
    'Product Name': 'ProductName',
}

# Reassign instead of using inplace=True: the result is the same frame contents,
# and labels missing from the frame are silently skipped, so re-running is harmless.
df = df.rename(columns=COLUMN_RENAMES)

In [9]:
# Column names, non-null counts and dtypes after the rename
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PurchaseId    9994 non-null   object 
 1   PurchaseDate  9994 non-null   object 
 2   ShipDate      9994 non-null   object 
 3   ShipMode      9994 non-null   object 
 4   CustomerId    9994 non-null   object 
 5   CustomerName  9994 non-null   object 
 6   Segment       9994 non-null   object 
 7   Country       9994 non-null   object 
 8   City          9994 non-null   object 
 9   State         9994 non-null   object 
 10  PostalCode    9994 non-null   int64  
 11  Region        9994 non-null   object 
 12  ProductId     9994 non-null   object 
 13  Category      9994 non-null   object 
 14  SubCategory   9994 non-null   object 
 15  ProductName   9994 non-null   object 
 16  Sales         9994 non-null   float64
 17  Quantity      9994 non-null   int64  
 18  Discount      9994 non-null 

In [10]:
# Dtypes as parsed by read_csv, before any explicit conversion
df.dtypes

PurchaseId       object
PurchaseDate     object
ShipDate         object
ShipMode         object
CustomerId       object
CustomerName     object
Segment          object
Country          object
City             object
State            object
PostalCode        int64
Region           object
ProductId        object
Category         object
SubCategory      object
ProductName      object
Sales           float64
Quantity          int64
Discount        float64
Profit          float64
dtype: object

In [11]:
# Convert data types for columns
#
# Text columns (PurchaseId, ShipMode, CustomerId, CustomerName, Segment, Country, City,
# State, Region, ProductId, Category, SubCategory, ProductName) are already read as
# object/str and are deliberately left untouched: casting them with astype(str) would turn
# any missing value into the literal string 'nan' and hide it from later isna() checks.

# Dates in the file look like 11/8/2016 (month/day/year, no zero padding).
# An explicit format avoids per-value guessing and fails loudly on a stray format.
DATE_FORMAT = '%m/%d/%Y'

for date_col in ('PurchaseDate', 'ShipDate'):
    df[date_col] = pd.to_datetime(df[date_col], format=DATE_FORMAT)

# Postal codes are identifiers, not quantities. They were parsed as int64, which strips
# leading zeros, so restore the 5-digit form (all rows are in the United States).
df['PostalCode'] = df['PostalCode'].astype(str).str.zfill(5)

# Numeric measures: make the intended dtypes explicit in a single cast.
df = df.astype({
    'Sales': float,
    'Quantity': int,
    'Discount': float,
    'Profit': float,
})

In [12]:
# Number of missing values per column
df.isna().sum()

PurchaseId      0
PurchaseDate    0
ShipDate        0
ShipMode        0
CustomerId      0
CustomerName    0
Segment         0
Country         0
City            0
State           0
PostalCode      0
Region          0
ProductId       0
Category        0
SubCategory     0
ProductName     0
Sales           0
Quantity        0
Discount        0
Profit          0
dtype: int64

In [13]:
# Number of distinct values per column
df.nunique()

PurchaseId      5009
PurchaseDate    1237
ShipDate        1334
ShipMode           4
CustomerId       793
CustomerName     793
Segment            3
Country            1
City             531
State             49
PostalCode       631
Region             4
ProductId       1862
Category           3
SubCategory       17
ProductName     1850
Sales           5825
Quantity          14
Discount          12
Profit          7287
dtype: int64

In [14]:
# Column labels currently present in the frame
df.columns

Index(['PurchaseId', 'PurchaseDate', 'ShipDate', 'ShipMode', 'CustomerId',
       'CustomerName', 'Segment', 'Country', 'City', 'State', 'PostalCode',
       'Region', 'ProductId', 'Category', 'SubCategory', 'ProductName',
       'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

### 2: PurchaseId<a id="2"></a>

In [15]:
# Check that every entry has the same format, e.g. CA-2016-152156
# (two uppercase letters, four digits, six digits, separated by hyphens)

# NOTE(review): `re` is not used in this cell (pandas takes the pattern as a plain string);
# the import is kept in case later cells rely on it.
import re

pattern = r'^[A-Z]{2}-\d{4}-\d{6}$'

# fullmatch rather than match: with match, `$` also matches just before a trailing
# newline, so a value such as 'CA-2016-152156\n' would wrongly pass the check.
# The rows displayed are the ones that do NOT follow the pattern.
df[~df['PurchaseId'].str.fullmatch(pattern)]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit


In [16]:
# Strip leading and trailing whitespace from every PurchaseId

trimmed_ids = df['PurchaseId'].str.strip()
df['PurchaseId'] = trimmed_ids

In [17]:
# Normalise every PurchaseId to uppercase

uppercased_ids = df['PurchaseId'].str.upper()
df['PurchaseId'] = uppercased_ids

In [18]:
# Check whether any ID contains a character other than an uppercase letter, a digit or a hyphen

has_unexpected_char = df['PurchaseId'].str.contains(r'[^A-Z0-9-]')
df[has_unexpected_char]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit


In [19]:
# Check that every ID carries a year segment starting with 20 between hyphens (e.g. -2016-);
# the rows displayed are the ones that do not

has_20xx_year = df['PurchaseId'].str.contains(r'-20\d{2}-')
df[~has_20xx_year]

Unnamed: 0,PurchaseId,PurchaseDate,ShipDate,ShipMode,CustomerId,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductId,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit


### 3: PurchaseDate - ShipDate<a id="3"></a>