# Introduction to Data Analysis Part 2: Mathematical Operations with Data Using Pandas and NumPy

## Introduction

## Importing Pandas and NumPy

In [1]:
import pandas as pd
import numpy as np

## Import Data From Excel With Default (Inferred) Data Types

In [2]:
# First pass: load the "orders-export" worksheet without specifying dtypes.
# skiprows=1 discards the first spreadsheet row before the header is read.
sales_data = pd.read_excel(
    "data/orders-export.xlsx",
    sheet_name="orders-export",
    skiprows=1,
)

## Validating the Data

### What the Data Looks Like

In [3]:
# Preview the first five rows of the freshly imported data.
sales_data.head(5)

Unnamed: 0,order_id,order_date,customer_id,customer_first_name,customer_last_name,customer_gender,customer_city,customer_country,item_description,item_qty,item_price,order_currency,order_vat_rate
0,200001,23/04/2020,1001,Erin,Evans,Female,Swansea,United Kingdom,Tequila - Sauza Silver,45,2.27,GBP,20
1,200002,19/07/2021,1001,Erin,Evans,Female,Swansea,United Kingdom,Oil - Pumpkinseed,6,3.94,GBP,20
2,200003,31/10/2020,1001,Erin,Evans,Female,Swansea,United Kingdom,"Nut - Cashews, Whole, Raw",15,1.78,GBP,20
3,200004,14/08/2020,1001,Erin,Evans,Female,Swansea,United Kingdom,Wine - Coteaux Du Tricastin Ac,1,3.63,GBP,20
4,200005,01/05/2021,1001,Erin,Evans,Female,Swansea,United Kingdom,Sambuca - Ramazzotti,43,3.85,GBP,20


### Check the Column Data Types

In [4]:
# Show the dtype of every column as read by the import above
# (no dtype or converter was passed, so these are pandas' inferred types).
sales_data.dtypes

order_id                 int64
order_date              object
customer_id              int64
customer_first_name     object
customer_last_name      object
customer_gender         object
customer_city           object
customer_country        object
item_description        object
item_qty                 int64
item_price             float64
order_currency          object
order_vat_rate           int64
dtype: object

## Import Data From Excel Again and Set / Convert the Data Types During Import

In [5]:

def _parse_order_date(date):
    """Parse a day-first date value (e.g. 23/04/2020) with pd.to_datetime."""
    return pd.to_datetime(date, dayfirst=True)


def _normalise_item_description(description):
    """Replace every comma in an item description with " -"."""
    return description.replace(",", " -")


# Explicit dtype for each column that is read without a converter.
column_dtypes = {
    "order_id": np.int64,
    "customer_id": np.int64,
    "customer_first_name": str,
    "customer_last_name": str,
    "customer_gender": str,
    "customer_city": str,
    "customer_country": str,
    "item_qty": np.int64,
    "item_price": np.float64,
    "order_currency": str,
    "order_vat_rate": np.float64,
}

# Columns whose values are transformed one by one while the sheet is read.
column_converters = {
    "order_date": _parse_order_date,
    "item_description": _normalise_item_description,
}

# Re-import the same worksheet, this time fixing the types during the read.
sales_data = pd.read_excel(
    "data/orders-export.xlsx",
    sheet_name="orders-export",
    skiprows=1,
    dtype=column_dtypes,
    converters=column_converters,
)

In [6]:
# Preview the first five rows after the typed import.
sales_data.head(n = 5)

Unnamed: 0,order_id,order_date,customer_id,customer_first_name,customer_last_name,customer_gender,customer_city,customer_country,item_description,item_qty,item_price,order_currency,order_vat_rate
0,200001,2020-04-23,1001,Erin,Evans,Female,Swansea,United Kingdom,Tequila - Sauza Silver,45,2.27,GBP,20.0
1,200002,2021-07-19,1001,Erin,Evans,Female,Swansea,United Kingdom,Oil - Pumpkinseed,6,3.94,GBP,20.0
2,200003,2020-10-31,1001,Erin,Evans,Female,Swansea,United Kingdom,Nut - Cashews - Whole - Raw,15,1.78,GBP,20.0
3,200004,2020-08-14,1001,Erin,Evans,Female,Swansea,United Kingdom,Wine - Coteaux Du Tricastin Ac,1,3.63,GBP,20.0
4,200005,2021-05-01,1001,Erin,Evans,Female,Swansea,United Kingdom,Sambuca - Ramazzotti,43,3.85,GBP,20.0


In [7]:
# Confirm the dtypes after re-importing with explicit dtype / converters.
sales_data.dtypes

order_id                        int64
order_date             datetime64[ns]
customer_id                     int64
customer_first_name            object
customer_last_name             object
customer_gender                object
customer_city                  object
customer_country               object
item_description               object
item_qty                        int64
item_price                    float64
order_currency                 object
order_vat_rate                float64
dtype: object

### Check for NaN (Null) Values

In [8]:
# Count the missing values in each column (True cells of the mask sum to 1 each).
missing_value_mask = sales_data.isna()
missing_value_mask.sum()

order_id               0
order_date             0
customer_id            0
customer_first_name    0
customer_last_name     0
customer_gender        0
customer_city          0
customer_country       0
item_description       0
item_qty               0
item_price             0
order_currency         0
order_vat_rate         0
dtype: int64

## Adding Custom Columns

- Convert EUR to GBP
- total ex VAT = qty * price
- total VAT = total ex VAT * (VAT rate / 100)
- total inc VAT = total ex VAT + total VAT
  - Show examples with Pandas and NumPy

### Pandas vs. NumPy for Mathematical Operations

In [9]:
# --- Order total ex VAT (Local currency) - pandas Series multiplication (`*`), rounded with np.around
# The two Series are multiplied element-wise and the result is rounded to 2 decimals.
# %timeit runs the statement many times; because it assigns into sales_data by key,
# the benchmark column is present on the DataFrame once the cell has finished.
%timeit sales_data["order_total_ex_vat_local_currency_pandas"] =  np.around(sales_data["item_qty"] * sales_data["item_price"], \
                                                                            decimals = 2)

220 µs ± 7.11 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
# --- Order total ex VAT (Local currency) - np multiply by reading from pandas dataframe
# Same calculation as the previous benchmark, but the multiplication goes through
# np.multiply while the operands are still pandas Series.
# %timeit runs the statement many times; because it assigns into sales_data by key,
# the benchmark column is present on the DataFrame once the cell has finished.
%timeit sales_data["order_total_ex_vat_local_currency_mixed"] =  np.around(np.multiply(sales_data["item_qty"], \
                                                                                       sales_data["item_price"]), \
                                                                                       decimals = 2)

247 µs ± 15.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [11]:
# --- Order total ex VAT (Local currency) - np multiply by reading from np arrays
%timeit sales_data_item_qty = np.array(sales_data["item_qty"])
sales_data_item_qty = np.array(sales_data["item_qty"])

%timeit sales_data_item_price = np.array(sales_data["item_price"])
sales_data_item_price = np.array(sales_data["item_price"])

%timeit sales_data_order_total_ex_vat_local_currency = np.around(np.multiply(sales_data_item_qty, \
                                                                 sales_data_item_price), \
                                                                 decimals = 2)

sales_data_order_total_ex_vat_local_currency = np.around(np.multiply(sales_data_item_qty, \
                                                         sales_data_item_price), \
                                                         decimals = 2)

%timeit sales_data["order_total_ex_vat_local_currency"] = sales_data_order_total_ex_vat_local_currency

6.64 µs ± 56.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
6.64 µs ± 39.5 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
18.1 µs ± 248 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
23.4 µs ± 727 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [12]:
# Show the first row to compare the three benchmark result columns.
sales_data.head(1)

Unnamed: 0,order_id,order_date,customer_id,customer_first_name,customer_last_name,customer_gender,customer_city,customer_country,item_description,item_qty,item_price,order_currency,order_vat_rate,order_total_ex_vat_local_currency_pandas,order_total_ex_vat_local_currency_mixed,order_total_ex_vat_local_currency
0,200001,2020-04-23,1001,Erin,Evans,Female,Swansea,United Kingdom,Tequila - Sauza Silver,45,2.27,GBP,20.0,102.15,102.15,102.15


In [13]:
# Remove the two columns that were only created for the timing comparison;
# the NumPy-based "order_total_ex_vat_local_currency" column is the one that is kept.
benchmark_columns = ["order_total_ex_vat_local_currency_pandas",
                     "order_total_ex_vat_local_currency_mixed"]

# Rebinding the result (instead of inplace = True) together with errors = "ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than a KeyError
# for columns that are already gone.
sales_data = sales_data.drop(columns = benchmark_columns,
                             errors = "ignore")

sales_data.head(n = 1)

Unnamed: 0,order_id,order_date,customer_id,customer_first_name,customer_last_name,customer_gender,customer_city,customer_country,item_description,item_qty,item_price,order_currency,order_vat_rate,order_total_ex_vat_local_currency
0,200001,2020-04-23,1001,Erin,Evans,Female,Swansea,United Kingdom,Tequila - Sauza Silver,45,2.27,GBP,20.0,102.15


In [14]:
# --- Order VAT (Local currency)
# VAT amount = (total ex VAT * VAT rate) / 100, rounded to 2 decimals.
sales_data_order_vat_rate = np.array(sales_data["order_vat_rate"])

vat_unrounded = np.multiply(sales_data_order_total_ex_vat_local_currency, sales_data_order_vat_rate) / 100
sales_data_order_total_vat_local_currency = np.around(vat_unrounded, decimals = 2)

# Store the NumPy result as a new DataFrame column.
sales_data["order_total_vat_local_currency"] = sales_data_order_total_vat_local_currency

In [15]:
# --- Order total inc VAT (Local currency)
# Total inc VAT = total ex VAT + VAT amount, rounded to 2 decimals.
inc_vat_unrounded = np.add(sales_data_order_total_ex_vat_local_currency, sales_data_order_total_vat_local_currency)
sales_data_order_total_inc_vat_local_currency = np.around(inc_vat_unrounded, decimals = 2)

# Store the NumPy result as a new DataFrame column.
sales_data["order_total_inc_vat_local_currency"] = sales_data_order_total_inc_vat_local_currency

### Create a Currency Conversion Rate Column

In [16]:
# List the distinct order currencies and how many rows use each one.
order_currency_column = sales_data["order_currency"]
order_currency_column.value_counts()

GBP    4000
EUR    1000
Name: order_currency, dtype: int64

In [17]:
# --- Conversion rate
# Multiplier applied to local-currency amounts to express them in GBP.
gbp_conversion_rates = {"GBP": 1.00,
                        "EUR": 0.84}

# Series.map does the dictionary lookup for the whole column at once instead of
# calling a Python lambda per row. A currency that is not in the table maps to NaN
# and is then given the same 1.00 fallback the original expression used.
sales_data["order_currency_conversion_rate"] = (sales_data["order_currency"]
                                                .map(gbp_conversion_rates)
                                                .fillna(1.00))

In [18]:
# First row: check the conversion rate assigned to this order.
sales_data.head(1)

Unnamed: 0,order_id,order_date,customer_id,customer_first_name,customer_last_name,customer_gender,customer_city,customer_country,item_description,item_qty,item_price,order_currency,order_vat_rate,order_total_ex_vat_local_currency,order_total_vat_local_currency,order_total_inc_vat_local_currency,order_currency_conversion_rate
0,200001,2020-04-23,1001,Erin,Evans,Female,Swansea,United Kingdom,Tequila - Sauza Silver,45,2.27,GBP,20.0,102.15,20.43,122.58,1.0


In [19]:
# Last row: check the conversion rate assigned to this order.
sales_data.tail(1)

Unnamed: 0,order_id,order_date,customer_id,customer_first_name,customer_last_name,customer_gender,customer_city,customer_country,item_description,item_qty,item_price,order_currency,order_vat_rate,order_total_ex_vat_local_currency,order_total_vat_local_currency,order_total_inc_vat_local_currency,order_currency_conversion_rate
4999,205001,2021-06-05,1005,Marie,Braam,Other,Utrecht,Netherlands,Tabasco Sauce - 2 Oz,32,1.89,EUR,21.0,60.48,12.7,73.18,0.84


### Create the Totals Columns For Converted Currency

In [20]:
# Copy the conversion-rate column into a NumPy array for the calculations below.
conversion_rate_column = sales_data["order_currency_conversion_rate"]
sales_data_currency_conversion_rate = np.array(conversion_rate_column)

In [21]:
# --- ex vat converted
# Total ex VAT (local currency) * conversion rate, rounded to 2 decimals.
ex_vat_converted_unrounded = np.multiply(sales_data_order_total_ex_vat_local_currency,
                                         sales_data_currency_conversion_rate)

sales_data["order_total_ex_vat_converted_gbp"] = np.around(ex_vat_converted_unrounded, decimals = 2)

In [22]:
# --- total vat converted
# VAT amount (local currency) * conversion rate, rounded to 2 decimals.
vat_converted_unrounded = np.multiply(sales_data_order_total_vat_local_currency,
                                      sales_data_currency_conversion_rate)

sales_data["order_total_vat_converted_gbp"] = np.around(vat_converted_unrounded, decimals = 2)

In [23]:
# --- total inc vat converted
# Total inc VAT (local currency) * conversion rate, rounded to 2 decimals.
inc_vat_converted_unrounded = np.multiply(sales_data_order_total_inc_vat_local_currency,
                                          sales_data_currency_conversion_rate)

sales_data["order_total_inc_vat_converted_gbp"] = np.around(inc_vat_converted_unrounded, decimals = 2)

In [24]:
# First row: inspect the converted totals.
sales_data.head(1)

Unnamed: 0,order_id,order_date,customer_id,customer_first_name,customer_last_name,customer_gender,customer_city,customer_country,item_description,item_qty,item_price,order_currency,order_vat_rate,order_total_ex_vat_local_currency,order_total_vat_local_currency,order_total_inc_vat_local_currency,order_currency_conversion_rate,order_total_ex_vat_converted_gbp,order_total_vat_converted_gbp,order_total_inc_vat_converted_gbp
0,200001,2020-04-23,1001,Erin,Evans,Female,Swansea,United Kingdom,Tequila - Sauza Silver,45,2.27,GBP,20.0,102.15,20.43,122.58,1.0,102.15,20.43,122.58


In [25]:
# Last row: inspect the converted totals.
sales_data.tail(1)

Unnamed: 0,order_id,order_date,customer_id,customer_first_name,customer_last_name,customer_gender,customer_city,customer_country,item_description,item_qty,item_price,order_currency,order_vat_rate,order_total_ex_vat_local_currency,order_total_vat_local_currency,order_total_inc_vat_local_currency,order_currency_conversion_rate,order_total_ex_vat_converted_gbp,order_total_vat_converted_gbp,order_total_inc_vat_converted_gbp
4999,205001,2021-06-05,1005,Marie,Braam,Other,Utrecht,Netherlands,Tabasco Sauce - 2 Oz,32,1.89,EUR,21.0,60.48,12.7,73.18,0.84,50.8,10.67,61.47


In [26]:
# Check the dtypes again now that the calculated columns have been added.
sales_data.dtypes

order_id                                       int64
order_date                            datetime64[ns]
customer_id                                    int64
customer_first_name                           object
customer_last_name                            object
customer_gender                               object
customer_city                                 object
customer_country                              object
item_description                              object
item_qty                                       int64
item_price                                   float64
order_currency                                object
order_vat_rate                               float64
order_total_ex_vat_local_currency            float64
order_total_vat_local_currency               float64
order_total_inc_vat_local_currency           float64
order_currency_conversion_rate               float64
order_total_ex_vat_converted_gbp             float64
order_total_vat_converted_gbp                float64
order_total_inc_vat_converted_gbp            float64
dtype: object

In [27]:
# Count the missing values per column, including the calculated columns.
missing_value_mask = sales_data.isna()
missing_value_mask.sum()

order_id                              0
order_date                            0
customer_id                           0
customer_first_name                   0
customer_last_name                    0
customer_gender                       0
customer_city                         0
customer_country                      0
item_description                      0
item_qty                              0
item_price                            0
order_currency                        0
order_vat_rate                        0
order_total_ex_vat_local_currency     0
order_total_vat_local_currency        0
order_total_inc_vat_local_currency    0
order_currency_conversion_rate        0
order_total_ex_vat_converted_gbp      0
order_total_vat_converted_gbp         0
order_total_inc_vat_converted_gbp     0
dtype: int64

## Exporting data back to Excel

In [28]:
# Write the enriched DataFrame to a new workbook using the xlsxwriter engine.
# Both date and datetime cells are formatted as YYYY-MM-DD.
output_path = "data/order_data_with_totals.xlsx"
excel_date_format = "YYYY-MM-DD"

with pd.ExcelWriter(
    output_path,
    engine="xlsxwriter",
    date_format=excel_date_format,
    datetime_format=excel_date_format,
) as writer:
    # index=False leaves the DataFrame's row index out of the sheet.
    sales_data.to_excel(
        writer,
        sheet_name="order_data_with_totals",
        index=False,
    )