In [26]:
import os
import sys
import boto3
import warnings
import pandas as pd

from dotenv import load_dotenv
from pathlib import Path

# defining functions module path
module_path = str(Path.cwd().parents[0] / "src")
if module_path not in sys.path:
    sys.path.append(module_path)

from functions import connect_to_redshift

In [2]:
# removing warning messages
warnings.filterwarnings("ignore")

In [3]:
# Loading my .env file
load_dotenv()

True

In [4]:
# defining redshift credentials
dbname = os.getenv("dbname")
host = os.getenv("host")
port = os.getenv("port")
user = os.getenv("user")
password = os.getenv("password")

In [5]:
# establishing redshift connection
rs_connection = connect_to_redshift(dbname, host, port, user, password)

connection to redshift made


In [6]:
query = """select *
           from bootcamp.online_transactions_cleaned
        """

In [7]:
ot_cleaned = pd.read_sql(query, rs_connection)

In [8]:
ot_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399841 entries, 0 to 399840
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   invoice            399841 non-null  object        
 1   stock_code         399841 non-null  object        
 2   description        399841 non-null  object        
 3   price              399841 non-null  float64       
 4   quantity           399841 non-null  int64         
 5   total_order_value  399841 non-null  float64       
 6   invoice_date       399841 non-null  datetime64[ns]
 7   customer_id        399841 non-null  object        
 8   country            399841 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 27.5+ MB


In [9]:
ot_cleaned.describe()

Unnamed: 0,price,quantity,total_order_value
count,399841.0,399841.0,399841.0
mean,2.952914,12.224359,20.716904
std,7.317593,250.78886,425.471765
min,0.0,-80995.0,-168469.6
25%,1.25,2.0,4.25
50%,1.95,5.0,11.58
75%,3.75,12.0,19.5
max,1599.26,80995.0,168469.6


In [10]:
print(f"There are {(ot_cleaned.quantity < 0).sum()} entries with negative quantity")

There are 8507 entries with negative quantity


### The total order value is equal to price * quantity, therefore the negavite values in this column

In [11]:
# Checking is all the invoice with negative quantity start with a letter C 
ot_cleaned['invoice'][ot_cleaned.quantity < 0].str.startswith('C').sum()

8507

### I will consider all the entries with negative quantity as returns. They have a C in the invoice code, which I will assume refers to a CANCELLED transaction

In [12]:
ot_cancelled = ot_cleaned[ot_cleaned.invoice.str.startswith('C')]

In [13]:
ot_cleaned.drop(ot_cancelled.index, axis=0, inplace=True)

### Let's explore each column

In [14]:
print(f"The data contains information about {ot_cleaned.invoice.nunique()} online purchases made from {ot_cleaned.invoice_date.min()} to {ot_cleaned.invoice_date.max()}")

The data contains information about 18408 online purchases made from 2010-12-01 08:26:00 to 2011-12-09 12:50:00


In [15]:
print(f"from which {ot_cancelled.shape[0]} are cancelled orders")

from which 8507 are cancelled orders


In [16]:
print(f"A total of {ot_cleaned.stock_code.count()} stocks were sold from which {ot_cleaned.stock_code.nunique()} are different")

A total of 391334 stocks were sold from which 3662 are different


In [17]:
print(f"{ot_cancelled.stock_code.nunique()} stocks were return at least once")

1916 stocks were return at least once


In [18]:
inv_country = ot_cleaned.groupby('country').count()['invoice'].sort_values(ascending=False)

In [19]:
print(f"The invoices were purchased in {len(inv_country.index)} countries being {inv_country.index[0]} the country with the most purchases ({inv_country[0]}) and {inv_country.index[-1]} with the least ({inv_country[-1]})")

The invoices were purchased in 37 countries being United Kingdom the country with the most purchases (348940) and Saudi Arabia with the least (9)


In [20]:
print(f"The average price of the products is {round(ot_cleaned.price.mean(),2)} pounds")

The average price of the products is 2.92 pounds


In [21]:
print(f"The average ordered value per invoice is {round(ot_cleaned.total_order_value.mean(), 2)} pounds")

The average ordered value per invoice is 22.37 pounds


In [22]:
print(f"In total, there are {ot_cleaned.customer_id.nunique()} costumers")

In total, there are 4335 costumers


### Let's explore some KPIs

In [23]:
# Line Items (Number of products per invoice)
line_items = ot_cleaned[ot_cleaned.quantity > 0].groupby("invoice")['stock_code'].count()

In [24]:
print(f"The minimum items per invoice is {line_items.min()}, the maximum is {line_items.max()}. There are in average {round(line_items.mean())} stocks per invoice")

The minimum items per invoice is 1, the maximum is 542. There are in average 21 stocks per invoice


In [25]:
print(f"The most popular stock is {} purchased {} times")

SyntaxError: f-string: empty expression not allowed (3953900477.py, line 1)

In [27]:
pop_idx = ot_cleaned.groupby("stock_code").sum()['quantity'].sort_values(ascending=False).index[0]

In [28]:
print("The most popular item is: ")
ot_cleaned[ot_cleaned['stock_code'] == pop_idx]

The most popular item is: 


Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
394635,581483,23843,"PAPER CRAFT , LITTLE BIRDIE",2.08,80995,168469.6,2011-12-09 09:15:00,u16446,United Kingdom
