PHASE 1: ASK



Business Objective: Conduct a comprehensive analysis of the sales performance of an online store, broken down by products, periods of time, countries and customers. From this analysis, the aim is to devise effective strategies to boost sales and optimize future inventory management.

In [2]:
# Guiding questions:

# Which are the most sold products?

# Which products generate the most revenue?

# In which months, days, hours of day, are the most revenue generated? 

# In which countries are most revenue generated?

# Who are the most loyal customers? (considering total spent)

PHASE 2: PREPARE

In [3]:
# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Creating the data frame
sales = pd.read_csv("C:/Users/adria/OneDrive/Documentos/Sales_proyect/Online_Retail.csv", encoding='latin1')

In [5]:
# Checking data types
print(sales.dtypes)

InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID     float64
Country         object
dtype: object


In [6]:
print(sales.describe())

            Quantity      UnitPrice     CustomerID
count  541909.000000  541909.000000  406829.000000
mean        9.552250       4.611114   15287.690570
std       218.081158      96.759853    1713.600303
min    -80995.000000  -11062.060000   12346.000000
25%         1.000000       1.250000   13953.000000
50%         3.000000       2.080000   15152.000000
75%        10.000000       4.130000   16791.000000
max     80995.000000   38970.000000   18287.000000


PHASE 3: CLEANING

In [7]:
# In order to work with complete months, it is necessary to delimit the sales from 1st December 2010 to 30th November 2011

# First, converting column InvoiceDate from object type to datetime type
sales['InvoiceDate'] = pd.to_datetime(sales['InvoiceDate'], format='%d/%m/%Y %H:%M')

# Now, the filter can be applied
sales_clean = sales[sales['InvoiceDate'].between('2010-12-01', '2011-11-30')]

In [8]:
# For this project, I assume most of the NaN values in column CustomerID were purchases attempts (uncompleted purchases), so them won´t be considered.
# The rest of NaN values seem to be missing products, them also won´t be considered for this project.

# Deleting NaN values of CustomerID column
sales_clean = sales_clean.dropna(subset=['CustomerID'])

In [9]:
# Counting NaN values before cleaning
nan_count = sales.isna().sum().sum()
print(f'The total amount of NaN values in CustomerID before cleaning is: {nan_count}')

# Counting NaN values after cleaning
nan_count_2 = sales_clean.isna().sum().sum()
print(f'The total amount of NaN values in CustomerID after cleaning is:{nan_count_2}')

The total amount of NaN values in CustomerID before cleaning is: 136534
The total amount of NaN values in CustomerID after cleaning is:0


In [10]:
# Creating new columns
sales_clean['Revenue'] = sales_clean['Quantity'] * sales['UnitPrice'] #Revenue column

sales_clean['InvoiceMonth'] = sales_clean['InvoiceDate'].dt.month     #InvoiceMonth column

sales_clean['InvoiceDay'] = sales_clean['InvoiceDate'].dt.day_name()   #InvoiceDay column

sales_clean['HourofDay'] = sales_clean['InvoiceDate'].dt.hour          #HourofDay column

In [11]:
# Rearrenging the order of columns
new_order = ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'UnitPrice', 'Revenue', 'InvoiceDate', 'InvoiceMonth', 'InvoiceDay', 'HourofDay', 'Country', 'CustomerID',]
sales_clean = sales_clean[new_order]

PHASE  4: ANALYSIS

In [12]:
 # Monthly revenue 
monthly_revenue = sales_clean.groupby('InvoiceMonth')['Revenue'].sum().sort_values(ascending=False)
print(monthly_revenue)

InvoiceMonth
11    1091886.890
10     974603.590
9      931440.372
5      648251.080
8      616368.000
6      608013.160
3      579964.610
7      574238.481
12     554604.020
1      475074.380
2      436546.150
4      426047.851
Name: Revenue, dtype: float64


In [13]:
 # Daily revenue 
daily_revenue = sales_clean.groupby('InvoiceDay')['Revenue'].sum().sort_values(ascending=False)
print(daily_revenue)

InvoiceDay
Thursday     1812903.010
Tuesday      1519534.391
Wednesday    1420965.370
Monday       1217916.981
Friday       1181620.291
Sunday        764098.541
Name: Revenue, dtype: float64


In [14]:
# Revenue by hour of day
byhour_revenue = sales_clean.groupby('HourofDay')['Revenue'].sum().sort_values(ascending=False)
print(byhour_revenue)

HourofDay
12    1263253.610
10    1114794.561
13    1075601.290
11    1021742.340
14     906073.451
15     854758.880
9      633387.031
16     416465.790
8      263189.530
17     194508.311
18      84614.400
19      42178.600
7       31009.320
20      15958.820
6        -497.350
Name: Revenue, dtype: float64


In [15]:
# Most profitable products
most_profitable = sales_clean.groupby(['Description','StockCode'])['Revenue'].sum().sort_values(ascending=False).head(10)
print(most_profitable)

Description                         StockCode
REGENCY CAKESTAND 3 TIER            22423        128112.07
WHITE HANGING HEART T-LIGHT HOLDER  85123A        91018.51
JUMBO BAG RED RETROSPOT             85099B        80960.40
PARTY BUNTING                       47566         67172.73
POSTAGE                             POST          63475.74
ASSORTED COLOUR BIRD ORNAMENT       84879         54310.48
CHILLI LIGHTS                       79321         43780.14
RABBIT NIGHT LIGHT                  23084         43646.72
PICNIC BASKET WICKER 60 PIECES      22502         39619.50
PAPER CHAIN KIT 50'S CHRISTMAS      22086         36986.96
Name: Revenue, dtype: float64


In [16]:
# Most sold products
most_sold = sales_clean.groupby(['Description','StockCode'])['Quantity'].sum().sort_values(ascending=False).head(10)
print(most_sold)

Description                         StockCode
WORLD WAR 2 GLIDERS ASSTD DESIGNS   84077        51612
JUMBO BAG RED RETROSPOT             85099B       43874
ASSORTED COLOUR BIRD ORNAMENT       84879        33928
WHITE HANGING HEART T-LIGHT HOLDER  85123A       33093
PACK OF 72 RETROSPOT CAKE CASES     21212        32902
MINI PAINT SET VINTAGE              22492        25481
POPCORN HOLDER                      22197        24803
PACK OF 12 LONDON TISSUES           22616        24660
PACK OF 60 PINK PAISLEY CAKE CASES  21977        23823
RABBIT NIGHT LIGHT                  23084        23081
Name: Quantity, dtype: int64


In [17]:
# Products in both rankings
products_both_rankings = most_sold.index.intersection(most_profitable.index)
print(products_both_rankings)

MultiIndex([(           'JUMBO BAG RED RETROSPOT', '85099B'),
            (     'ASSORTED COLOUR BIRD ORNAMENT',  '84879'),
            ('WHITE HANGING HEART T-LIGHT HOLDER', '85123A'),
            (                'RABBIT NIGHT LIGHT',  '23084')],
           names=['Description', 'StockCode'])


In [18]:
# Countries with higher sales
countries_with_most_sales = sales_clean.groupby('Country')['Revenue'].sum().sort_values(ascending=False).head(10)
print(countries_with_most_sales)


Country
United Kingdom    6438733.674
Netherlands        272814.520
EIRE               241714.210
Germany            213214.110
France             186911.950
Australia          137077.270
Switzerland         55739.400
Spain               54503.150
Belgium             39501.530
Sweden              36595.910
Name: Revenue, dtype: float64


In [19]:
# Getting the total spent by Customer
total_spent = sales_clean.groupby('CustomerID')['Revenue'].sum().sort_values(ascending=False).head(10)
print(total_spent)
# This code also considers the cancellations made by customers.

CustomerID
14646.0    267761.00
18102.0    244952.95
17450.0    185919.77
14911.0    124762.32
12415.0    123725.45
14156.0    112623.43
17511.0     80746.06
16684.0     60490.10
13694.0     59521.94
15311.0     58352.77
Name: Revenue, dtype: float64


PHASE 5: SHARE

In [None]:
# Monthly revenue 
monthly_revenue = monthly_revenue.sort_index()
monthly_revenue.plot(kind='bar')
plt.title('Monthly Revenue')
plt.ylabel('Revenue')
plt.show()

# Daily revenue 
order_of_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Sunday']
daily_revenue = daily_revenue.loc[order_of_days]
daily_revenue.plot(kind='bar')
plt.title('Daily Revenue')
plt.ylabel('Revenue')
plt.show()

# Revenue by hour of day
byhour_revenue = byhour_revenue.sort_index()
byhour_revenue.plot(kind='bar')
plt.title('Revenue by Hour of Day')
plt.ylabel('Revenue')
plt.show()

# Most profitable products
most_profitable.plot(kind='bar')
plt.title('Most Profitable Products')
plt.ylabel('Revenue')
plt.show()

# Most sold products
most_sold.plot(kind='bar')
plt.title('Most Sold Products')
plt.ylabel('Quantity')
plt.show()

# Countries with higher sales
countries_with_most_sales.plot(kind='bar')
plt.title('Countries with Higher Sales')
plt.ylabel('Revenue')
plt.show()

# Getting the total spent by Customer
total_spent.plot(kind='bar')
plt.title('Total Spent by Customer')
plt.ylabel('Revenue')
plt.show() 


In [None]:
# Key findings

# The months with most revenue generated are september, october and november. 

# The days with most revenue generated are Thursday and Tueday

# The hours of the day with most revenue generated are between 10:00 and 13:59

# The country wich generates the most revenue is UK

# The most loyal customers are 14646, 18102, 17450

# Products wich are both in most sold products and most profitable products are: JUMBO BAG RED RETROSPOT, ASSORTED COLOUR BIRD ORNAMENT,
# WHITE HANGING HEART T-LIGHT HOLDER and RABBIT NIGHT LIGHT.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Preparar los datos
X = sales_clean[['InvoiceMonth']]  # Usamos el mes como característica
y = sales_clean[['Revenue']]  # Variable objetivo (ventas)

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inicializar el modelo de regresión lineal
model = LinearRegression()

# Entrenar el modelo con los datos de entrenamiento
model.fit(X_train, y_train)

# Hacer predicciones con los datos de prueba
y_pred = model.predict(X_test)

# Evaluar el rendimiento del modelo
mse = mean_squared_error(y_test, y_pred)
print("Error cuadrático medio:", mse)

# Ejemplo de predicción futura (reemplaza los valores de las características con los valores que desees predecir)
nuevo_mes = 4  # Por ejemplo, para predecir las ventas en agosto
nueva_caracteristica = [[nuevo_mes]]
prediccion = model.predict(nueva_caracteristica)
print("Predicción de ventas en el mes", nuevo_mes, ":", prediccion)
