In [1]:
# packages for data manipulation

import numpy as np
import pandas as pd

# packages for visualisation
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Read in sales data
sales = pd.read_csv('sales_data_sample.csv', encoding='ISO-8859-1')

In [10]:
with open('sales_data_sample.csv') as f:
    print(f)

<_io.TextIOWrapper name='sales_data_sample.csv' mode='r' encoding='UTF-8'>


In [11]:
sales.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


In [28]:
# Total sales by month
sales.groupby('MONTH_ID').SALES.sum()

MONTH_ID
1      785874.44
2      810441.90
3      754501.39
4      669390.96
5      923972.56
6      454756.78
7      514875.97
8      659310.57
9      584724.27
10    1121215.22
11    2118885.67
12     634679.12
Name: SALES, dtype: float64

In [30]:
# Total sales by country
sales.groupby('COUNTRY').QUANTITYORDERED.sum()

COUNTRY
Australia       6246
Austria         1974
Belgium         1074
Canada          2293
Denmark         2197
Finland         3192
France         11090
Germany         2148
Ireland          490
Italy           3773
Japan           1842
Norway          2842
Philippines      961
Singapore       2760
Spain          12429
Sweden          2006
Switzerland     1078
UK              5013
USA            35659
Name: QUANTITYORDERED, dtype: int64

In [31]:
# Total sales by year
sales.groupby('YEAR_ID').SALES.sum()

YEAR_ID
2003    3516979.54
2004    4724162.60
2005    1791486.71
Name: SALES, dtype: float64

In [32]:
# Create a new column with clients' full names
sales['NAME'] = sales['CONTACTFIRSTNAME'] + sales['CONTACTLASTNAME']

In [33]:
sales.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE,NAME
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,,NYC,NY,10022.0,USA,,Yu,Kwai,Small,KwaiYu
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small,PaulHenriot
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium,DanielDa Cunha
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium,JulieYoung
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,,San Francisco,CA,,USA,,Brown,Julie,Medium,JulieBrown


In [43]:
# Groupby client name (NAME), count the number of previous orders
previous_orders = sales.groupby('NAME', as_index=False)['ORDERNUMBER'].count()

In [41]:
# rename the columns
previous_orders.columns = ['NAME', 'Number_orders']

In [42]:
previous_orders.head()

Unnamed: 0,NAME,Number_orders
0,AdrianHuxley,46
1,AkikoShimamura,32
2,AllenNelson,24
3,AnnBrown,26
4,AnnaO'Hara,46


In [46]:
# add this new dataframe to the sales dataframe using a left-merge (keeps all rows of sales DF)
dfinal = sales.merge(previous_orders, on="NAME", how='left')

In [50]:
# rename columns
dfinal.columns = ['ORDERNUMBER' if x=='ORDERNUMBER_x' else x for x in dfinal.columns]
dfinal.columns = ['Total_Customer_Orders' if x=='ORDERNUMBER_y' else x for x in dfinal.columns]
dfinal.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE,NAME,Total_Customer_Orders
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,NYC,NY,10022.0,USA,,Yu,Kwai,Small,KwaiYu,49
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,Reims,,51100.0,France,EMEA,Henriot,Paul,Small,PaulHenriot,41
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium,DanielDa Cunha,20
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,Pasadena,CA,90003.0,USA,,Young,Julie,Medium,JulieYoung,30
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,San Francisco,CA,,USA,,Brown,Julie,Medium,JulieBrown,41


In [51]:
# Earliest order
dfinal.ORDERDATE.min()

'1/10/2003 0:00'

In [58]:
# Testing extracting rows with a specific month and year
test_slice = dfinal[(dfinal['MONTH_ID'] == 1) & (dfinal['YEAR_ID'] == 2003)]

In [59]:
test_slice.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE,NAME,Total_Customer_Orders
26,10103,26,100.0,11,5404.62,1/29/2003 0:00,Shipped,1,1,2003,...,Stavern,,4110,Norway,EMEA,Bergulfsen,Jonas,Medium,JonasBergulfsen,32
133,10103,42,100.0,4,5398.26,1/29/2003 0:00,Shipped,1,1,2003,...,Stavern,,4110,Norway,EMEA,Bergulfsen,Jonas,Medium,JonasBergulfsen,32
212,10103,27,100.0,8,3394.98,1/29/2003 0:00,Shipped,1,1,2003,...,Stavern,,4110,Norway,EMEA,Bergulfsen,Jonas,Medium,JonasBergulfsen,32
266,10104,34,100.0,1,5958.5,1/31/2003 0:00,Shipped,1,1,2003,...,Madrid,,28034,Spain,EMEA,Freyre,Diego,Medium,DiegoFreyre,259
367,10104,41,100.0,9,4615.78,1/31/2003 0:00,Shipped,1,1,2003,...,Madrid,,28034,Spain,EMEA,Freyre,Diego,Medium,DiegoFreyre,259


In [78]:
test_slice.SALES.sum()

129753.6

In [81]:
# Feature: Total sales last month vs prev month
# function returns percentage of last month compared to the previous month
def total_sales_lastvsprev_month(month, year):
    
    # Handle out of bounds (earliest date is 10th Jan 2003)
    if month == 1 and year == 2003 : return 100
    
    # Find total orders for last month
    last = dfinal[(dfinal['MONTH_ID'] == month) & (dfinal['YEAR_ID'] == year)].SALES.sum()
        
    # Handle January condition (go back to Dec of previous year)
    if month == 1 : 
        month = 12
        year = year - 1    
    else:
        month = month - 1
        
    prev = dfinal[(dfinal['MONTH_ID'] == month) & (dfinal['YEAR_ID'] == year)].SALES.sum()
        
    return (last/prev) * 100;

In [83]:
# Test function
total_sales_lastvsprev_month(1, 2003)
total_sales_lastvsprev_month(2, 2003)
total_sales_lastvsprev_month(3, 2003)

123.90629141558006

In [85]:
# Print sales by month for 2003
num_bins = 12
x = list(range(1, 12))
n, bins, patches = plt.hist(total_sales_lastvsprev_month(x, 2003), num_bins, facecolor='blue', alpha=0.5)

ValueError: ('Lengths must match to compare', (2823,), (10,))