In [130]:
# Import Libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functions import *
%matplotlib inline

## Load Datasets from CSV Files

In [131]:
# Read the yearly sales exports and the Phoenix temperature tables from CSV
pathfile = './datasets/'
data_years = ['2018', '2019', '2020', '2021']


# Load the yearly sales files: 2017 seeds the frame, then the years listed in
# data_years (2018-2021) are appended. File names follow 'y_<year>.csv'.
# `infer_datetime_format` is intentionally not passed: it only has an effect
# together with `parse_dates` (not used here) and is deprecated since pandas 2.0.
filename = 'y_'
df_sales = pd.read_csv(pathfile + filename + '2017.csv', encoding = 'latin-1')
print('Initiall Shape df_sales: {}, year: 2017'.format(df_sales.shape) )

for year in data_years:
    # Read one year and stack it under the rows loaded so far
    df = pd.read_csv(pathfile + filename + year + '.csv', encoding = 'latin-1')
    df_sales = pd.concat([df_sales, df], ignore_index=True)
    print('Shapes df: {}, df_sales: {}, year: {}'.format(df.shape, df_sales.shape, year) )

# Read dataset with Minimum Phoenix Temperatures (tab-separated file)
df_min_temp = pd.read_csv(pathfile + 'phx_min_temp.csv', sep='\t', encoding = 'utf-8')

# Read dataset with Maximum Phoenix Temperatures (tab-separated file)
df_max_temp = pd.read_csv(pathfile + 'phx_max_temp.csv', sep='\t', encoding = 'utf-8')

# Placeholder; not used by any of the cells visible in this notebook
df_tmp = None

Initiall Shape df_sales: (25030, 11), year: 2017
Shapes df: (27331, 11), df_sales: (52361, 11), year: 2018
Shapes df: (27993, 11), df_sales: (80354, 11), year: 2019
Shapes df: (26658, 11), df_sales: (107012, 11), year: 2020
Shapes df: (9495, 11), df_sales: (116507, 11), year: 2021


In [132]:
# Preview the first rows of the combined raw sales frame
df_sales.head()

Unnamed: 0.1,Unnamed: 0,Type,Date,Num,Memo,Name,Item,Qty,Sales Price,Amount,Balance
0,Inventory,,,,,,,,,,
1,00-Beer & Spirits,,,,,,,,,,
2,"00177USA - R2 Casino Azul Blanc (Casino Azul, ...",,,,,,,,,,
3,,Invoice,01/18/2017,70229.0,"Casino Azul, Tequila Blanco",Total Wine 1006,00-Beer & Spirits:00177USA - R2 Casino Azul Bl...,6.0,29.95,179.7,179.7
4,,Invoice,01/18/2017,70233.0,"Casino Azul, Tequila Blanco",Total Wine 1004,00-Beer & Spirits:00177USA - R2 Casino Azul Bl...,6.0,29.95,179.7,359.4


## Data Preprocessing

In [133]:
# Product codes of interest for this project, mapped to their descriptions
target_items = {
    '60190': 'Dipinti, Pinot Grigio La Vis',
    '70270': 'Le Contesse, Prosecchino Brut, 187ml',
    '20209': 'Alverdi, Pinot Grigio',
    '70165': 'Santome, Prosecco Extra Dry',
    '50215': 'Carpineto, Dogajolo Rosso',
    '70208': 'Italo Cescon, Pinot Grigio',
    '70271': 'Le Contesse, Pinot Noir Rose Brut',
}

# Hand the raw frame and the list of target codes to clean_sales
# (imported from the project's `functions` module) and preview the result
target_codes = list(target_items)
df_sales = clean_sales(df_sales, target_codes)
df_sales.head()

Final Dataset Shape: (9214, 5)


Unnamed: 0,quantity,price,item_code,month,year
5623,7.0,6.99,20209,1,2017
5624,12.0,6.99,20209,1,2017
5625,12.0,6.99,20209,1,2017
5626,12.0,6.99,20209,1,2017
5627,6.0,6.99,20209,1,2017


Check for NaN values

In [134]:
# Count the missing values in every column
null_counts = df_sales.isna().sum()
print('Null values:\n{}'.format(null_counts))

# Show the rows whose price is missing
missing_price_rows = df_sales.loc[df_sales['price'].isna()]
print(missing_price_rows)

Null values:
quantity     0
price        1
item_code    0
month        0
year         0
dtype: int64
        quantity  price item_code  month  year
100379       2.0    NaN     70271      5  2020


Impute the item's average price for the one row with a NaN price

In [135]:
# Fill each missing price with the mean price of the same item_code.
# The rows are selected with a null mask rather than a hard-coded row label,
# so the cell keeps working if the input files or the row order change.
missing_price = df_sales['price'].isnull()
# Per-row mean price of the row's own item (NaN prices are ignored by mean)
item_mean_price = df_sales.groupby('item_code')['price'].transform('mean')
df_sales.loc[missing_price, 'price'] = item_mean_price[missing_price]
print('Null values:\n{}'.format(df_sales.isnull().sum())) # Display Null values per Column

Null values:
quantity     0
price        0
item_code    0
month        0
year         0
dtype: int64


Create Time Series for each product by Month

In [136]:
# Aggregate the transactions into one row per item and calendar month:
# total quantity sold and average price.
df = df_sales.groupby(by=['item_code', 'year', 'month'], as_index=False).agg(
    quantity=('quantity', 'sum'),
    # String alias instead of np.mean: same result, and avoids the pandas
    # FutureWarning about passing NumPy callables to agg
    avg_price=('price', 'mean'),
)
# Build a month-start date from year and month (day fixed to 1)
df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))
# The month is now carried by `date`; `year` is kept as a column
df = df.drop(columns=['month'])
df.head()

Unnamed: 0,item_code,year,quantity,avg_price,date
0,20209,2017,109.0,6.99,2017-01-01
1,20209,2017,136.0,7.144666,2017-02-01
2,20209,2017,129.0,6.99,2017-03-01
3,20209,2017,190.0,6.99,2017-04-01
4,20209,2017,172.0,6.99,2017-05-01


Check the number of Data Points per Item

In [137]:
# Report how many monthly rows each target product has in the aggregated frame
for item, description in target_items.items():
    item_dates = df.loc[df['item_code'] == item, 'date']
    print('Item: {}, Data Points:{}, {}'.format(item, item_dates.count(), description))

Item: 60190, Data Points:37, Dipinti, Pinot Grigio La Vis
Item: 70270, Data Points:48, Le Contesse, Prosecchino Brut, 187ml
Item: 20209, Data Points:52, Alverdi, Pinot Grigio
Item: 70165, Data Points:52, Santome, Prosecco Extra Dry
Item: 50215, Data Points:52, Carpineto, Dogajolo Rosso
Item: 70208, Data Points:52, Italo Cescon, Pinot Grigio
Item: 70271, Data Points:43, Le Contesse, Pinot Noir Rose Brut


In [138]:
# April 2021 is incomplete, so its rows are removed from the monthly frame (in place)
is_incomplete_month = df['date'] == '2021-04-01'
df.drop(index=df.index[is_incomplete_month], inplace=True)
df.shape

(329, 5)

## Additional Features

The next section adds 3 new features to the dataset:
- Phoenix Minimum Temperature
- Phoenix Maximum Temperature
- Holidays

In [139]:
import calendar
import datetime


def extract_temperature(df_temperature, new_colum_name):
    '''
    Reshape a wide temperature table into one row per (year, month).

    Parameters
    ----------
    df_temperature : pandas.DataFrame
        Wide table whose first column holds the years (named 'Year') and whose
        remaining columns are named with month abbreviations as found in
        calendar.month_abbr ('Jan', 'Feb', ...).
    new_colum_name : str
        Name of the temperature column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Columns 'date' and `new_colum_name`. 'date' is a string built as
        '<year>-<month number>-01' (month not zero-padded). Rows are ordered
        month by month, and within each month in the input's year order.

    Raises
    ------
    KeyError
        If a column other than 'Year' is not a month abbreviation.
    '''
    output_columns = ['date', new_colum_name]
    if df_temperature.shape[1] == 0:
        # No columns at all: nothing to reshape
        return pd.DataFrame(columns=output_columns)

    # Map 'Jan' -> 1, ..., 'Dec' -> 12 (index 0 of month_abbr is an empty string)
    str_month_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num}
    # Years come from the first column, read directly so they keep their own dtype
    years = df_temperature.iloc[:, 0].tolist()

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for column in df_temperature.columns:
        if column == 'Year':
            continue
        month = str(str_month_to_num[column])
        for year, temp in zip(years, df_temperature[column].tolist()):
            records.append({'date': str(year) + '-' + month + '-' + '01',
                            new_colum_name: temp})
    return pd.DataFrame(records, columns=output_columns)
# Preview the reshaped maximum-temperature table (the result is displayed, not stored)
extract_temperature(df_max_temp, new_colum_name='max_temp')

Unnamed: 0,date,max_temp
0,2015-1-01,81
1,2016-1-01,76
2,2017-1-01,76
3,2018-1-01,83
4,2019-1-01,74
...,...,...
79,2017-12-01,84
80,2018-12-01,75
81,2019-12-01,75
82,2020-12-01,80


In [21]:
# Subset data points to be used for each time series
# For this project, I will be using 36 observations for training and 3 for testing:
#   training: 2018-01 .. 2020-12 (36 monthly points)
#   testing:  2021-01 onwards (3 monthly points once April 2021 has been dropped)

test_filter = (df['date']>='2021-01-01')
# Exclude the test months so the training window ends at 2020-12 and the
# two subsets do not overlap
training_filter = (df['date']>='2018-01-01') & ~test_filter

1

In [None]:
# Display the monthly rows of item 60190
df.loc[df['item_code'] == '60190']

1.B Visualization by Product

In [None]:
# Plot the monthly quantity sold for one product.
# `item` must be one of the keys of `target_items`.
item = '70270'
# Read from the monthly frame `df` (item_code / date / quantity columns).
# `x` is only assigned in later cells, so using it here fails on a fresh
# top-to-bottom run.
item_set = df[df['item_code']==item]

fig, ax = plt.subplots()

ax.plot(item_set['date'], item_set['quantity'])
ax.set_title(str(target_items[item]))
ax.set_xlabel('Time')
ax.set_ylabel('Sold Btls');  # trailing ';' hides the Text repr in the cell output

In [None]:
# Show the description stored in target_items for the currently selected code
target_items[item]

In [None]:
# df_sales.set_index('date', inplace=True) # Set 'date' as index

In [None]:

# Isolate the transactions of item 20209 (listed in target_items as 'Alverdi, Pinot Grigio')
is_alverdi = df_sales['item_code'] == '20209'
alverdi = df_sales.loc[is_alverdi]
print(alverdi.shape)

In [None]:
# Preview the first rows of the item-20209 subset
alverdi.head()

In [None]:
# Group the item-20209 rows by product code, report the number of groups,
# then display the first rows of each group
x = alverdi.groupby(by='item_code')
n_groups = len(x)
print(n_groups)
x.head()


In [None]:
# Total quantity of the item-20209 rows per calendar month.
# pd.Grouper(freq='M') groups on a datetime index, but `alverdi` keeps the
# integer row labels of df_sales (the set_index('date') call is commented
# out), so the existing year/month columns are used as grouping keys instead.
alverdi.groupby(['year', 'month'])['quantity'].sum()

In [None]:
# NOTE(review): this looks like leftover exploration. If the cells run in
# order, `x` is the groupby object built from `alverdi`, and the data previews
# show df_sales with integer row labels rather than dates, so a monthly
# resample('M') has no datetime axis to work on — confirm the intent
# (monthly sum and row count?) before relying on this cell.
# TODO(review): check that the `how=` keyword is accepted by the installed pandas.
x =  x['item_code'].resample('M', how=[np.sum, len])
x.head()

In [None]:
# Aggregate all cleaned sales rows by calendar month number: total quantity
# and the smallest item_code seen in that month.
# df_sales has integer row labels (no datetime index), so the existing
# `month` column is the grouping key rather than `df_sales.index.month`.
x = df_sales.groupby(by='month').agg({'quantity':'sum','item_code':'min'})

# Keep only the months whose smallest item_code is '20209'
x = x[x['item_code']=='20209']

x.head(20)
# target_items = ['60190','70270','20209','70165','50215','70208','70271']

In [None]:
# Preview the first rows of df_sales in its current state
df_sales.head()

In [None]:
# Apply Date filter to select only the time period of interest
# df_sales_2021[(df_sales_2021['date']>'2020/01/01') & (df_sales_2021['date']<'2020/06/01')]

In [None]:
# Print the (rows, columns) shape of the monthly frame
print(df.shape)
# df.query('item_code == "60190"').count()

In [None]:

#     df.set_index('date', inplace=True) # Set 'date' as index
#     df = df.loc[start_date:end_date] # filter data points by dates
#     print('shape after index: {}'.format(df.shape))
# Write df_sales to 'sales.xlsx' (relative path, so it lands in the working directory)
df_sales.to_excel('sales.xlsx')

In [None]:
# NOTE(review): `df_sales_2021` is not assigned in any of the cells shown, and
# the df_sales preview has no `item_name` column — presumably left over from an
# earlier version of the notebook; confirm before running.
# As written: per item_code, sum `quantity` and take the max of `item_name`.
df_sales_2021.groupby(['item_code'], as_index=False).agg({'quantity':'sum','item_name':'max'})
# target_items = ['60190','70270','20209','70165','50215','70208','70271']