In [1]:
# Import Libraries and modules
import pandas as pd
import numpy as np

In [22]:
# Read CSV files containing sales transactions and temperatures
pathfile = './datasets/'

# Load datasets with sales between 2014 and 2019
df_sales_2019 = pd.read_csv(pathfile + 'sales_2014_to_2019.csv',
                       sep=',', 
                       infer_datetime_format = True,
                       encoding = 'latin-1'
                      )
# Load datasets with sales between 2020 and 2021
df_sales_2021 = pd.read_csv(pathfile + 'sales_2020_to_2021.csv',
                       sep=',', 
                       infer_datetime_format = True,
                       encoding = 'latin-1'
                      )

# Read dataset with Minimum Phoenix Temperatures
df_min_temp = pd.read_csv(pathfile + 'phx_min_temp.csv',
                       sep=',', 
                       infer_datetime_format = True,
                       encoding = 'latin-1'
                      )

# Read dataset with Maximum Phoenix Temperatures
df_max_temp = pd.read_csv(pathfile + 'phx_max_temp.csv',
                       sep=',', 
                       infer_datetime_format = True,
                       encoding = 'latin-1'
                      )


In [23]:
df_sales_2019.head()

Unnamed: 0.1,Unnamed: 0,Type,Date,Num,Memo,Name,Item,Qty,Sales Price,Amount,Balance
0,Inventory,,,,,,,,,,
1,00-Beer & Spirits,,,,,,,,,,
2,"00101LAI - P3 Amarett Da Vinci (Da Vinci, Amar...",,,,,,,,,,
3,,Invoice,04/29/2014,51426.0,"Da Vinci, Amaretto Regency",Veneto Trattoria,00-Beer & Spirits:00101LAI - P3 Amarett Da Vin...,2.0,6.96,13.92,13.92
4,,Invoice,12/02/2014,54994.0,"Da Vinci, Amaretto Regency",Veneto Trattoria,00-Beer & Spirits:00101LAI - P3 Amarett Da Vin...,6.0,6.96,41.76,55.68


In [19]:
def clean_sales(df_sales, target_items):
    '''
    This function is intended to perform cleaning, formating and filtering operations to sales transaction datasets.
    Parameters:
    'df_sales' : (Pandas DataFrame) Dataset contains sales transactions
    'target_items': (Dictionary) Contains the items of interest for the analysis    
    '''
    df = df_sales.copy()
    # Rename Columns
    column_names = ['del1',
                'type',
                'date',
                'inv_num',
                'item_name',
                'customer',
                'item_description',
                'quantity',
                'price',
                'amount',
                'del2'
               ]
    df.columns = column_names
    
    df.drop(['del1','del2'], axis = 1, inplace = True) # drop unuseful columns 
    df.dropna(axis = 0, thresh=5, inplace = True) # Drop rows with NaN values
    
    # Filter all transaction different than Samples
    df = df[~df['customer'].str.lower().str.contains('sample')]
    
    # Extract item_code from item_description
    df['item_code'] = df['item_description'].str.split(expand=True)[0]
    df.drop(['item_description'], axis = 1, inplace = True) # Drop item_description column
    df = df[df['item_code'].isin(target_items)] # Filter interesting items for Analysis
    
    df['date'] = pd.to_datetime(df['date'], format= '%m/%d/%Y')
    print('Shape: {}'.format(df.shape))
    
    return df

In [20]:
# target_items = ['60190','70270','20209','70165','50215','70208','70271']
target_items = {'60190':'Dipinti, Pinot Grigio La Vis',
                '70270':'Le Contesse, Prosecchino Brut, 187ml',
                '20209':'Alverdi, Pinot Grigio',
                '70165':'Santome, Prosecco Extra Dry',
                '50215':'Carpineto, Dogajolo Rosso',
                '70208':'Italo Cescon, Pinot Grigio',
                '70271':'Le Contesse, Pinot Noir Rose Brut'
               }
df_sales_2021 = clean_sales(df_sales_2021, list(target_items.keys()))
df_sales_2019 = clean_sales(df_sales_2019, list(target_items.keys()))

Shape: (3666, 9)
Shape: (0, 9)


In [14]:
# Apply Date filter to select only the time period of interest
df_tmp[(df_tmp['date']>'2020/01/01') & (df_tmp['date']<'2020/06/01')]

Unnamed: 0,type,date,inv_num,item_name,customer,quantity,price,amount,item_code
5873,Invoice,2020-01-06,92915.0,"Alverdi, Pinot Grigio 2018",AJ's Fine Foods #159-Camelback,14.0,7.49,104.86,20209
5874,Invoice,2020-01-06,92918.0,"Alverdi, Pinot Grigio 2018",Pita Jungle Shea Scottsdale,24.0,6.99,167.76,20209
5875,Invoice,2020-01-06,92927.0,"Alverdi, Pinot Grigio 2018",Corner On the Market,3.0,7.49,22.47,20209
5876,Invoice,2020-01-06,92932.0,"Alverdi, Pinot Grigio 2018",Pita Jungle 7th St Uptown,12.0,6.99,83.88,20209
5877,Invoice,2020-01-07,92936.0,"Alverdi, Pinot Grigio 2018",Pita Jungle - Val Vista/Mesa,12.0,6.99,83.88,20209
...,...,...,...,...,...,...,...,...,...
26401,Invoice,2020-05-27,91818.0,"Le Contesse, Pinot Noir Rose Brut, NV",Cibo,2.0,0.00,0.00,70271
26402,Invoice,2020-05-27,91826.0,"Le Contesse, Pinot Noir Rose Brut, NV",My Wine Cellar1,5.0,10.99,54.95,70271
26403,Invoice,2020-05-27,91826.0,"Le Contesse, Pinot Noir Rose Brut, NV",My Wine Cellar1,1.0,0.00,0.00,70271
26404,Invoice,2020-05-28,91845.0,"Le Contesse, Pinot Noir Rose Brut, NV",Food and Things,5.0,10.99,54.95,70271


In [318]:

#     df.set_index('date', inplace=True) # Set 'date' as index
#     df = df.loc[start_date:end_date] # filter data points by dates
#     print('shape after index: {}'.format(df.shape))
df_sales_2021.to_excel('sales.xlsx')

In [329]:
print(df.shape)
# df.query('item_code == "60190"').count()

(3666, 9)


In [294]:
df1 = (df_sales_2021.groupby(['item_code','item_name'], as_index=False).agg({'quantity':'sum'})).sort_values('quantity',
                                                                                                       ascending=False
                                                                                                      )
df1.head(20)
# target_items = ['60190','70270','20209','70165','50215','70208','70271']

Unnamed: 0,item_code,item_name,quantity
730,50345,"Villa Cafaggio, Chianti Classico Basilica Cafa...",15120.0
2,107,"Menabrea, Microbrew Bionda",14481.0
1065,80334,"Les Caves de Landiras, Grandial Sparkling Blan...",10824.0
961,70270,"Le Contesse, Prosecchino Brut, 187ml",7674.0
318,30287,"Bouchon, Chardonnay, UNOAKED, 2018",6540.0
1,106,"Menabrea, Microbrew Ambrata",6246.0
324,30289,"Bouchon, Cabernet Sauvignon, 2018",5843.0
264,20209,"Alverdi, Pinot Grigio, 2019",5397.0
895,70165,"Santome, Prosecco Extra Dry, N.V.",5349.0
855,60190,"Dipinti, Pinot Grigio, La Vis 2018",5324.0


In [317]:
df_sales_2021.groupby(['item_code'], as_index=False).agg({'quantity':'sum','item_name':'max'})
# target_items = ['60190','70270','20209','70165','50215','70208','70271']

Unnamed: 0,item_code,quantity,item_name
0,20209,7109.0,"Alverdi, Pinot Grigio, 2019"
1,50215,5313.0,"Poderi Einaudi Barolo ""Costa Grimaldi"" 2012"
2,60190,6958.0,"Dipinti, Pinot Grigio, La Vis 2018 (Mich)"
3,70165,6240.0,"Zabu, Nero d' Avola, 2018"
4,70208,5916.0,"Marcarini, Barolo ""La Serra"" 2015"
5,70270,10368.0,"Le Contesse, Prosecchino Brut, 187ml"
6,70271,4208.0,"Le Contesse, Pinot Noir Rose Brut, NV (mich)"
