# Online sales: building the analytical table

## Import modules

In [2]:
import os
import sys

import pandas as pd
import numpy as np
import datetime as dt

# Display the value of every top-level expression in a cell, not only the last one.
# Cells further down put several bare expressions in one cell and rely on this.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Show the full text of every cell instead of truncating long strings.
# None is the documented "no limit" value for display.max_colwidth; an integer
# is interpreted as a maximum width in characters.
pd.set_option('display.max_colwidth', None)

# Show every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# To show warnings only once:
import warnings; warnings.filterwarnings(action='once')

# Make the project-local modules in ../tools/ importable.
# NOTE: the path is relative, so it is resolved against the kernel's current
# working directory; the notebook must be run from its own folder.
sys.path.insert(1, '../tools/')
import viztools as vt
import helpers as hp

viztools.py is being imported into module
helpers.py is being imported into module


## Load processed datasets

In [3]:
# Folder holding the processed datasets, followed by the dataset file names.
path_to_processed_data = "../data/processed/"
file_sales, file_calendar_tbl = "online_sales_details.csv", "calendar_tbl.csv"

Online sales details

In [4]:
# Read the processed sales file and convert the transaction date to a datetime column.
sales = (
    pd.read_csv(os.path.join(path_to_processed_data, file_sales))
    .assign(transaction_date=lambda frame: pd.to_datetime(frame['transaction_date']))
)

# Work on a copy so the frame as loaded stays available unchanged.
df_sales = sales.copy()

Analytical table creation

In [5]:
# Preview the first five rows of the loaded sales data.
df_sales.head()

Unnamed: 0,customer_id,transaction_id,transaction_date,product_sku,product_description,product_category,quantity,price,delivery_charges,coupon_status,total_price
0,17850,1785016679,2019-01-01,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainless Steel,Nest-USA,1,153.71,6.5,Used,153.71
1,17850,1785016680,2019-01-01,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainless Steel,Nest-USA,1,153.71,6.5,Used,153.71
2,17850,1785016681,2019-01-01,GGOEGFKQ020399,Google Laptop and Cell Phone Stickers,Office,1,2.05,6.5,Used,2.05
3,17850,1785016682,2019-01-01,GGOEGAAB010516,Google Men's 100% Cotton Short Sleeve Hero Tee Black,Apparel,5,17.53,6.5,Not Used,87.65
4,17850,1785016682,2019-01-01,GGOEGBJL013999,Google Canvas Tote Natural/Navy,Bags,1,16.5,6.5,Used,16.5


In [6]:
# Collapse the sales rows to one row per (transaction_date, customer_id):
# distinct transactions, distinct product categories, total quantity and total price.
at_sales = (
    df_sales
    .groupby(['transaction_date', 'customer_id'])
    .agg(
        transactions_num=('transaction_id', 'nunique'),
        product_category_num=('product_category', 'nunique'),
        quantity_sum=('quantity', 'sum'),
        total_price_sum=('total_price', 'sum'),
    )
    # Turn the group keys back into ordinary columns.
    .reset_index()
    .sort_values(by='transaction_date', ascending=True)
)

Validate data after aggregating

In [7]:
# Sanity-check the aggregation: each total must be the same before and after grouping.
# The value pairs are still displayed for inspection; the assertions make a mismatch
# stop the run instead of depending on someone comparing the printed numbers.
df_sales['total_price'].sum()
at_sales['total_price_sum'].sum()
# Float sums are accumulated in a different order after grouping, so compare with a
# tight relative tolerance rather than exact equality.
assert np.isclose(
    df_sales['total_price'].sum(),
    at_sales['total_price_sum'].sum(),
    rtol=1e-9,
    atol=0.0,
), 'total_price does not reconcile after aggregation'

df_sales['quantity'].sum()
at_sales['quantity_sum'].sum()
assert df_sales['quantity'].sum() == at_sales['quantity_sum'].sum(), \
    'quantity does not reconcile after aggregation'

# A transaction_id appearing under more than one (date, customer) group would be
# counted once per group and inflate the aggregated sum.
df_sales['transaction_id'].nunique()
at_sales['transactions_num'].sum()
assert df_sales['transaction_id'].nunique() == at_sales['transactions_num'].sum(), \
    'transaction count does not reconcile after aggregation'

4670794.62

4670794.62

238033

238033

26631

26631

In [8]:
at_sales.info() # check column dtypes and non-null counts of the aggregated table

<class 'pandas.core.frame.DataFrame'>
Index: 3208 entries, 0 to 3207
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   transaction_date      3208 non-null   datetime64[ns]
 1   customer_id           3208 non-null   int64         
 2   transactions_num      3208 non-null   int64         
 3   product_category_num  3208 non-null   int64         
 4   quantity_sum          3208 non-null   int64         
 5   total_price_sum       3208 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 175.4 KB


Add the first and the last transaction dates

In [9]:
# Broadcast each customer's earliest and latest transaction date onto all of that
# customer's rows (transform keeps the original row count and index).
dates_per_customer = at_sales.groupby('customer_id')['transaction_date']
at_sales['first_transaction_date'] = dates_per_customer.transform('min')
at_sales['last_transaction_date'] = dates_per_customer.transform('max')

In [10]:
# Spot-check a few random rows. The fixed seed keeps the displayed sample identical
# across kernel restarts, so the saved output can be reproduced.
at_sales.sample(5, random_state=42)

Unnamed: 0,transaction_date,customer_id,transactions_num,product_category_num,quantity_sum,total_price_sum,first_transaction_date,last_transaction_date
1362,2019-06-11,16782,5,6,11,548.7,2019-03-18,2019-12-04
1026,2019-05-05,12753,18,6,131,1839.31,2019-01-08,2019-05-05
875,2019-04-18,15311,2,1,337,13667.63,2019-02-01,2019-12-19
1478,2019-06-21,13496,1,1,1,58.79,2019-03-06,2019-06-21
1171,2019-05-21,17576,4,4,17,320.6,2019-04-20,2019-09-19


Add indicators of the first and the last customer activities

In [11]:
# Integer flags: 1 on the row whose date is the customer's first / last transaction
# date, 0 on every other row. Multiplying the boolean result by 1 yields integers.
at_sales['is_new_user'] = at_sales['transaction_date'].eq(at_sales['first_transaction_date']).mul(1)
at_sales['is_last_purchase'] = at_sales['transaction_date'].eq(at_sales['last_transaction_date']).mul(1)

In [12]:
# Show one customer's rows in date order to check the date columns and flags by eye.
at_sales.query('customer_id == 14713').sort_values(by='transaction_date')

Unnamed: 0,transaction_date,customer_id,transactions_num,product_category_num,quantity_sum,total_price_sum,first_transaction_date,last_transaction_date,is_new_user,is_last_purchase
929,2019-04-21,14713,14,5,54,2561.27,2019-04-21,2019-09-30,1,0
1503,2019-06-23,14713,12,6,56,1293.57,2019-04-21,2019-09-30,0,0
2414,2019-09-30,14713,23,6,109,3452.09,2019-04-21,2019-09-30,0,1


Add new columns of averages

Generally, an order may encompass one or more transactions. Each transaction is a record of an action that is taken on an order.
However, the datasets do not include an order ID.

Therefore, we will consider the average transaction cost, the average quantity of items per transaction, etc.

In [14]:
# TODO: add the per-transaction averages (cost, quantity of items) - possibly after aggregation

Save analytical table

In [16]:
# Hand the analytical table to the project helper together with the target folder
# and file name. hp.write_df_to_csv is defined outside this notebook; presumably it
# writes the frame to <path_to_save>/<file_at_sales> - confirm in helpers.py.
file_at_sales = 'at_sales.csv'
path_to_save = '../data/processed/'
hp.write_df_to_csv(at_sales, path_to_save, file_at_sales)