In [1]:
#Basic notebook with a few jupyter examples


# Metrics to calc

# Users that haven't purchased in a while
# Last purchase per user - any product

#Last purchase per user - each product
#  * somehow include no purchases - for ones that they have been assigned


In [2]:
import pandas as pd
import logging
from datetime import date, timedelta
import numpy as np
from datetime import date, datetime
import seaborn as sns


logging.basicConfig(level=logging.DEBUG, format="%(message)s")
LOGGER = logging.getLogger(__name__)
LOGGER.debug("log level = %s", str(LOGGER.getEffectiveLevel()))


log level = 10


In [3]:
DATA_PATH="../data/purchases.csv"
DATE_AS_OF=date.fromisoformat('2023-06-01') #Date for analysis
#For data visualization - date range sizes
DATE_GROUP_DAYS=30
CUTOFF_DAYS=90



REQUIRED_COLUMNS=['user', 'date','product', 'price']

#loist of recommended actions for different date thresholds
ACTION_DATES = dict(out_of_warranty=30, obsolete=90)


In [4]:

def clean_raw(df:pd.DataFrame, required_columns:[str])->pd.DataFrame:
    """
    Check and clean purchases dataframe.  
    :param df: 
    :return:    updated dataframe  or exception if issue. 
    """
    
    missing_cols = set(required_columns) - set(df.columns)
    if missing_cols:
        raise Exception(f"Invalid dataframe Missing required columns: {', '.join(missing_cols)}")
    
    #trim up - sometimes there are bad rows at the bottom of the csv. We use the first required column
    df = df.loc[df[required_columns[0]].notnull()]
    
    #Error if any of the cells are blanks
    if df.isnull().sum().sum():
        raise Exception(f"Invalid dataframe: some cells are empty")

    #convert dates
    df['date']  = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.date
     
    return df


In [5]:
df = pd.read_csv(DATA_PATH)
LOGGER.info("Loaded %d rows from CSV[%s]", len(df), DATA_PATH)
df.head()

Loaded 22 rows from CSV[../data/purchases.csv]


Unnamed: 0,user,date,product,price
0,A,2023-05-31,prod-a,17.0
1,B,2023-06-01,prod-a,
2,C,2023-05-01,prod-b,256.0
3,D,2023-06-01,prod-a,118.0
4,D,2023-05-15,prod-a,224.0


In [6]:
#EXAMPLE: of separate Lambda - this is overkill for this small task 
def to_caps(s:str) ->str: 
    return s.upper()

df['product'] = df['product'].apply(to_caps)
df.head()

Unnamed: 0,user,date,product,price
0,A,2023-05-31,PROD-A,17.0
1,B,2023-06-01,PROD-A,
2,C,2023-05-01,PROD-B,256.0
3,D,2023-06-01,PROD-A,118.0
4,D,2023-05-15,PROD-A,224.0


In [7]:
#EXAMPLE: Lambda of in-line lambda
df['product'] = df['product'].apply(lambda x:str(x).lower())
df.head()



Unnamed: 0,user,date,product,price
0,A,2023-05-31,prod-a,17.0
1,B,2023-06-01,prod-a,
2,C,2023-05-01,prod-b,256.0
3,D,2023-06-01,prod-a,118.0
4,D,2023-05-15,prod-a,224.0


## EXAMPLE: Convert a string date to a date type   

In [8]:
#EXAMPLE:   Convert a column to dates - this can also be done in df csv load. 
df['date']  = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.date


In [10]:

#EXAMPLE:  time processing: delta since some starting date. This makes some downstream calcs a little easier
df['days_ago'] = df['date'].apply(lambda x: (DATE_AS_OF - x).days)
df.head()


Unnamed: 0,user,date,product,price,days_ago
0,A,2023-05-31,prod-a,17.0,1
1,B,2023-06-01,prod-a,,0
2,C,2023-05-01,prod-b,256.0,31
3,D,2023-06-01,prod-a,118.0,0
4,D,2023-05-15,prod-a,224.0,17


## EXAMPLE: Days since MIN date.   use to normalize. 

In [16]:

LOGGER.info("Date-range min: %s", df['date'].min())
LOGGER.info("Date-range max: %s", df['date'].max())

dmin = df['date'].min()
df['days_since'] = df['date'].apply(lambda x: (x-dmin).days)

Date-range min: 2023-01-01
Date-range max: 2023-06-01


Unnamed: 0,user,date,product,price,days_ago,days_since
0,A,2023-05-31,prod-a,17.0,1,150
1,B,2023-06-01,prod-a,,0,151
2,C,2023-05-01,prod-b,256.0,31,120
3,D,2023-06-01,prod-a,118.0,0,151
4,D,2023-05-15,prod-a,224.0,17,134
5,A,2023-02-04,prod-d,939.0,117,34
6,B,2023-02-07,prod-b,57.0,114,37
7,C,2023-02-10,prod-a,323.0,111,40
8,D,2023-02-13,prod-d,949.0,108,43
9,A,2023-02-16,prod-c,120.0,105,46


In [13]:


# Count unique values in each column
for col in ['user', 'product']: 
    LOGGER.info("  # unique values in col: %s = %d",col, df_all[col].nunique())

LOGGER.info("Date-range min: %s", df_all['date'].min())
LOGGER.info("Date-range max: %s", df_all['date'].max())

TypeError: unsupported operand type(s) for -: 'datetime.date' and 'str'

In [None]:
df_all.head(5)

In [None]:
# Begin to count the recommend actions. 
#sort so the largest date value is first in dict.  i.e., we want to recommend the action with the largest date range
actions = {k: v for k, v in sorted(ACTION_DATES.items(), key=lambda item: item[1], reverse=True)} 
default='ok'


def determine_action(days:int) ->str: 
    for action, d_thresehold in actions.items():
        if days >= d_thresehold:
            return action
    return default

df_all['action'] = df_all['days_ago'].apply(determine_action)
df_all.head(5)

#Find the ones beyond the dates. 

In [None]:

# sort the actions by oldest


In [None]:
#####################################
#most recent purchases per user - any product
df_recent = df_all.sort_values(by='date', ascending=False)

# group the data by Name and get the first row of each group
df_most_recent_purchases = df_recent.groupby('user').first()

df_most_recent_purchases


In [None]:
#####################################
#most recent purchases per user - any product

df_recent = df_all.sort_values(by='date', ascending=False)

# group the data by Name and get the most recent purchase (first row) of each group
df_most_recent_purchase = df_recent.groupby(['User'],as_index=False).first()
df_most_recent_purchase.head(5)

In [None]:
#####################################
#most recent purchases per user - list all products

df_recent = df_all.sort_values(by='date', ascending=False)

# group the data by Name and get the most recent purchase (first row) of each group
df_most_recent_purchases_all = df_recent.groupby(['user', 'product'],as_index=False).first()
df_most_recent_purchases_all.head(15)

In [None]:
### Find purchases that are old - such as out of warranty date.
filtered_df = df_most_recent_purchases_all.loc[df_most_recent_purchases_all['date'] < '2023-02-01']


print(f"Num purchases out of warranty: {len(filtered_df)}")
filtered_df.head(5)


In [None]:
#####################################
# Last purchase per user per product
#
# Note: this doesn't handle case where a user never purchased
#####################################
#Generate blocks of cutoff days.    The first is the max.
dates = []
dateblocks = {}

dmin = df_all['date'].min().date()
print(f"dmin: Earliest entry from date: {dmin}")
d = DATE_AS_OF
ndays = 0
while d >= dmin:
 dates.insert(0,d)

 prevdate = d - timedelta(days=DATE_GROUP_DAYS)
 ndays_max=ndays + DATE_GROUP_DAYS-1
 dateblocks[ndays] = dict(ndays=ndays,
                          date=d,
                          prevdate=prevdate,
                          ndays_max = ndays_max,
                          label=f"{ndays} to {ndays_max} days")

 #Next block
 d = prevdate
 ndays += DATE_GROUP_DAYS

 #print(f"date slice: {d}")
 #print(f"dateblocks: {dateblocks}")

 # End up with a dict

 #values
 #* n days
 #* label
 #df (slice) of entries that match.
 #* be able to visualize.


 #<= 10 days:   15
 #11 <= 20 days  37
 #21

In [None]:
#dateblocks

In [None]:
df_all['days_since_login'] = df_all['date'].apply(lambda d: (DATE_AS_OF - d.date()).days)

def calc_date_block(ndays):

    for d in sorted(dateblocks.keys()):
        #print(f"calc_date_block d={str(d)}  v={dateblocks.get(d)}")
        v = dateblocks[d]
        if ndays <= v.get('ndays_max'):
            return v.get('label')
    return ('unknown')

df_all['days_since_label'] = df_all['days_since_login'].apply(calc_date_block)

df_all.head(5)

In [None]:
df_date_bands = df_all.groupby('days_since_label',as_index=False).count()
df_date_bands.head(5)

In [None]:


sns.barplot(x='days_since_label', y='User', data=df_date_bands) #hue='variable')
#plt.xticks(rotation=90)
#plt.ylabel('Returns')
#plt.title('Portfolio vs Benchmark Returns');

In [None]:


df = filtered_df
df

 Filter data between two dates
filtered_df = df.loc[(df['date'] >= '2020-09-01')
                     & (df['date'] < '2020-09-15')

In [None]:
df_all['date'].min()

In [None]:
my_dict = {'a': 3, 'b': 1, 'c': 2}
sorted_dict = {k: v for k, v in sorted(my_dict.items(), key=lambda item: item[1], reverse=True)}  # Sort by values
print(sorted_dict)  # Output: {'b': 1, 'c': 2, 'a': 3}
