In [66]:
#Basic notebook with a few jupyter examples - use case is calc out of date warranties
# Load and process a CSV

# Metrics to calc
 # number of product purchases that are either out of warrenty or obsolete
# Users that haven't purchased in a while
# Last purchase per user - any product

#Last purchase per user - each product
#  * somehow include no purchases - for ones that they have been assigned


In [67]:
import pandas as pd
import logging
from datetime import date, timedelta
import numpy as np
import seaborn as sns

logging.basicConfig(level=logging.DEBUG, format="%(message)s")
LOGGER = logging.getLogger(__name__)
LOGGER.debug("log level = %s", str(LOGGER.getEffectiveLevel()))


log level = 10


In [68]:
# Params - Moved these to command line or env in final version 
DATA_PATH="./data/purchases.csv"
DATE_AS_OF=date.fromisoformat('2023-06-01') #Date for analysis
#For data visualization - date range sizes
DATE_GROUP_DAYS=30
CUTOFF_DAYS=90


REQUIRED_COLUMNS=['user', 'date','product', 'price']

#loist of recommended actions for different date thresholds
ACTION_DATES = dict(out_of_warranty=30, obsolete=90)


In [69]:

def clean_raw(df:pd.DataFrame, required_columns:[str])->pd.DataFrame:
    """
    Check and clean purchases dataframe. Returns cleaned df or throws exception
    :param df:   raw dataframe.   
    :param required_columns:   List of expected columns in csv.   Any others can be deleted. 
    :return:    updated dataframe  or exception if any issue. 
    """
    
    missing_cols = set(required_columns) - set(df.columns)
    if missing_cols:
        raise Exception(f"Invalid CSV. Missing required columns: {', '.join(missing_cols)}")
    
    #Blank check - in case CSV had blank or incomplete rows at bottom. We use the first required column to looks for nulls.      #Error if any of the cells are blanks
    df = df.loc[df[required_columns[0]].notnull()]
    if df.isnull().sum().sum():
        raise Exception(f"Invalid dataframe: some cells are empty")

    return df


In [70]:
df_all = pd.read_csv(DATA_PATH)

df_all=clean_raw(df_all, required_columns=REQUIRED_COLUMNS)
#fixup convert 8601 strings to dates
df_all['date']  = pd.to_datetime(df_all['date'], format='%Y-%m-%d').dt.date
     
LOGGER.info("Loaded %d clean rows from CSV[%s]", len(df_all), DATA_PATH)

#Add number of days since date.   Makes some downstream calcs a little easier
df_all['days_ago'] = df_all['date'].apply(lambda x: (DATE_AS_OF - x).days)

# Count unique values in each column
for col in ['user', 'product']: 
    LOGGER.info("  # unique values in col: %s = %d",col, df_all[col].nunique())

LOGGER.info("Date-range min: %s", df_all['date'].min())
LOGGER.info("Date-range max: %s", df_all['date'].max())


Exception: Invalid dataframe: some cells are empty

In [71]:
df_all = pd.read_csv(DATA_PATH)

In [72]:
df = df_all

df.head(5)

df.isnull().sum()

empty_idx=df[df.eq('')].index

df.isnull().sum().sum()

print(empty_idx)

# Print locations of null cells
print("Null cells:")
for row, col in zip(*df.isnull().values.nonzero()):
    print(f"Row {row + 1}, Column {col + 1}")

x = df[df.eq('')]
# Print locations of empty strings
print("\nEmpty strings:")
for row in df[df.eq('')].index:
    print(f"Row {row + 1}")

# Print locations of spaces or nulls
print("\nSpaces or nulls:")
for row, col in zip(*df.strip().isnull().values.nonzero()):
    print(f"Row {row + 1}, Column {col + 1}")
    
dbg = 12

RangeIndex(start=0, stop=21, step=1)
Null cells:
Row 2, Column 4

Empty strings:
Row 1
Row 2
Row 3
Row 4
Row 5
Row 6
Row 7
Row 8
Row 9
Row 10
Row 11
Row 12
Row 13
Row 14
Row 15
Row 16
Row 17
Row 18
Row 19
Row 20
Row 21

Spaces or nulls:


AttributeError: 'DataFrame' object has no attribute 'strip'

In [73]:
# Begin to count the recommend actions. 
#sort so the largest date value is first in dict.  i.e., we want to recommend the action with the largest date range
actions = {k: v for k, v in sorted(ACTION_DATES.items(), key=lambda item: item[1], reverse=True)} 
default='ok'


def determine_action(days:int) ->str: 
    for action, d_thresehold in actions.items():
        if days >= d_thresehold:
            return action
    return default

df_all['action'] = df_all['days_ago'].apply(determine_action)
df_all.head(5)

#Find the ones beyond the dates. 

KeyError: 'days_ago'

In [None]:

# sort the actions by oldest


In [None]:
#####################################
#most recent purchases per user - any product
df_recent = df_all.sort_values(by='date', ascending=False)

# group the data by Name and get the first row of each group
df_most_recent_purchases = df_recent.groupby('user').first()

df_most_recent_purchases


In [None]:
#####################################
#most recent purchases per user - any product

df_recent = df_all.sort_values(by='date', ascending=False)

# group the data by Name and get the most recent purchase (first row) of each group
df_most_recent_purchase = df_recent.groupby(['user'],as_index=False).first()
df_most_recent_purchase.head(5)

In [None]:
#####################################
#most recent purchases per user - list all products

df_recent = df_all.sort_values(by='date', ascending=False)

# group the data by Name and get the most recent purchase (first row) of each group
df_most_recent_purchases_all = df_recent.groupby(['user', 'product'],as_index=False).first()
df_most_recent_purchases_all.head(15)

In [None]:
### Find purchases that are old - such as out of warranty date.
filtered_df = df_most_recent_purchases_all.loc[df_most_recent_purchases_all['date'] < '2023-02-01']


print(f"Num purchases out of warranty: {len(filtered_df)}")
filtered_df.head(5)


In [None]:
#####################################
# Last purchase per user per product
#
# Note: this doesn't handle case where a user never purchased
#####################################
#Generate blocks of cutoff days.    The first is the max.
dates = []
dateblocks = {}

#dmin = df_all['date'].min().date()
dmin = df_all['date'].min()
print(f"dmin: Earliest entry from date: {dmin}")
d = DATE_AS_OF
ndays = 0
while d >= dmin:
 dates.insert(0,d)

 prevdate = d - timedelta(days=DATE_GROUP_DAYS)
 ndays_max=ndays + DATE_GROUP_DAYS-1
 dateblocks[ndays] = dict(ndays=ndays,
                          date=d,
                          prevdate=prevdate,
                          ndays_max = ndays_max,
                          label=f"{ndays} to {ndays_max} days")

 #Next block
 d = prevdate
 ndays += DATE_GROUP_DAYS

 #print(f"date slice: {d}")
 #print(f"dateblocks: {dateblocks}")

 # End up with a dict

 #values
 #* n days
 #* label
 #df (slice) of entries that match.
 #* be able to visualize.


 #<= 10 days:   15
 #11 <= 20 days  37
 #21

In [None]:
#dateblocks

In [None]:
#df_all['days_since_login'] = df_all['date'].apply(lambda d: (DATE_AS_OF - d.date()).days)

def calc_date_block(ndays):

    for d in sorted(dateblocks.keys()):
        #print(f"calc_date_block d={str(d)}  v={dateblocks.get(d)}")
        v = dateblocks[d]
        if ndays <= v.get('ndays_max'):
            return v.get('label')
    return ('unknown')

df_all['days_since_label'] = df_all['days_ago'].apply(calc_date_block)

df_all.head(5)

In [None]:
df_date_bands = df_all.groupby('days_since_label',as_index=False).count()
df_date_bands.head(5)

In [None]:


sns.barplot(x='days_since_label', y='user', data=df_date_bands) #hue='variable')
#plt.xticks(rotation=90)
#plt.ylabel('Returns')
#plt.title('Portfolio vs Benchmark Returns');

Unnamed: 0,A,B,C
0,EMpty,whitespace,None and empty
1,,xx,
2,3,6,9
3,4,,


In [55]:
def trim_all(df:pd.DataFrame):
    df_trimmed = df.map(lambda x: x.strip() if isinstance(x, str) else x)
    return df_trimmed

In [56]:
def find_blanks(df):
    # Print locations of null cells
    print("Null cells:")
    for row, col in zip(*df.isnull().values.nonzero()):
        print(f"Row {row + 1}, Column {col + 1}")

    # Print locations of empty strings
    print("\nEmpty strings:")
    for row in df[df.eq('')].index:
        print(f"Row {row + 1}")

    # Print locations of spaces or nulls
#    print("\nSpaces or nulls:")
#    for row, col in zip(*df.str.strip().isnull().values.nonzero()):
#        print(f"Row {row + 1}, Column {col + 1}")

In [57]:
print("before - time_all")
print(df.head(20))
find_blanks(df)

before - time_all
       A           B               C
0  EMpty  whitespace  None and empty
1                 xx            None
2      3           6               9
3      4                            
Null cells:
Row 2, Column 3

Empty strings:
Row 1
Row 2
Row 3
Row 4


In [58]:
print("after - trim_all")
df_trimmed = trim_all(df)
print(df_trimmed.head(20))
find_blanks(df_trimmed)

after - trim_all
       A           B               C
0  EMpty  whitespace  None and empty
1                 xx            None
2      3           6               9
3      4                            
Null cells:
Row 2, Column 3

Empty strings:
Row 1
Row 2
Row 3
Row 4


In [60]:
# Find rows with empty strings in any column
empty_rows = df[df.eq('').any(axis=1)]
print('rows with blanks')
print(empty_rows)

# Find columns with empty strings
empty_cols = df.columns[df.eq('').any()]
print('empty_cols')
print(empty_cols)


rows with blanks
   A   B     C
1     xx  None
3  4          
empty_cols
Index(['A', 'C'], dtype='object')


In [ ]:
0