In [13]:
import pandas as pd
from timy import timer
import polars as pl
from datetime import timedelta, datetime

In [7]:
def listify_items(
    df_: pd.DataFrame,
    set_column: str,
    item_column: str
):
    """Collect the items of every set into a list of lists.

    Rows of ``df_`` are grouped by ``set_column``; for each group the values
    of ``item_column`` are gathered, in row order, into one Python list.

    Args:
        df_: Frame holding one row per (set, item) occurrence.
        set_column: Label of the column identifying the set (grouping key).
        item_column: Label of the column holding the items.

    Returns:
        A list with one inner list of items per distinct ``set_column``
        value, ordered as pandas ``groupby`` orders its groups.
    """
    items_per_set = df_.groupby(set_column)[item_column].apply(list)

    # Flatten the group index back into a column so the lists live in a
    # regular, explicitly named column.
    items_table = items_per_set.reset_index(name='items_list')

    return items_table['items_list'].tolist()

def get_unique_elements(
    df_: pd.DataFrame,
    column_label: str
):
    """Return the distinct values found in one column of a frame.

    Args:
        df_: Frame to inspect.
        column_label: Label of the column whose distinct values are wanted.

    Returns:
        An array of the distinct values of ``df_[column_label]``, in order of
        first appearance (the result of ``Series.unique()``).

    Raises:
        KeyError: If ``column_label`` is not a column of ``df_``.
    """
    # The previous implementation called a bare `unique(...)` that this
    # notebook never imports or defines; use the pandas method instead.
    return df_[column_label].unique()

def get_dataframe_last_n_days(
    df_: pd.DataFrame,
    time_column: str,
    last_days: "int | None" = None
):
    """Keep only the rows that fall within the last N days of the data.

    The window is anchored on the most recent timestamp found in
    ``time_column`` (not on today's date) and reaches ``last_days`` days back
    from it, both ends included.

    Args:
        df_: Frame to filter. It is not modified.
        time_column: Label of the column holding the timestamps; values are
            parsed with ``pd.to_datetime``.
        last_days: Length of the window in days. When omitted, the
            notebook-level variable ``last_days`` is used, which is how this
            function originally obtained the value.

    Returns:
        A new frame containing the rows inside the window, with
        ``time_column`` converted to datetimes. Rows whose timestamp is
        missing are dropped.

    Raises:
        NameError: If ``last_days`` is not passed and no notebook-level
            ``last_days`` variable exists.
        KeyError: If ``time_column`` is not a column of ``df_``.
    """
    if last_days is None:
        # Backward compatibility with callers that set a global `last_days`
        # before calling this function with two arguments.
        try:
            last_days = globals()['last_days']
        except KeyError:
            raise NameError(
                "last_days was not passed and no global 'last_days' is defined"
            ) from None

    # Parse into a separate Series so the caller's frame is left untouched.
    timestamps = pd.to_datetime(df_[time_column])

    # The window ends at the newest timestamp of *this* frame.
    end_date = timestamps.max()
    start_date = end_date - timedelta(days=last_days)

    # No upper bound is needed: end_date is the maximum. Missing timestamps
    # compare as False and are therefore excluded.
    interval_mask = timestamps >= start_date

    return df_.loc[interval_mask].assign(
        **{time_column: timestamps[interval_mask]}
    )

def read_data_to_dataframe_gen(
    data_folder_: str,
    set_column: str,
    item_column: str,
    extension: str = 'xlsx'
):
    """Lazily load every spreadsheet in a folder, de-duplicated per (set, item).

    Each matching file is read with ``pd.read_excel`` and reduced, via polars,
    to the first row of every distinct ``(set_column, item_column)`` pair.

    Args:
        data_folder_: Directory to scan (not recursive).
        set_column: Label of the column identifying the set.
        item_column: Label of the column identifying the item.
        extension: File extension to load, with or without a leading dot.
            Files are always parsed with ``pd.read_excel`` regardless of this
            value.

    Yields:
        ``(filepath, dataframe)`` tuples, in sorted path order so that
        repeated runs visit the files in the same sequence.
    """
    suffix = '.' + extension.lstrip('.')

    # Scan the folder that was passed in (not a notebook-level variable) and
    # sort, because listdir() returns entries in arbitrary order.
    filepaths = sorted(
        path.join(data_folder_, filename)
        for filename in listdir(data_folder_)
        if filename.endswith(suffix)
    )

    relevant_columns = [set_column, item_column]
    for filepath in filepaths:
        df_ = pd.read_excel(filepath)

        # Group by the set and item columns and keep the first occurrence of
        # each pair; the group-by itself is done in polars.
        df_p = pl.from_pandas(df_)
        filtered_df = df_p.group_by(relevant_columns).first()

        yield filepath, filtered_df.to_pandas()

@timer()
def read_data_to_dataframe(
    data_folder_: str,
    set_column: str,
    item_column: str,
    extension: str = 'xlsx'
):
    """Eagerly load a folder of spreadsheets into a ``{filepath: frame}`` dict.

    Drains ``read_data_to_dataframe_gen`` with the same arguments; the call
    is timed by the ``timer`` decorator.

    Args:
        data_folder_: Directory to scan.
        set_column: Label of the column identifying the set.
        item_column: Label of the column identifying the item.
        extension: File extension of the files to load.

    Returns:
        A dict mapping each file path to its loaded frame.
    """
    frames_by_path = {}
    loader = read_data_to_dataframe_gen(data_folder_, set_column, item_column, extension)
    for filepath, frame in loader:
        frames_by_path[filepath] = frame
    return frames_by_path

In [11]:
from os import getcwd, listdir, path 

# Spreadsheets are read from the "data" directory under the kernel's current
# working directory.
data_folder = f"{getcwd()}/data/"

# Load every workbook found there, keyed by file path.
filename_dfs = dict(
    read_data_to_dataframe(data_folder, 'pedi_id', 'prod_id')
)

Timy executed (read_data_to_dataframe) for 1 time in 145.443559 seconds
Timy best time was 145.443559 seconds


In [15]:
# Length, in days, of the trailing window kept from every loaded frame.
# This must remain a module-level variable named `last_days`: the call to
# get_dataframe_last_n_days below does not pass a window length.
last_days = 1

# Filtered frames, keyed by the same file paths as filename_dfs.
filtered_dfs = dict()
# NOTE(review): keep the loop variable named `df` unless
# get_dataframe_last_n_days is confirmed not to read a module-level `df`.
for filename, df in filename_dfs.items(): 
    df = get_dataframe_last_n_days(df, 'pedi_data')
    # NOTE(review): bare expression inside a loop body - its value is
    # discarded and nothing is displayed. Wrap it in print()/display() if the
    # row count was meant to be shown, otherwise it can be removed.
    len(df)
    filtered_dfs[filename] = df

# The filtered frames as a plain list, in the iteration order of filename_dfs.
dfs = list(filtered_dfs.values())

In [None]:
# import the apriori modules from mlxtend
from mlxtend.frequent_patterns import apriori,association_rules
from mlxtend.preprocessing import TransactionEncoder

# `dataset` is a list of transactions, each transaction being the list of the
# items it contains, e.g.
#   [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#    ['Milk', 'Apple', 'Kidney Beans', 'Eggs']]
# Only the first filtered frame (dfs[0]) is analysed here.
dataset = listify_items(dfs[0], 'pedi_id', 'prod_id')

# One-hot encode the transactions into a sparse matrix. The fitted encoder is
# kept in its own variable because its `columns_` attribute holds the item
# that each matrix column stands for.
transaction_encoder = TransactionEncoder()
encoded_data = transaction_encoder.fit(dataset)\
                                  .transform(dataset, sparse=True)

# Label the columns with the items. Without labels, `use_colnames=True` below
# would report column positions instead of product ids.
# Labels are cast to str: mlxtend is restrictive about integer column labels
# on sparse frames, so itemsets contain the product ids as strings.
sparse_df = pd.DataFrame.sparse.from_spmatrix(
    encoded_data,
    columns=[str(item) for item in transaction_encoder.columns_],
)

# Mine the frequent itemsets (support of at least 0.1% of the transactions).
sparse_frequent_candidates = apriori(sparse_df, min_support=0.001, use_colnames=True)

# Derive association rules from the mined frequent itemsets.
sparse_rules = association_rules(sparse_frequent_candidates, metric="lift", min_threshold=1)

# Inspect the strongest rules. Only the last expression of a cell is rendered
# automatically, so this table is shown explicitly with the notebook's
# display() helper.
strong_rules = sparse_rules[(sparse_rules['lift'] >= 4) & (sparse_rules['confidence'] >= 0.5)]
display(strong_rules)

# The sparse representation is used purely to save memory; report the size of
# the sparse frame. No dense counterpart is built in this notebook, so there
# is nothing to compare it against here.
import sys

sys.getsizeof(sparse_df)
