In [2]:
#importing libraries
import pandas as pd 
import numpy as np
import itertools
import json
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# sns.frozenset(style="white")

import warnings
warnings.filterwarnings("ignore")

# !pip install mlxtend
import mlxtend
from mlxtend.frequent_patterns import apriori

In [3]:
df = pd.read_csv("data.csv", delimiter = ",")
print(len(df))
df

38765


Unnamed: 0,Member_no,Date,item
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
...,...,...,...
38760,4471,08-10-2014,sliced cheese
38761,2022,23-02-2014,candy
38762,1097,16-04-2014,cake bar
38763,1510,03-12-2014,fruit/vegetable juice


# 1. Data frozenset Overview:

## Attribute Information:


# 2. Data Analysis and Summarization Before Preprocess

First off, let's check the datatype of each column to see if there are any non-numeric columns we should exclude for our pre-Preprocess Analysis:

In [4]:
df.dtypes

Member_no     int64
Date         object
item         object
dtype: object

In [5]:
df.isna().sum()

Member_no    0
Date         0
item         0
dtype: int64

# 3. Data cleaning

## 1. Identify outliers and smooth out noisy data:

Removing the outliers that are outside the range (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR):

In [6]:
# Calculate quantiles and IQR
cols = df.columns
Q1 = df[cols].quantile(0.25) # Same as np.percentile but maps (0,1) and not (0,100)
Q3 = df[cols].quantile(0.75)
IQR = Q3 - Q1

# Return a boolean array of the rows with (any) non-outlier column values
condition = ~((df[cols] < (Q1 - 1.5 * IQR)) | (df[cols] > (Q3 + 1.5 * IQR))).any(axis=1)

# Filter our dataframe based on condition
df = df[condition]

In [7]:
df = df.sort_values(by=["Member_no"])

In [8]:
user_items = df[(df["Member_no"] == 1000)]["item"]
user_items

1629                    soda
13331             whole milk
8395              whole milk
4843                 sausage
17778     pickled vegetables
2047             canned beer
24544                 yogurt
18196        misc. beverages
32851            salty snack
6388                 sausage
20992    semi-finished bread
22537       hygiene articles
29480                 pastry
Name: item, dtype: object

Create an empty dict for all users before filling the dicts with bought items.

In [9]:
shoppingCartDict = dict.fromkeys(df["Member_no"].unique(),None)

In [10]:
for member_no in df["Member_no"].unique(): 
    user_items = df[(df["Member_no"] == member_no)]["item"]
    for item in user_items:
        if (shoppingCartDict[member_no] == None):
            shoppingCart = {}
        else :
            shoppingCart = shoppingCartDict[member_no]

        try:
            count  = shoppingCart[item]
            count += 1
            shoppingCart.update( {item : count} )
        except:
            shoppingCart[item] = 1
        shoppingCartDict[member_no] = shoppingCart

def calculateSupport (shoppingCartDict, item_list):
    search_space = len({member_no:cart for (member_no,cart) in shoppingCartDict.items() 
              if frozenset(item_list) <= frozenset(cart)})
    overall_space = len(shoppingCartDict)
    return search_space / overall_space

In [11]:
def calculateSupport (shoppingCartDict, item_list):
    search_space = 0
    for cart in shoppingCartDict.values():
        if (len( set(cart) | set(item_list) ) == len(set(cart)) ):
            search_space += 1
    overall_space = len(shoppingCartDict)
    return search_space / overall_space

In [34]:
def combinationsWithIntersection (temp_n, current_size):
    output_dict = {}
    keys = list(temp_n)
    for i in range(len(temp_n)):
        for j in range (i+1,len(temp_n)):
            if (len(union := ({keys[i]} | {keys[j]}) ) == current_size + 1):
                output_dict[str(union)] = None
    return output_dict

In [13]:
def aprioi (shoppingCartDict, minsup = 0.01):
    temp_n = dict.fromkeys(df["item"].unique(), None)
    frozenset_n = dict.fromkeys(df["item"].unique(), None)
    for i in range(1, len(temp_n.keys())):
        #temp_n = frozenset_n   
        if i == 1:
            frozenset_n = dict.fromkeys(df["item"].unique(), None)
            for item in dict.fromkeys(df["item"].unique()).keys():
                if ((sup := calculateSupport (shoppingCartDict,[item])) >= minsup):
                    frozenset_n[item] = sup
                else :
                    del frozenset_n[item]
        else :
            # frozenset_n = dict.fromkeys(itertools.combinations(temp_n.keys(), i), None)
            frozenset_n = combinationsWithIntersection (temp_n, i)
            for item in combinationsWithIntersection (temp_n, i).keys():
                if ((sup := calculateSupport (shoppingCartDict,(item))) >= minsup):
                    frozenset_n[item] = sup
                else :
                    del frozenset_n[item]
        if frozenset_n == {}:
            return temp_n
        else :
            temp_n = temp_n | frozenset_n
        # frozenset_n = {item:support for (item,support) in frozenset_n.items() if support >= 0.01}

In [None]:
aprioi (shoppingCartDict, 0.01)

In [21]:
frozenset_1 = dict.fromkeys(df["item"].unique(), None)
for item in frozenset_1.keys():
    frozenset_1[item] = calculateSupport (shoppingCartDict,[item])
frozenset_1 = {item:support for (item,support) in frozenset_1.items() if support >= 0.01}
len(frozenset_1)

116

In [37]:
frozenset_1 = dict.fromkeys(df["item"].unique(), None)
frozenset_1
frozenset_2 = combinationsWithIntersection (frozenset_1, 1)
frozenset_2
frozenset_3 = combinationsWithIntersection (frozenset_2, 2)
frozenset_3

{}

In [15]:
#frozenset_2 = dict.fromkeys(itertools.combinations(frozenset_1.keys(), 2), None)
frozenset_2 = combinationsWithIntersection (frozenset(frozenset_1.keys()), 1)
for item in frozenset_2.keys():
    frozenset_2[item] = calculateSupport (shoppingCartDict,(item))
frozenset_2 = {item:support for (item,support) in frozenset_2.items() if support >= 0.01}
len(frozenset_2)

TypeError: 'frozenset' object is not subscriptable

In [None]:
#frozenset_3 = dict.fromkeys(itertools.combinations(frozenset_2.keys(), 3), None)
frozenset_2 = combinationsWithIntersection (frozenset_2.keys(), 2)
for item in frozenset_3.keys():
    frozenset_3[item] = calculateSupport (shoppingCartDict,(item))
frozenset_3 = {item:support for (item,support) in frozenset_3.items() if support >= 0.01}
len(frozenset_3)

#### 