In [121]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [122]:
from mlxtend.frequent_patterns import apriori, association_rules

> Association rules are normally written like this: {Diapers} -> {Beer}, which means that customers who purchased diapers also tended to purchase beer in the same transaction.

- In the above example, {Diapers} is the antecedent and {Beer} is the consequent. Both antecedents and consequents can have multiple items. In other words, {Diapers, Gum} -> {Beer, Chips} is a valid rule.

- Support is the relative frequency with which the items of a rule appear together, i.e. the fraction of all transactions that contain them. In many instances, you may want to look for high support in order to make sure it is a useful relationship. However, there may be instances where a low support is useful if you are trying to find “hidden” relationships.

- Confidence is a measure of the reliability of the rule. A confidence of 0.5 for the rule {Diapers, Gum} -> {Beer, Chips} would mean that in 50% of the cases where Diapers and Gum were purchased, the purchase also included Beer and Chips. For product recommendation, a 50% confidence may be perfectly acceptable, but in a medical situation this level may not be high enough.

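- Lift is the ratio of the observed support of a rule to the support expected if its antecedent and consequent were independent: $\text{lift}(A \to B) = \dfrac{\text{support}(A \cup B)}{\text{support}(A)\,\text{support}(B)}$ (see wikipedia). The basic rule of thumb is that a lift value close to 1 means the antecedent and the consequent are independent of each other. Lift values > 1 are generally more “interesting” and could be indicative of a useful rule pattern.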

In [123]:
# Load the transaction lines from the CSV file.
df = pd.read_csv('market_basket_dataset.csv')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   BillNo      500 non-null    int64  
 1   Itemname    500 non-null    object 
 2   Quantity    500 non-null    int64  
 3   Price       500 non-null    float64
 4   CustomerID  500 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 19.7+ KB
None


Unnamed: 0,BillNo,Itemname,Quantity,Price,CustomerID
0,1000,Apples,5,8.3,52299
1,1000,Butter,4,6.06,11752
2,1000,Eggs,4,2.66,16415
3,1000,Potatoes,4,8.1,22889
4,1004,Oranges,2,7.26,52255


In [124]:
# One row per BillNo, holding the set of distinct item names on that bill.
basket = (
    df.groupby('BillNo')['Itemname']
    .apply(set)
    .to_frame()
)

basket

Unnamed: 0_level_0,Itemname
BillNo,Unnamed: 1_level_1
1000,"{Eggs, Apples, Butter, Potatoes}"
1004,{Oranges}
1005,"{Milk, Cereal, Onions}"
1008,"{Cereal, Tomatoes, Potatoes}"
1011,{Bananas}
...,...
1471,"{Cheese, Yogurt, Eggs, Bananas, Bread, Butter,..."
1483,"{Coffee, Tomatoes}"
1485,"{Oranges, Bananas, Butter, Juice, Bread, Potat..."
1493,"{Juice, Bananas, Chicken, Bread}"


In [125]:
# One-hot encode the baskets: join each bill's items into a '|'-delimited
# string, then expand that string into one 0/1 indicator column per item.
joined_items = basket['Itemname'].str.join('|')
basket_encoded = joined_items.str.get_dummies('|')
basket_encoded

Unnamed: 0_level_0,Apples,Bananas,Bread,Butter,Cereal,Cheese,Chicken,Coffee,Eggs,Juice,Milk,Onions,Oranges,Pasta,Potatoes,Sugar,Tea,Tomatoes,Yogurt
BillNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1000,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1004,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1005,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0
1008,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1011,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1471,0,1,1,1,1,1,1,1,1,1,0,0,1,1,0,0,0,0,1
1483,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
1485,0,1,1,1,0,0,1,0,0,1,1,0,1,0,1,0,0,0,0
1493,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0


In [126]:
# Cast the 0/1 indicator columns to booleans before mining.
basket_encoded_bool = basket_encoded.astype(bool)

# Mine the itemsets whose support is at least 0.01, reporting them by
# column (item) name.
frequent_itemsets = apriori(basket_encoded_bool, min_support=0.01, use_colnames=True)

# Show the mined itemsets.
frequent_itemsets


Unnamed: 0,support,itemsets
0,0.163399,(Apples)
1,0.241830,(Bananas)
2,0.150327,(Bread)
3,0.163399,(Butter)
4,0.202614,(Cereal)
...,...,...
1472,0.013072,"(Eggs, Juice, Milk, Tomatoes, Onions, Sugar)"
1473,0.013072,"(Apples, Eggs, Potatoes, Milk, Pasta, Chicken,..."
1474,0.013072,"(Apples, Yogurt, Coffee, Milk, Pasta, Tea, Sugar)"
1475,0.013072,"(Cheese, Yogurt, Coffee, Butter, Bread, Cereal..."


In [127]:
# Derive association rules from the frequent itemsets, filtering on the
# 'lift' metric with a minimum threshold of 1.

rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Apples),(Bread),0.163399,0.150327,0.045752,0.280000,1.862609,0.021188,1.180102,0.553571
1,(Bread),(Apples),0.150327,0.163399,0.045752,0.304348,1.862609,0.021188,1.202614,0.545055
2,(Cheese),(Apples),0.183007,0.163399,0.039216,0.214286,1.311429,0.009313,1.064765,0.290667
3,(Apples),(Cheese),0.163399,0.183007,0.039216,0.240000,1.311429,0.009313,1.074991,0.283854
4,(Apples),(Chicken),0.163399,0.130719,0.032680,0.200000,1.530000,0.011320,1.086601,0.414062
...,...,...,...,...,...,...,...,...,...,...
18397,(Juice),"(Oranges, Coffee, Bread, Butter, Cereal, Chicken)",0.150327,0.013072,0.013072,0.086957,6.652174,0.011107,1.080921,1.000000
18398,(Butter),"(Oranges, Coffee, Juice, Bread, Cereal, Chicken)",0.163399,0.013072,0.013072,0.080000,6.120000,0.010936,1.072748,1.000000
18399,(Bread),"(Oranges, Coffee, Juice, Butter, Cereal, Chicken)",0.150327,0.013072,0.013072,0.086957,6.652174,0.011107,1.080921,1.000000
18400,(Cereal),"(Oranges, Coffee, Juice, Bread, Butter, Chicken)",0.202614,0.013072,0.013072,0.064516,4.935484,0.010423,1.054992,1.000000


In [128]:
# Preview the core metrics of the first ten rules. Leaving the frame as the
# cell's last expression lets Jupyter render it as a table instead of the
# plain-text dump that print() produces.
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10)

  antecedents consequents   support  confidence      lift
0    (Apples)     (Bread)  0.045752    0.280000  1.862609
1     (Bread)    (Apples)  0.045752    0.304348  1.862609
2    (Cheese)    (Apples)  0.039216    0.214286  1.311429
3    (Apples)    (Cheese)  0.039216    0.240000  1.311429
4    (Apples)   (Chicken)  0.032680    0.200000  1.530000
5   (Chicken)    (Apples)  0.032680    0.250000  1.530000
6      (Milk)    (Apples)  0.045752    0.291667  1.785000
7    (Apples)      (Milk)  0.045752    0.280000  1.785000
8    (Onions)    (Apples)  0.026144    0.173913  1.064348
9    (Apples)    (Onions)  0.026144    0.160000  1.064348


In [129]:
# Keep only the one-to-one rules (fewer than two items on each side).
# The row masks are built as standalone boolean Series instead of being
# written into `rules` as scratch columns, so `rules` is left unmodified.
single_antecedent = rules['antecedents'].apply(len) < 2
single_consequent = rules['consequents'].apply(len) < 2

# .copy() detaches `final` from `rules` so later column assignments on
# `final` do not act on a slice of `rules`.
final = rules[single_antecedent & single_consequent].copy()

In [130]:
def _single_item(itemset):
    """Return the first element of ``itemset``; pass a plain string through.

    The string check makes this cell safe to re-run: once the columns hold
    item names, iterating a name would keep only its first character
    (``next(iter('Apples')) == 'A'``).
    """
    if isinstance(itemset, str):
        return itemset
    return next(iter(itemset))


# Replace each itemset with the item it contains.
final['antecedents'] = final['antecedents'].apply(_single_item)
final['consequents'] = final['consequents'].apply(_single_item)

# Constant column of ones; its purpose is not visible in this notebook
# (presumably a weight/join key for a downstream consumer -- TODO confirm).
final['dummy'] = np.ones(final.shape[0], dtype=int)


In [174]:
# Sorted array of the distinct values found in the 'antecedents' column.
item = np.sort(final['antecedents'].unique())
item

array(['Apples', 'Bananas', 'Bread', 'Butter', 'Cereal', 'Cheese',
       'Chicken', 'Coffee', 'Eggs', 'Juice', 'Milk', 'Onions', 'Oranges',
       'Pasta', 'Potatoes', 'Sugar', 'Tea', 'Tomatoes', 'Yogurt'],
      dtype=object)

In [256]:
# Build every unordered pairing of the items (self-pairings included) in a stable order
def unique_dot_product(item):
    """Return the unique unordered pairings of ``item`` as a list of tuples.

    Two different items are emitted once as a 2-tuple ``(a, b)`` with
    ``a`` sorting before ``b``. An item paired with itself collapses to the
    1-tuple ``(a,)``, which is what iterating ``frozenset({a, a})`` yields.

    The pairings are generated from the sorted distinct items, so the order
    of the returned list is the same on every run. Collecting the pairs by
    iterating a set of frozensets of strings instead makes the order depend
    on string hash randomisation, so any position-based identifier derived
    from the list would change between interpreter restarts.

    Parameters
    ----------
    item : iterable
        Hashable, mutually sortable values (e.g. item names).

    Returns
    -------
    list of tuple
        One tuple per unordered pairing; empty when ``item`` is empty.
    """
    ordered = sorted(set(item))
    return [
        (first,) if first == second else (first, second)
        for pos, first in enumerate(ordered)
        for second in ordered[pos:]
    ]

# Build the pairings once at module level; fun_relation reads this global.
dot_product = unique_dot_product(item)

In [272]:
def fun_relation(inp):
    """Map an item pair to its 'T-<position>' label in ``dot_product``.

    ``inp`` is looked up in the module-level ``dot_product`` sequence as
    given; if it is absent, the pair built from its last and first elements
    is tried instead. The label is ``'T-'`` followed by the position of the
    match. Returns ``None`` when neither orientation is present.
    """
    pairs = list(dot_product)

    # First attempt: the pair exactly as supplied.
    try:
        return 'T-' + str(pairs.index(inp))
    except ValueError:
        pass

    # Second attempt: the same two items in the opposite order.
    flipped = (inp[-1], inp[0])
    try:
        return 'T-' + str(pairs.index(flipped))
    except ValueError:
        return None

In [273]:
# Pair each rule's antecedent with its consequent as an (antecedent, consequent) tuple.
final['ant-cons'] = list(zip(final['antecedents'], final['consequents']))

In [276]:
# Label each rule with its pair identifier ('T-<n>'). fun_relation also tries
# the reversed pair, and returns None when neither order is found.
final['connection'] = final['ant-cons'].apply(fun_relation)

In [None]:
# Export the filtered rules. to_csv does not create missing directories, so
# make sure 'result/' exists before writing.
os.makedirs('result', exist_ok=True)
final.to_csv('result/market-network.csv', index=False)

In [132]:
# Add a constant column of ones to the transactions and export them.
# to_csv does not create missing directories, so make sure 'result/' exists.
os.makedirs('result', exist_ok=True)
df['dummy'] = np.ones(df.shape[0], dtype=int)
df.to_csv('result/market_basket_dataset.csv', index=False)