# OnlineRetail

In [1]:
import pandas as pd

file_path = 'OnlineRetail_1225.csv'
data = pd.read_csv(file_path)
data.head(5)

Unnamed: 0,InvoiceNo,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 08:26,3.39,17850.0,United Kingdom


# Data processing

1. 同一張InvoiceNo的資料的Description轉成一筆資料

In [2]:
import pandas as pd


# 移除含有NaN值的行（特別是在'Description'列中）
data = data.dropna(subset=['Description'])

# 將所有描述轉換為字符串類型
data['Description'] = data['Description'].astype(str)

# 按InvoiceNo分組，並將每組的描述字段合併為一個字串
grouped_data = data.groupby('InvoiceNo')['Description'].apply(lambda x: ', '.join(x)).reset_index()

# 查看處理後的數據
print(grouped_data)


# Save the data to a new CSV file
grouped_data_path = 'grouped_data.csv'
grouped_data.to_csv(grouped_data_path, index=False)

      InvoiceNo                                        Description
0        536365  WHITE HANGING HEART T-LIGHT HOLDER, WHITE META...
1        536366  HAND WARMER UNION JACK, HAND WARMER RED POLKA DOT
2        536367  ASSORTED COLOUR BIRD ORNAMENT, POPPY'S PLAYHOU...
3        536368  JAM MAKING SET WITH JARS, RED COAT RACK PARIS ...
4        536369                           BATH BUILDING BLOCK WORD
...         ...                                                ...
24441   C581484                        PAPER CRAFT , LITTLE BIRDIE
24442   C581490  VICTORIAN GLASS HANGING T-LIGHT, ZINC T-LIGHT ...
24443   C581499                                             Manual
24444   C581568                         VICTORIAN SEWING BOX LARGE
24445   C581569  HANGING HEART JAR T-LIGHT HOLDER, 36 PENCILS T...

[24446 rows x 2 columns]


2. apriori

In [3]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# 將描述列表轉換為交易列表
transactions = grouped_data['Description'].apply(lambda x: x.split(', ')).tolist()
#print(transactions)

# 初始化交易編碼器
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)

# 將交易數據轉換為適合進行Apriori算法的DataFrame格式
df = pd.DataFrame(te_ary, columns=te.columns_)

"""最小支持度為0.01"""

# 使用Apriori算法找出頻繁項集，設定最小支持度為0.01
frequent_itemsets_1 = apriori(df, min_support=0.01, use_colnames=True)

# 使用關聯規則函數生成規則，設定最小置信度為0.5
rules_1 = association_rules(frequent_itemsets_1, metric="confidence", min_threshold=0.6)

# 顯示生成的規則
rules_1.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(BIRTHDAY CARD),(),0.017017,0.023603,0.011004,0.646635,27.396239,0.010602,2.763137,0.980178
1,(FANCY FONT BIRTHDAY CARD),(),0.013254,0.023603,0.013254,1.0,42.367418,0.012941,inf,0.989512
2,(ALARM CLOCK BAKELIKE CHOCOLATE),(ALARM CLOCK BAKELIKE GREEN),0.01804,0.040947,0.011372,0.630385,15.395009,0.010633,2.594737,0.952222
3,(ALARM CLOCK BAKELIKE CHOCOLATE),(ALARM CLOCK BAKELIKE RED ),0.01804,0.04422,0.012108,0.671202,15.178723,0.011311,2.90689,0.951279
4,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.040947,0.04422,0.026426,0.645355,14.594209,0.024615,2.695031,0.97125


In [4]:
# 對規則按confidence進行降序排序
sorted_rules_by_confidence = rules_1.sort_values(by='confidence', ascending=False)

# 選取前10條規則
top_10_rules_by_confidence = sorted_rules_by_confidence.head(5)
top_10_rules_by_confidence

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
11,(RETRO SPOT),(BIRTHDAY CARD),0.011045,0.017017,0.011045,1.0,58.764423,0.010857,inf,0.993961
40,(SHED),(KEY FOB ),0.01534,0.021721,0.01534,1.0,46.037665,0.015007,inf,0.993519
8,(BACK DOOR ),(KEY FOB ),0.012681,0.021721,0.012681,1.0,46.037665,0.012406,inf,0.990844
1,(FANCY FONT BIRTHDAY CARD),(),0.013254,0.023603,0.013254,1.0,42.367418,0.012941,inf,0.989512
196,"(REGENCY TEA PLATE PINK, REGENCY TEA PLATE ROS...",(REGENCY TEA PLATE GREEN ),0.011127,0.01579,0.010554,0.948529,60.071891,0.010378,19.121796,0.994418


In [5]:
# 對規則按提升度（lift）進行降序排序
sorted_rules_by_lift = rules_1.sort_values(by='lift', ascending=False)

# 選取前20條規則
top_rules_by_lift = sorted_rules_by_lift.head(5)
top_rules_by_lift


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
199,(REGENCY TEA PLATE PINK),"(REGENCY TEA PLATE ROSES , REGENCY TEA PLATE G...",0.012845,0.013172,0.010554,0.821656,62.379515,0.010385,5.533286,0.996772
198,"(REGENCY TEA PLATE ROSES , REGENCY TEA PLATE G...",(REGENCY TEA PLATE PINK),0.013172,0.012845,0.010554,0.801242,62.379515,0.010385,4.966625,0.997103
200,(REGENCY TEA PLATE GREEN ),"(REGENCY TEA PLATE PINK, REGENCY TEA PLATE ROS...",0.01579,0.011127,0.010554,0.668394,60.071891,0.010378,2.982071,0.999129
196,"(REGENCY TEA PLATE PINK, REGENCY TEA PLATE ROS...",(REGENCY TEA PLATE GREEN ),0.011127,0.01579,0.010554,0.948529,60.071891,0.010378,19.121796,0.994418
12,(BIRTHDAY CARD),(RETRO SPOT),0.017017,0.011045,0.011045,0.649038,58.764423,0.010857,2.817845,1.0


In [6]:
"""最小支持度為0.02"""

# 使用Apriori算法找出頻繁項集，設定最小支持度為0.01
frequent_itemsets_2 = apriori(df, min_support=0.02, use_colnames=True)

# 使用關聯規則函數生成規則，設定最小置信度為0.5
rules_2 = association_rules(frequent_itemsets_2, metric="confidence", min_threshold=0.5)

# 顯示生成的規則
rules_2.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),0.04422,0.040947,0.026426,0.597595,14.594209,0.024615,2.383301,0.974575
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.040947,0.04422,0.026426,0.645355,14.594209,0.024615,2.695031,0.97125
2,(CHARLOTTE BAG PINK POLKADOT),(RED RETROSPOT CHARLOTTE BAG),0.031089,0.042952,0.021517,0.692105,16.113529,0.020181,3.108362,0.968036
3,(RED RETROSPOT CHARLOTTE BAG),(CHARLOTTE BAG PINK POLKADOT),0.042952,0.031089,0.021517,0.500952,16.113529,0.020181,1.94152,0.980035
4,(CHARLOTTE BAG SUKI DESIGN),(RED RETROSPOT CHARLOTTE BAG),0.03657,0.042952,0.020862,0.57047,13.281624,0.019292,2.228128,0.959809


# divide country

In [7]:
import pandas as pd

file_path = 'OnlineRetail_1225.csv'
data = pd.read_csv(file_path)
data.head(5)

Unnamed: 0,InvoiceNo,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 08:26,3.39,17850.0,United Kingdom


In [8]:
unique_values = data['Country'].unique()
print(unique_values)

['United Kingdom' 'France' 'Australia' 'Netherlands' 'Germany' 'Norway'
 'EIRE' 'Switzerland' 'Spain' 'Poland' 'Portugal' 'Italy' 'Belgium'
 'Lithuania' 'Japan' 'Iceland' 'Channel Islands' 'Denmark' 'Cyprus'
 'Sweden' 'Austria' 'Israel' 'Finland' 'Bahrain' 'Greece' 'Hong Kong'
 'Singapore' 'Lebanon' 'United Arab Emirates' 'Saudi Arabia'
 'Czech Republic' 'Canada' 'Unspecified' 'Brazil' 'USA'
 'European Community' 'Malta' 'RSA']


In [10]:
value_counts = data['Country'].value_counts()
print(value_counts)
value_counts.to_csv("country.csv")

United Kingdom          495478
Germany                   9495
France                    8557
EIRE                      8196
Spain                     2533
Netherlands               2371
Belgium                   2069
Switzerland               2002
Portugal                  1519
Australia                 1259
Norway                    1086
Italy                      803
Channel Islands            758
Finland                    695
Cyprus                     622
Sweden                     462
Unspecified                446
Austria                    401
Denmark                    389
Japan                      358
Poland                     341
Israel                     297
USA                        291
Hong Kong                  288
Singapore                  229
Iceland                    182
Canada                     151
Greece                     146
Malta                      127
United Arab Emirates        68
European Community          61
RSA                         58
Lebanon 

# UK

In [11]:
UK_data = data[data['Country'] == 'United Kingdom']
UK_data.head()

Unnamed: 0,InvoiceNo,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 08:26,3.39,17850.0,United Kingdom


In [12]:
import pandas as pd


# 移除含有NaN值的行（特別是在'Description'列中）
UK_data = UK_data.dropna(subset=['Description'])

# 將所有描述轉換為字符串類型
UK_data['Description'] = UK_data['Description'].astype(str)

# 按InvoiceNo分組，並將每組的描述字段合併為一個字串
UK_grouped_data = UK_data.groupby('InvoiceNo')['Description'].apply(lambda x: ', '.join(x)).reset_index()
UK_grouped_data.head()

Unnamed: 0,InvoiceNo,Description
0,536365,"WHITE HANGING HEART T-LIGHT HOLDER, WHITE META..."
1,536366,"HAND WARMER UNION JACK, HAND WARMER RED POLKA DOT"
2,536367,"ASSORTED COLOUR BIRD ORNAMENT, POPPY'S PLAYHOU..."
3,536368,"JAM MAKING SET WITH JARS, RED COAT RACK PARIS ..."
4,536369,BATH BUILDING BLOCK WORD


In [13]:
# Save the data to a new CSV file
UK_grouped_data_path = 'UK_grouped_data.csv'
UK_grouped_data.to_csv(UK_grouped_data_path, index=False)

In [14]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# 將描述列表轉換為交易列表
transactions = UK_grouped_data['Description'].apply(lambda x: x.split(', ')).tolist()
#print(transactions)

# 初始化交易編碼器
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)

# 將交易數據轉換為適合進行Apriori算法的DataFrame格式
df = pd.DataFrame(te_ary, columns=te.columns_)


# 使用Apriori算法找出頻繁項集，設定最小支持度為0.01
frequent_itemsets_1 = apriori(df, min_support=0.01, use_colnames=True)

# 使用關聯規則函數生成規則，設定最小置信度為0.5
rules_1 = association_rules(frequent_itemsets_1, metric="confidence", min_threshold=0.6)


In [15]:
rules_1.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(BIRTHDAY CARD),(),0.017196,0.024365,0.011116,0.646438,26.531645,0.010697,2.759446,0.979147
1,(FANCY FONT BIRTHDAY CARD),(),0.013884,0.024365,0.013884,1.0,41.042831,0.013546,inf,0.989371
2,(ALARM CLOCK BAKELIKE CHOCOLATE),(ALARM CLOCK BAKELIKE GREEN),0.017786,0.040563,0.011343,0.637755,15.722732,0.010622,2.648588,0.953354
3,(ALARM CLOCK BAKELIKE CHOCOLATE),(ALARM CLOCK BAKELIKE RED ),0.017786,0.043512,0.011661,0.655612,15.06746,0.010887,2.777358,0.950538
4,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.040563,0.043512,0.025817,0.636465,14.62742,0.024052,2.631078,0.971022


In [16]:
# 對規則按confidence進行降序排序
sorted_rules_by_confidence = rules_1.sort_values(by='confidence', ascending=False)

# 選取前10條規則
top_10_rules_by_confidence = sorted_rules_by_confidence.head(5)
top_10_rules_by_confidence

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(FANCY FONT BIRTHDAY CARD),(),0.013884,0.024365,0.013884,1.0,41.042831,0.013546,inf,0.989371
43,(SHED),(KEY FOB ),0.016878,0.023911,0.016878,1.0,41.821632,0.016475,inf,0.992847
8,(BACK DOOR ),(KEY FOB ),0.013975,0.023911,0.013975,1.0,41.821632,0.01364,inf,0.989923
11,(RETRO SPOT),(BIRTHDAY CARD),0.01157,0.017196,0.01157,1.0,58.153034,0.011371,inf,0.994308
259,"(REGENCY TEA PLATE PINK, REGENCY TEA PLATE ROS...",(REGENCY TEA PLATE GREEN ),0.010753,0.015381,0.010163,0.945148,61.448539,0.009998,17.950359,0.994419


In [17]:
# 對規則按提升度（lift）進行降序排序
sorted_rules_by_lift = rules_1.sort_values(by='lift', ascending=False)

# 選取前20條規則
top_rules_by_lift = sorted_rules_by_lift.head(5)
top_rules_by_lift

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
262,(REGENCY TEA PLATE PINK),"(REGENCY TEA PLATE ROSES , REGENCY TEA PLATE G...",0.012341,0.012795,0.010163,0.823529,64.363788,0.010005,5.594162,0.996765
261,"(REGENCY TEA PLATE ROSES , REGENCY TEA PLATE G...",(REGENCY TEA PLATE PINK),0.012795,0.012341,0.010163,0.794326,64.363788,0.010005,4.802065,0.997223
259,"(REGENCY TEA PLATE PINK, REGENCY TEA PLATE ROS...",(REGENCY TEA PLATE GREEN ),0.010753,0.015381,0.010163,0.945148,61.448539,0.009998,17.950359,0.994419
263,(REGENCY TEA PLATE GREEN ),"(REGENCY TEA PLATE PINK, REGENCY TEA PLATE ROS...",0.015381,0.010753,0.010163,0.660767,61.448539,0.009998,2.916128,0.999093
12,(BIRTHDAY CARD),(RETRO SPOT),0.017196,0.01157,0.01157,0.672823,58.153034,0.011371,3.021089,1.0


In [21]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
import time

# Load your data
file_path = 'UK_grouped_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Splitting the Description column into individual items
data['Items'] = data['Description'].str.split(',')


# Start the timer for FP-growth algorithm
start_time = time.time()

# Extracting the list of transactions
transactions = data['Items'].tolist()

# TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

# FP-growth 
frequent_itemsets = fpgrowth(df, min_support=0.01, use_colnames=True)

# Generating the association rules
FP_rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.6)


# Stop the timer and print the time taken for FP-growth
fp_growth_time = time.time() - start_time
print(f"Time taken for FP-growth: {fp_growth_time} seconds")

Time taken for FP-growth: 3.5130319595336914 seconds


In [22]:
# Calculating confidence and lift
# These metrics are already included in the rules DataFrame
FP_rules = FP_rules[['antecedents', 'consequents', 'confidence', 'lift']]
FP_rules

Unnamed: 0,antecedents,consequents,confidence,lift
0,( HOME BUILDING BLOCK WORD),( WHITE HANGING HEART T-LIGHT HOLDER),0.328191,3.600460
1,( WHITE HANGING HEART T-LIGHT HOLDER),( HOME BUILDING BLOCK WORD),0.116476,3.600460
2,( HOME BUILDING BLOCK WORD),( LOVE BUILDING BLOCK WORD),0.374474,13.871274
3,( LOVE BUILDING BLOCK WORD),( HOME BUILDING BLOCK WORD),0.448739,13.871274
4,( POPPY'S PLAYHOUSE KITCHEN),( POPPY'S PLAYHOUSE BEDROOM ),0.694595,42.762192
...,...,...,...,...
1525,( LUNCH BAG RED RETROSPOT),( LUNCH BAG VINTAGE DOILY ),0.182819,7.545579
1526,( JUMBO BAG VINTAGE DOILY ),( LUNCH BAG VINTAGE DOILY ),0.370662,15.298503
1527,( LUNCH BAG VINTAGE DOILY ),( JUMBO BAG VINTAGE DOILY ),0.440075,15.298503
1528,( HOT WATER BOTTLE KEEP CALM),( CHOCOLATE HOT WATER BOTTLE),0.338959,9.504658


In [23]:
# 對規則按lift進行降序排序
sorted_rules_by_confidence = FP_rules.sort_values(by='lift', ascending=False)

# 選取前幾條規則
top_10_rules_by_confidence = sorted_rules_by_confidence.head(10)
top_10_rules_by_confidence

Unnamed: 0,antecedents,consequents,confidence,lift
273,(METAL SIGN),( AIRLINE LOUNGE),0.986207,76.0
272,( AIRLINE LOUNGE),(METAL SIGN),1.0,76.0
1158,(HOT PINK),( FEATHER PEN),0.991111,65.206235
1159,( FEATHER PEN),(HOT PINK),0.665672,65.206235
1500,( REGENCY TEA PLATE GREEN ),( REGENCY TEA PLATE PINK),0.705167,59.094618
1501,( REGENCY TEA PLATE PINK),( REGENCY TEA PLATE GREEN ),0.882129,59.094618
798,( BIRTHDAY CARD),( RETRO SPOT),0.669333,57.851399
799,( RETRO SPOT),( BIRTHDAY CARD),0.984314,57.851399
1145,(SUGAR),( SET 3 RETROSPOT TEA),0.965969,57.696335
1144,( SET 3 RETROSPOT TEA),(SUGAR),1.0,57.696335


# Germany

In [5]:
Germany_data = data[data['Country'] == 'Germany']
Germany_data.head()

Unnamed: 0,InvoiceNo,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
1109,536527,SET OF 6 T-LIGHTS SANTA,6,01-12-2010 13:04,2.95,12662.0,Germany
1110,536527,ROTATING SILVER ANGELS T-LIGHT HLDR,6,01-12-2010 13:04,2.55,12662.0,Germany
1111,536527,MULTI COLOUR SILVER T-LIGHT HOLDER,12,01-12-2010 13:04,0.85,12662.0,Germany
1112,536527,5 HOOK HANGER MAGIC TOADSTOOL,12,01-12-2010 13:04,1.65,12662.0,Germany
1113,536527,3 HOOK HANGER MAGIC GARDEN,12,01-12-2010 13:04,1.95,12662.0,Germany


In [6]:
import pandas as pd


# 移除含有NaN值的行（特別是在'Description'列中）
Germany_data = Germany_data.dropna(subset=['Description'])

# 將所有描述轉換為字符串類型
Germany_data['Description'] = Germany_data['Description'].astype(str)

# 按InvoiceNo分組，並將每組的描述字段合併為一個字串
Germany_grouped_data = Germany_data.groupby('InvoiceNo')['Description'].apply(lambda x: ', '.join(x)).reset_index()
Germany_grouped_data.head()



Unnamed: 0,InvoiceNo,Description
0,536527,"SET OF 6 T-LIGHTS SANTA, ROTATING SILVER ANGEL..."
1,536840,"JAM MAKING SET PRINTED, JAM JAR WITH PINK LID,..."
2,536861,"FELTCRAFT 6 FLOWER FRIENDS, 6 RIBBONS RUSTIC C..."
3,536967,"POSTAGE, JUMBO BAG RED RETROSPOT"
4,536983,"WOODLAND PARTY BAG + STICKER SET, HAND WARMER ..."


In [7]:
# Save the data to a new CSV file
Germany_grouped_data_path = 'Germany_grouped_data.csv'
Germany_grouped_data.to_csv(Germany_grouped_data_path, index=False)

In [8]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# 將描述列表轉換為交易列表
transactions = Germany_grouped_data['Description'].apply(lambda x: x.split(', ')).tolist()
#print(transactions)

# 初始化交易編碼器
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)

# 將交易數據轉換為適合進行Apriori算法的DataFrame格式
df = pd.DataFrame(te_ary, columns=te.columns_)


# 使用Apriori算法找出頻繁項集，設定最小支持度為0.01
frequent_itemsets_1 = apriori(df, min_support=0.01, use_colnames=True)

# 使用關聯規則函數生成規則，設定最小置信度為0.5
rules_1 = association_rules(frequent_itemsets_1, metric="confidence", min_threshold=0.6)

# 顯示生成的規則
# rules_1.head()

In [9]:
# 對規則按confidence進行降序排序
sorted_rules_by_confidence = rules_1.sort_values(by='confidence', ascending=False)

# 選取前10條規則
top_10_rules_by_confidence = sorted_rules_by_confidence.head(10)
top_10_rules_by_confidence

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
531,"(RED HARMONICA IN BOX , BLUE HARMONICA IN BOX )",(POSTAGE),0.021559,0.635158,0.021559,1.0,1.574413,0.007866,inf,0.372881
2148,"(RETROSPOT PARTY BAG + STICKER SET, DINOSAUR P...","(WOODLAND PARTY BAG + STICKER SET, POSTAGE)",0.013267,0.046434,0.013267,1.0,21.535714,0.012651,inf,0.966387
2162,"(SPACEBOY CHILDRENS CUP, DOLLY GIRL CHILDRENS ...",(DOLLY GIRL CHILDRENS CUP),0.011609,0.0199,0.011609,1.0,50.25,0.011378,inf,0.991611
2161,"(SPACEBOY CHILDRENS CUP, DOLLY GIRL CHILDRENS ...",(DOLLY GIRL CHILDRENS BOWL),0.011609,0.023217,0.011609,1.0,43.071429,0.011339,inf,0.988255
636,"(WHITE SPOT RED CERAMIC DRAWER KNOB, BLUE SPOT...",(POSTAGE),0.016584,0.635158,0.016584,1.0,1.574413,0.00605,inf,0.370995
1130,"(RED RETROSPOT CHARLOTTE BAG, LUNCH BAG WOODLAND)",(WOODLAND CHARLOTTE BAG),0.011609,0.097844,0.011609,1.0,10.220339,0.010473,inf,0.912752
1537,"(WATERING CAN PINK BUNNY, WATERING CAN BLUE EL...",(POSTAGE),0.011609,0.635158,0.011609,1.0,1.574413,0.004235,inf,0.369128
2155,"(DOLLY GIRL CHILDRENS BOWL, POSTAGE, SPACEBOY ...",(DOLLY GIRL CHILDRENS CUP),0.011609,0.0199,0.011609,1.0,50.25,0.011378,inf,0.991611
1544,"(WOODLAND CHARLOTTE BAG, WOODLAND MINI BACKPACK)",(POSTAGE),0.011609,0.635158,0.011609,1.0,1.574413,0.004235,inf,0.369128
644,"(WHITE SPOT BLUE CERAMIC DRAWER KNOB, RED SPOT...",(BLUE SPOT CERAMIC DRAWER KNOB),0.013267,0.021559,0.013267,1.0,46.384615,0.012981,inf,0.991597


In [10]:
# 對規則按提升度（lift）進行降序排序
sorted_rules_by_lift = rules_1.sort_values(by='lift', ascending=False)

# 選取前20條規則
top_rules_by_lift = sorted_rules_by_lift.head(10)
top_rules_by_lift

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1968,"(GREEN VINTAGE SPOT BEAKER, BLUE VINTAGE SPOT ...","(PINK VINTAGE SPOT BEAKER, RED VINTAGE SPOT BE...",0.011609,0.013267,0.011609,1.0,75.375,0.011455,inf,0.998322
1973,"(PINK VINTAGE SPOT BEAKER, RED VINTAGE SPOT BE...","(GREEN VINTAGE SPOT BEAKER, BLUE VINTAGE SPOT ...",0.013267,0.011609,0.011609,0.875,75.375,0.011455,7.907131,1.0
2108,"(CHOCOLATE BOX RIBBONS , ROUND SNACK BOXES SET...","(ROUND SNACK BOXES SET OF 4 FRUITS , SCANDINAV...",0.014925,0.013267,0.013267,0.888889,67.0,0.013069,8.880597,1.0
2107,"(SCANDINAVIAN REDS RIBBONS, ROUND SNACK BOXES ...","(ROUND SNACK BOXES SET OF 4 FRUITS , CHOCOLATE...",0.013267,0.014925,0.013267,1.0,67.0,0.013069,inf,0.998319
2105,"(ROUND SNACK BOXES SET OF 4 FRUITS , CHOCOLATE...","(SCANDINAVIAN REDS RIBBONS, ROUND SNACK BOXES ...",0.014925,0.013267,0.013267,0.888889,67.0,0.013069,8.880597,1.0
172,(STAR WREATH DECORATION WITH BELL),(HEART WREATH DECORATION WITH BELL),0.011609,0.014925,0.011609,1.0,67.0,0.011435,inf,0.996644
173,(HEART WREATH DECORATION WITH BELL),(STAR WREATH DECORATION WITH BELL),0.014925,0.011609,0.011609,0.777778,67.0,0.011435,4.447761,1.0
2104,"(ROUND SNACK BOXES SET OF 4 FRUITS , SCANDINAV...","(CHOCOLATE BOX RIBBONS , ROUND SNACK BOXES SET...",0.013267,0.014925,0.013267,1.0,67.0,0.013069,inf,0.998319
2172,"(SPACEBOY CHILDRENS CUP, DOLLY GIRL CHILDRENS ...","(DOLLY GIRL CHILDRENS BOWL, SPACEBOY CHILDRENS...",0.013267,0.013267,0.011609,0.875,65.953125,0.011433,7.893864,0.998079
2175,"(DOLLY GIRL CHILDRENS BOWL, SPACEBOY CHILDRENS...","(SPACEBOY CHILDRENS CUP, DOLLY GIRL CHILDRENS ...",0.013267,0.013267,0.011609,0.875,65.953125,0.011433,7.893864,0.998079


# France

In [11]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

France_data = data[data['Country'] == 'France']

France_data = France_data.dropna(subset=['Description'])
France_data['Description'] = France_data['Description'].astype(str)

France_grouped_data = France_data.groupby('InvoiceNo')['Description'].apply(lambda x: ', '.join(x)).reset_index()

# Apriori
transactions = France_grouped_data['Description'].apply(lambda x: x.split(', ')).tolist()

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)

df = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

# confidence
sorted_rules_by_confidence = rules.sort_values(by='confidence', ascending=False)
top_rules_by_confidence = sorted_rules_by_confidence.head(10)

# lift
sorted_rules_by_lift = rules.sort_values(by='lift', ascending=False)
top_rules_by_lift = sorted_rules_by_lift.head(10)

In [12]:
top_rules_by_confidence

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
26564,"(SPACEBOY LUNCH BOX , LUNCH BAG SPACEBOY DESIG...",(PLASTERS IN TIN SPACEBOY),0.010846,0.119306,0.010846,1.0,8.381818,0.009552,inf,0.890351
22591,"(POSTAGE, SET/6 RED SPOTTY PAPER PLATES, ALARM...","(ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL...",0.010846,0.062907,0.010846,1.0,15.896552,0.010164,inf,0.947368
22641,"(ALARM CLOCK BAKELIKE GREEN, ROUND SNACK BOXES...",(ROUND SNACK BOXES SET OF4 WOODLAND ),0.010846,0.138829,0.010846,1.0,7.203125,0.00934,inf,0.870614
22640,"(ALARM CLOCK BAKELIKE GREEN, ROUND SNACK BOXES...",(ALARM CLOCK BAKELIKE RED ),0.010846,0.08026,0.010846,1.0,12.459459,0.009975,inf,0.929825
47878,"(ALARM CLOCK BAKELIKE GREEN, PLASTERS IN TIN S...","(SET/6 RED SPOTTY PAPER CUPS, ROUND SNACK BOXE...",0.010846,0.013015,0.010846,1.0,76.833333,0.010705,inf,0.997807
22631,"(ROUND SNACK BOXES SET OF4 WOODLAND , RED RETR...","(ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL...",0.015184,0.067245,0.015184,1.0,14.870968,0.014163,inf,0.947137
22627,"(ALARM CLOCK BAKELIKE GREEN, ROUND SNACK BOXES...",(ALARM CLOCK BAKELIKE RED ),0.015184,0.08026,0.015184,1.0,12.459459,0.013966,inf,0.933921
22626,"(ROUND SNACK BOXES SET OF4 WOODLAND , RED RETR...",(ALARM CLOCK BAKELIKE GREEN),0.015184,0.084599,0.015184,1.0,11.820513,0.0139,inf,0.929515
22615,"(ALARM CLOCK BAKELIKE GREEN, STRAWBERRY LUNCH ...",(POSTAGE),0.015184,0.67462,0.015184,1.0,1.482315,0.004941,inf,0.330396
22601,"(SPACEBOY LUNCH BOX , ALARM CLOCK BAKELIKE GRE...",(ALARM CLOCK BAKELIKE RED ),0.019523,0.08026,0.019523,1.0,12.459459,0.017956,inf,0.938053


In [13]:
top_rules_by_lift

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
41057,"(CHILDRENS CUTLERY DOLLY GIRL , ALARM CLOCK BA...","(ROUND SNACK BOXES SET OF4 WOODLAND , ALARM CL...",0.010846,0.010846,0.010846,1.0,92.2,0.010728,inf,1.0
47784,"(SET/6 RED SPOTTY PAPER CUPS, ALARM CLOCK BAKE...","(ALARM CLOCK BAKELIKE GREEN, ROUND SNACK BOXES...",0.010846,0.010846,0.010846,1.0,92.2,0.010728,inf,1.0
47890,"(PLASTERS IN TIN SPACEBOY, ALARM CLOCK BAKELIK...","(ALARM CLOCK BAKELIKE GREEN, ROUND SNACK BOXES...",0.010846,0.010846,0.010846,1.0,92.2,0.010728,inf,1.0
47888,"(ALARM CLOCK BAKELIKE GREEN, PLASTERS IN TIN S...","(ROUND SNACK BOXES SET OF4 WOODLAND , ALARM CL...",0.010846,0.010846,0.010846,1.0,92.2,0.010728,inf,1.0
41058,"(CHILDRENS CUTLERY DOLLY GIRL , ALARM CLOCK BA...","(ROUND SNACK BOXES SET OF4 WOODLAND , CARD DOL...",0.010846,0.010846,0.010846,1.0,92.2,0.010728,inf,1.0
47864,"(ALARM CLOCK BAKELIKE GREEN, ROUND SNACK BOXES...","(PLASTERS IN TIN SPACEBOY, SET/6 RED SPOTTY PA...",0.010846,0.010846,0.010846,1.0,92.2,0.010728,inf,1.0
47818,"(ALARM CLOCK BAKELIKE GREEN, ROUND SNACK BOXES...","(SET/6 RED SPOTTY PAPER CUPS, ALARM CLOCK BAKE...",0.010846,0.010846,0.010846,1.0,92.2,0.010728,inf,1.0
47815,"(SET/6 RED SPOTTY PAPER PLATES, ROUND SNACK BO...","(ALARM CLOCK BAKELIKE GREEN, SET/6 RED SPOTTY ...",0.010846,0.010846,0.010846,1.0,92.2,0.010728,inf,1.0
47813,"(SET/6 RED SPOTTY PAPER CUPS, PLASTERS IN TIN ...","(SET/6 RED SPOTTY PAPER PLATES, ROUND SNACK BO...",0.010846,0.010846,0.010846,1.0,92.2,0.010728,inf,1.0
47791,"(SET/6 RED SPOTTY PAPER PLATES, ROUND SNACK BO...","(SET/6 RED SPOTTY PAPER CUPS, PLASTERS IN TIN ...",0.010846,0.010846,0.010846,1.0,92.2,0.010728,inf,1.0
