In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fim import apriori as fim_apriori
from gsp import apriori as gsp_apriori

# Task 4: Frequent Pattern Mining and Association Rule Mining

In [2]:
# Dimension tables; the first CSV column of each file becomes the frame's index.
df_geo = pd.read_csv('./geography.csv', sep=',', index_col=0)
df_ram = pd.read_csv('./ram.csv', sep=',', index_col=0)

# The sales table is stored as two CSV parts (the original dataset was split
# for VCS purposes). Its first column has no header name and is used as the
# index; per the original author's note, this missing column name is what
# "causes the error".
df_sales_part1 = pd.read_csv('./sales_ram-part1.csv', sep=',', index_col=0)
df_sales_part2 = pd.read_csv('./sales_ram-part2.csv', sep=',', index_col=0)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Row labels of both parts are kept (no ignore_index).
df_sales = pd.concat([df_sales_part1, df_sales_part2])

df_time = pd.read_csv('./time.csv', sep=',', index_col=0)
df_vendor = pd.read_csv('./vendor.csv', sep=',', index_col=0)

  mask |= (ar1 == a)


In [3]:
# Denormalise the sales table in a single chain. Each foreign-key column is
# attached right before the join that consumes it, so the resulting column
# order is: RAM attributes, time attributes, geography attributes, vendor
# attributes and finally the sales amounts.
df_sales_merged = (
    df_sales[["Id", "ram_code"]]
    # df_ram is matched through its index; overlapping names get "_ram".
    .join(df_ram, on="ram_code", rsuffix="_ram")
    .assign(time_code=df_sales["time_code"])
    .join(df_time.set_index("time_code"), on="time_code")
    .assign(geo_code=df_sales["geo_code"])
    .join(df_geo.set_index("geo_code"), on="geo_code")
    .assign(vendor_code=df_sales["vendor_code"])
    # Overlapping vendor columns get the "_vendor" suffix (e.g. name_vendor).
    .join(df_vendor.set_index("vendor_code"), on="vendor_code", rsuffix="_vendor")
    # Index-aligned join with the two sales measures.
    .join(df_sales[["sales_uds", "sales_currency"]])
)

In [4]:
# Preview the first rows of the merged sales table.
df_sales_merged.head()

Unnamed: 0,Id,ram_code,brand,name,memory,memory_type,clock,time_code,year,month,...,week,geo_code,continent,country,region,currency,vendor_code,name_vendor,sales_uds,sales_currency
2602347,3719,1.0,ADATA,Adata,0.5,DDR,400,20130322,2013,3,...,12,25,Europe,Germany,north rhine-westphalla,EUR,32,geizhals_unknown,13.749032,10.65
2602348,3719,1.0,ADATA,Adata,0.5,DDR,400,20130323,2013,3,...,12,18,Europe,Germany,berlin,EUR,32,geizhals_unknown,13.828708,10.65
2602349,3719,1.0,ADATA,Adata,0.5,DDR,400,20130326,2013,3,...,13,28,Europe,Germany,saxony,EUR,32,geizhals_unknown,13.694297,10.65
2602350,3719,1.0,ADATA,Adata,0.5,DDR,400,20130327,2013,3,...,13,25,Europe,Germany,north rhine-westphalla,EUR,32,geizhals_unknown,13.69053,10.65
2602351,3719,1.0,ADATA,Adata,0.5,DDR,400,20130328,2013,3,...,13,27,Europe,Germany,saarland,EUR,32,geizhals_unknown,13.605216,10.65


## Association rules
Due to memory constraints, we are forced to apply aggressive stratified random sampling to the transactions dataset: only 0.7% of the rows in each (year, week) group are kept.

In [5]:
# Stratified sample: keep 0.7% of the rows of every (year, week) group.
# The seed is fixed so the same rows are drawn on every run.
transactions_assoc = (
    df_sales_merged
    .groupby(["year", "week"])
    .sample(frac=0.007, random_state=0)
)
num_trans_assoc = len(transactions_assoc)

print(f"Number of transactions sampled: {num_trans_assoc}")

Number of transactions sampled: 23884


### ram_code level associations

In [6]:
# One basket per (year, week): the list of ram_code values sampled in that week.
trans_ram_code = (
    transactions_assoc
    .groupby(["year", "week"])["ram_code"]
    .apply(list)
)
trans_ram_code.head()

year  week
2013  12      [2824.0, 2438.0, 1235.0, 725.0, 2976.0, 2434.0...
      13      [845.0, 1432.0, 2779.0, 3030.0, 3063.0, 1531.0...
      14      [2366.0, 1430.0, 624.0, 951.0, 965.0, 1532.0, ...
      15      [2533.0, 1050.0, 2364.0, 2123.0, 1489.0, 2406....
      16      [786.0, 765.0, 2363.0, 738.0, 769.0, 2479.0, 2...
Name: ram_code, dtype: object

In [7]:
# Array of baskets (one Python list of ram_code values per week) and its size.
baskets_ram = trans_ram_code.values
num_bask_ram = len(baskets_ram)
num_bask_ram

269

In [8]:
# Print the docstring of fim's apriori to check its parameters
# (tracts, target, supp, zmin, report, ...).
help(fim_apriori)

Help on built-in function apriori in module fim:

apriori(...)
    apriori (tracts, target='s', supp=10, zmin=1, zmax=None, report='a',
             eval='x', agg='x', thresh=10, prune=None, algo='b', mode='',
             border=None)
    Find frequent item sets with the Apriori algorithm.
    tracts  transaction database to mine (mandatory)
            The database must be an iterable of transactions;
            each transaction must be an iterable of items;
            each item must be a hashable object.
            If the database is a dictionary, the transactions are
            the keys, the values their (integer) multiplicities.
    target  type of frequent item sets to find     (default: s)
            s/a   sets/all   all     frequent item sets
            c     closed     closed  frequent item sets
            m     maximal    maximal frequent item sets
            g     gens       generators
            r     rules      association rules
    supp    minimum support of an i

In [9]:
# Mine association rules over the weekly ram_code baskets.
rules_ram = fim_apriori(
    baskets_ram,
    target='r',       # 'r' = association rules
    # NOTE(review): a negative supp presumably means an absolute count
    # (4 baskets) rather than a percentage — the help text shown in this
    # notebook is truncated, so confirm against the PyFIM documentation.
    supp=-4,
    zmin=2,           # item sets of at least two items
    conf=60,
    report='ascl',    # four values reported per rule
)
print('Number of rules:', len(rules_ram))

Number of rules: 3249787


In [10]:
# Labels for the fields of each rule tuple returned with report='ascl'.
report_cols = [
    "Conseq.",
    "Prem.",
    "Support (abs)",
    "Support (frac)",
    "Confidence",
    "Lift",
]

# Rank the rules by lift, breaking ties by absolute support (both descending).
rules_ram_df = pd.DataFrame(rules_ram, columns=report_cols)
rules_ram_df = rules_ram_df.sort_values(by=["Lift", "Support (abs)"], ascending=False)
rules_ram_df.head(10)

Unnamed: 0,Conseq.,Prem.,Support (abs),Support (frac),Confidence,Lift
3695,734.0,"(1386.0, 2045.0)",4,0.01487,1.0,67.25
3,2100.0,"(2319.0, 526.0, 2243.0, 2074.0)",3,0.011152,0.75,67.25
8,1724.0,"(464.0, 2205.0, 2186.0)",3,0.011152,0.75,67.25
10,1724.0,"(464.0, 2186.0)",3,0.011152,0.75,67.25
11,1724.0,"(464.0, 2046.0)",3,0.011152,0.75,67.25
12,1724.0,"(2040.0, 657.0, 2186.0)",3,0.011152,0.75,67.25
13,1724.0,"(2040.0, 657.0, 2046.0)",3,0.011152,0.75,67.25
15,1724.0,"(2040.0, 2205.0, 2186.0)",3,0.011152,0.75,67.25
16,1724.0,"(2040.0, 2205.0)",3,0.011152,0.75,67.25
18,1724.0,"(2040.0, 2046.0)",3,0.011152,0.75,67.25


In [13]:
# Look up the RAM modules involved in the top-ranked rule:
# premise (1386.0, 2045.0) -> consequent 734.0.
df_ram.loc[[1386.0, 2045.0, 734.0]]

Unnamed: 0_level_0,brand,name,memory,memory_type,clock
ram_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1386.0,G.SKILL,G.Skill Ripjaws V,8.0,DDR4,3733
2045.0,KINGSTON,Kingston,8.0,DDR3,1333
734.0,CORSAIR,Corsair Xms3,4.0,DDR3,2000


The support of these rules is very low, even relative to the number of baskets used (at most 4 out of 269 baskets for the rules shown above), making them very unreliable.

In [12]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so this
# cell raises NameError. Presumably a deliberate guard to halt "Run All" before
# the unfinished section that follows — confirm, or replace with an explicit guard.
stop

NameError: name 'stop' is not defined

## Frequent patterns

In [None]:
# Build one basket per (year, month) holding the row labels of a 5% sample of
# that month's sales. reset_index() exposes the row labels as an "index"
# column so they can be collected into lists.
# random_state is fixed so the baskets are reproducible across runs, matching
# the seeded sampling used for the association-rule transactions.
baskets = (
    df_sales_merged
    .reset_index()
    .groupby(["year", "month"])
    .sample(frac=0.05, random_state=0)
    .groupby(["year", "month"])["index"]
    .apply(list)
)
baskets

In [None]:
def _expand_basket(labels):
    """Map every row label of a basket to its ["brand", "country"] values.

    Returns a new list with one 2-D NumPy array per label, holding the "brand"
    and "country" values of the df_sales_merged row(s) carrying that label.
    """
    # An index lookup per label avoids building a boolean mask over the
    # whole frame for every single element.
    return [
        df_sales_merged.loc[[label], ["brand", "country"]].values
        for label in labels
    ]


# Build fresh lists instead of assigning into the lists held by `baskets`:
# Series.copy() copies the Series but not the Python list objects stored in
# it, so in-place writes would also overwrite `baskets` and make this cell
# fail when re-run.
# NOTE(review): every item is a 2-D ndarray (not hashable) — confirm that this
# is the item format gsp_apriori expects.
baskets_exp = baskets.apply(_expand_basket)

baskets_exp

In [None]:
# Run GSP with a minimum support of 50% of the number of baskets
# (integer division).
result_set = gsp_apriori(
    baskets_exp,
    len(baskets_exp) * 50 // 100,
    verbose=False,
)

In [None]:
# NOTE(review): `stophere` is not defined anywhere in the visible notebook, so
# evaluating it raises NameError. Presumably a deliberate marker to halt
# "Run All" at this point — confirm, or replace with an explicit guard.
stophere