### _執行前說明_

須先至 [Microsoft Access 2016 ODBC Driver](https://www.microsoft.com/zh-TW/download/details.aspx?id=54920) 下載 Driver 檔案，安裝後，再安裝 `RODBC`，即可執行。

In [13]:
library(RODBC)
library(RWeka)
library(arules)
library(knitr)

In [121]:
db <- odbcDriverConnect("Driver={Microsoft Access Driver (*.mdb, *.accdb)};
                        DBQ=.\\foodmart2000.mdb")

sql1 <- "
select
    ct.account_num,
    dt.the_date,
    st.store_name,
    pc.product_category
from sales_fact_1998 as tx, product as pd, product_class as pc, store as st, customer as ct, time_by_day as dt
where pd.product_id = tx.product_id
and pd.product_class_id = pc.product_class_id
and st.store_id = tx.store_id
and ct.customer_id = tx.customer_id
and dt.time_id = tx.time_id;"

sql2 <- "
select
    trim(str(ft.customer_id)) & '-' & trim(str(ft.time_id)) & '-' & trim(str(ft.store_id)) as tid,
    pc.product_category as item
from sales_fact_1998 as ft, product as pd, product_class as pc
where pd.product_id = ft.product_id
and pd.product_class_id = pc.product_class_id;"

ft <- as.data.frame(sqlQuery(db, sql2))
ft[, 1] <- as.character(ft[, 1])
ft[, 2] <- as.character(ft[, 2])
# for(ci in 1:4){
#     ft[,ci] <- as.character(ft[,ci])
# }
head(ft)


Unnamed: 0_level_0,tid,item
Unnamed: 0_level_1,<chr>,<chr>
1,2094-748-1,Vegetables
2,2094-748-1,Vegetables
3,2094-748-1,Jams and Jellies
4,2094-748-1,Paper Products
5,2094-748-1,Frozen Desserts
6,2094-748-1,Canned Soup


In [None]:
write.arff(ft, file = 'sales_fact_1998_2.arff')
write.table(
    ft,
    'sales_fact_1998_2.csv',
    row.names = FALSE,
    col.names = FALSE,
    sep = ",",
    quote = FALSE)

In [77]:
# tids <- head(unique(ft$tid), n=50)


In [72]:
tid1 <- tids[[1]]
item1 <- unique(ft[ft$tid==tid1, 2])
c(item1)

In [82]:
tids <- unique(ft$tid)
print(length(tids))

max_len <- 0
for (tid in tids){
    items <- unique(ft[ft$tid==tid, 2])
    item_len <- length(items)
    max_len <- ifelse(item_len > max_len, item_len, max_len)
}

print(max_len)

# max_len <- max_len + 1
header <- paste('I', 1:max_len, sep = '')
result_frame <- rbind(NULL)

for (tid in tids){
    items <- unique(ft[ft$tid==tid, 2])
    items_len <- length(items)
    # print(items_len)
    diff_len <- max_len - items_len
    # print(diff_len)
    row <- c(items, rep('', diff_len))
    # print(row)
    result_frame <- rbind(result_frame, row)
}

result_frame <- rbind(header, result_frame)

print(head(result_frame))

write.arff(result_frame, file = 'sales_fact_1998_tx2.arff')
write.table(
    result_frame,
    'sales_fact_1998_tx2.csv',
    row.names = FALSE,
    col.names = FALSE,
    sep = ",",
    quote = TRUE)

[1] 34070
[1] 18


In [83]:
tx <- read.transactions('sales_fact_1998_tx2.csv',
                        sep = ',',
                        format = 'basket',
                        header = TRUE)
rules <- apriori(tx, parameter = list(supp = 0.0001,
                                      conf = 0.9,
                                      minlen = 2,
                                      maxlen = (max_len - 1),
                                      target = 'rules'))
result <- DATAFRAME(head(rules, by = 'lift', n = 10))
knitr::kable(result, format="markdown")

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.9    0.1    1 none FALSE            TRUE       5   1e-04      2
 maxlen target  ext
     17  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 3 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[45 item(s), 34070 transaction(s)] done [0.01s].
sorting and recoding items ... [45 item(s)] done [0.00s].
creating transaction tree ... done [0.01s].
checking subsets of size 1 2 3 4 5 6 7 8 done [0.05s].
writing ... [9478 rule(s)] done [0.01s].
creating S4 object  ... done [0.01s].




|     |LHS                                                                        |RHS             |   support| confidence|  coverage|     lift| count|
|:----|:--------------------------------------------------------------------------|:---------------|---------:|----------:|---------:|--------:|-----:|
|661  |{Dairy,Frozen Entrees,Kitchen Products,Side Dishes}                        |{Miscellaneous} | 0.0001174|          1| 0.0001174| 64.89524|     4|
|672  |{Canned Soup,Electrical,Kitchen Products,Side Dishes}                      |{Miscellaneous} | 0.0001468|          1| 0.0001468| 64.89524|     5|
|3532 |{Canned Soup,Electrical,Frozen Entrees,Kitchen Products,Side Dishes}       |{Miscellaneous} | 0.0001174|          1| 0.0001174| 64.89524|     4|
|3537 |{Dairy,Electrical,Frozen Entrees,Kitchen Products,Side Dishes}             |{Miscellaneous} | 0.0001174|          1| 0.0001174| 64.89524|     4|
|3542 |{Canned Soup,Dairy,Frozen Entrees,Kitchen Products,Side Dishes}            |{Mi

In [40]:
write.arff(ft, file = 'sales_fact_1998_tx.arff')
write.table(
    ft,
    'sales_fact_1998_tx.csv',
    row.names = FALSE,
    col.names = TRUE,
    sep = ",",
    quote = FALSE)

In [122]:
sql_pc <- "
select distinct product_category from product_class; 
"

pc <- as.data.frame(sqlQuery(db, sql_pc))
pc[,1] <- as.character(pc[,1])

columns <- c("tid", as.vector(t(pc)[1,]))
# print(columns)

len <- length(columns)
# tids <- head(unique(ft$tid), n=500)
tids <- unique(ft$tid)
df <- as.data.frame(matrix(ncol = length(columns), 
                           nrow=0, 
                           dimnames = list(NULL,columns)))

# print(columns)
# print(colnames(df))

ridx <- 1

for(tid in tids){
    df[ridx, 1] = tid
    df[ridx, 2:len] = 0

    pcs <- unique(as.vector(ft[ft$tid==tid,2]))
    
    for(pc in pcs){
        df[ridx, pc] = 1
    }
    
    ridx <- ridx + 1
}

for (idx in 1:length(columns)){
    df[,idx] <- as.factor(df[,idx])
}

head(df)

Unnamed: 0_level_0,tid,Baking Goods,Bathroom Products,Beer and Wine,Bread,Breakfast Foods,Candles,Candy,Canned Anchovies,Canned Clams,...,Paper Products,Pizza,Plastic Products,Pure Juice Beverages,Seafood,Side Dishes,Snack Foods,Specialty,Starchy Foods,Vegetables
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,...,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,2094-748-1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
2,1277-748-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1745-748-1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2312-748-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,8943-748-1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
6,1447-748-1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0


In [123]:
write.arff(df[,2:len], file = 'sales_fact_1998_fp.arff')
write.table(
    df[,2:len],
    'sales_fact_1998_fp.csv',
    row.names = FALSE,
    col.names = TRUE,
    sep = ",",
    quote = FALSE)

### 題目 1

請利用Weka 中的Apriori 演算法，從Foodmart資料庫的交易資料中，探勘符合Minimum Support = 0.0001且Minimum Confidence = 0.9的Association Rules，並列出Confidence最高的前10條Rules。若無法跑出結果，請簡述其原因。

In [92]:
# tx <- read.transactions('sales_fact_1998_tx.csv',
#                         sep = ',',
#                         format = 'single',
#                         header = TRUE,
#                         cols = c('tid', 'item'))

# tx <- read.transactions('sales_fact_1998.csv',
#                         sep = ',',
#                         format = 'basket',
#                         header = TRUE)

tx <- read.transactions('sales_fact_1998_tx2.csv',
                        sep = ',',
                        format = 'basket',
                        header = TRUE)
summary(tx)
# summary(df[,2:len])

transactions as itemMatrix in sparse format with
 49 rows (elements/itemsets/transactions) and
 40 columns (items) and a density of 0.1061224 

most frequent items:
 Vegetables Snack Foods        Meat       Bread       Dairy     (Other) 
         20          17          12          10          10         139 

element (itemset/transaction) length distribution:
sizes
 1  2  3  4  5  6  7  8 13 
 7  6  8  6  7  8  3  3  1 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.000   4.000   4.245   6.000  13.000 

includes extended item information - examples:
             labels
1      Baking Goods
2 Bathroom Products
3     Beer and Wine

In [93]:
# rules <- apriori(df[,2:len], parameter = list(supp = 0.01, conf = 0.9, maxlen = 10))
rules <- apriori(tx, parameter = list(supp = 0.0001, conf = 0.9, maxlen = 10, target = 'rules'))
summary(rules)

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.9    0.1    1 none FALSE            TRUE       5   1e-04      1
 maxlen target  ext
     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 0 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[40 item(s), 49 transaction(s)] done [0.00s].
sorting and recoding items ... [40 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 9 10

"Mining stopped (maxlen reached). Only patterns up to a length of 10 returned!"


 done [0.00s].
writing ... [55664 rule(s)] done [0.01s].
creating S4 object  ... done [0.02s].


set of 55664 rules

rule length distribution (lhs + rhs):sizes
    2     3     4     5     6     7     8     9    10 
   46  1133  4069  7653 10947 12201 10320  6435  2860 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.000   6.000   7.000   6.794   8.000  10.000 

summary of quality measures:
    support          confidence    coverage            lift      
 Min.   :0.02041   Min.   :1    Min.   :0.02041   Min.   : 2.45  
 1st Qu.:0.02041   1st Qu.:1    1st Qu.:0.02041   1st Qu.: 4.90  
 Median :0.02041   Median :1    Median :0.02041   Median : 7.00  
 Mean   :0.02047   Mean   :1    Mean   :0.02047   Mean   :11.03  
 3rd Qu.:0.02041   3rd Qu.:1    3rd Qu.:0.02041   3rd Qu.: 9.80  
 Max.   :0.06122   Max.   :1    Max.   :0.06122   Max.   :49.00  
     count      
 Min.   :1.000  
 1st Qu.:1.000  
 Median :1.000  
 Mean   :1.003  
 3rd Qu.:1.000  
 Max.   :3.000  

mining info:
 data ntransactions support confidence
   tx            49   1e-04        0.9

In [94]:
summary(rules)

set of 55664 rules

rule length distribution (lhs + rhs):sizes
    2     3     4     5     6     7     8     9    10 
   46  1133  4069  7653 10947 12201 10320  6435  2860 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.000   6.000   7.000   6.794   8.000  10.000 

summary of quality measures:
    support          confidence    coverage            lift      
 Min.   :0.02041   Min.   :1    Min.   :0.02041   Min.   : 2.45  
 1st Qu.:0.02041   1st Qu.:1    1st Qu.:0.02041   1st Qu.: 4.90  
 Median :0.02041   Median :1    Median :0.02041   Median : 7.00  
 Mean   :0.02047   Mean   :1    Mean   :0.02047   Mean   :11.03  
 3rd Qu.:0.02041   3rd Qu.:1    3rd Qu.:0.02041   3rd Qu.: 9.80  
 Max.   :0.06122   Max.   :1    Max.   :0.06122   Max.   :49.00  
     count      
 Min.   :1.000  
 1st Qu.:1.000  
 Median :1.000  
 Mean   :1.003  
 3rd Qu.:1.000  
 Max.   :3.000  

mining info:
 data ntransactions support confidence
   tx            49   1e-04        0.9

In [95]:
inspect(head(rules, by = 'lift', n = 10))

     lhs                                  rhs               support   
[1]  {Drinks,Pain Relievers}           => {Canned Tuna}     0.02040816
[2]  {Drinks,Eggs}                     => {Canned Tuna}     0.02040816
[3]  {Eggs,Pain Relievers}             => {Canned Tuna}     0.02040816
[4]  {Electrical,Hot Beverages}        => {Candles}         0.02040816
[5]  {Breakfast Foods,Candy}           => {Candles}         0.02040816
[6]  {Breakfast Foods,Electrical}      => {Candles}         0.02040816
[7]  {Eggs,Jams and Jellies}           => {Decongestants}   0.02040816
[8]  {Jams and Jellies,Meat}           => {Decongestants}   0.02040816
[9]  {Cold Remedies,Frozen Entrees}    => {Canned Sardines} 0.02040816
[10] {Cleaning Supplies,Cold Remedies} => {Canned Sardines} 0.02040816
     confidence coverage   lift count
[1]  1          0.02040816 49   1    
[2]  1          0.02040816 49   1    
[3]  1          0.02040816 49   1    
[4]  1          0.02040816 49   1    
[5]  1          0.02040816 49

### 題目 2

請利用Weka 中的FP-Growth演算法，從Foodmart資料庫的交易資料中，探勘符合Minimum Support = 0.0001 且Minimum Confidence = 0.9的Association Rules，並列出Confidence最高的前10條Rules。若無法跑出結果，請簡述其原因。

### 題目 3

有時候我們有興趣的資料不只有產品間的資訊，也會想要由User Profile探勘顧客的基本資料。請運用Weka，給定Minimum Support = 0.05且Minimum Confidence= 0.9的條件下，探勘Foodmart顧客基本資料的屬性{State_Province, Yearly_Income , Gender , Total_Children , Num_Children_at_Home , Education , Occupation, Houseowner , Num_cars,owned } 間的association rule。(列出10條)

In [132]:
db <- odbcDriverConnect("Driver={Microsoft Access Driver (*.mdb, *.accdb)};
                        DBQ=.\\foodmart2000.mdb")

sql3 <- "
select distinct
    ct.customer_id,
    ct.state_province,
    ct.yearly_income,
    ct.gender,
    ct.total_children,
    ct.num_children_at_home,
    ct.education,
    ct.occupation,
    ct.houseowner,
    ct.num_cars_owned
from sales_fact_1998 tx, customer ct
where tx.customer_id = ct.customer_id;
"

ct <- as.data.frame(sqlQuery(db, sql3))
for(idx in 2:10){
    ct[,idx] <- as.factor(ct[, idx])
}

head(ct)

Unnamed: 0_level_0,customer_id,state_province,yearly_income,gender,total_children,num_children_at_home,education,occupation,houseowner,num_cars_owned
Unnamed: 0_level_1,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,3,WA,$50K - $70K,F,1,1,Bachelors Degree,Professional,Y,2
2,6,WA,$70K - $90K,F,3,0,Bachelors Degree,Professional,Y,3
3,8,DF,$50K - $70K,M,2,2,Bachelors Degree,Professional,Y,3
4,9,BC,$10K - $30K,M,5,3,Partial High School,Skilled Manual,Y,1
5,10,OR,$30K - $50K,M,4,0,Bachelors Degree,Management,N,4
6,11,CA,$50K - $70K,M,4,0,High School Degree,Manual,N,2


In [133]:
write.arff(ct[,2:10], file = 'customer.arff')
write.table(
    df[,2:10],
    'customer.csv',
    row.names = FALSE,
    col.names = TRUE,
    sep = ",",
    quote = FALSE)

In [131]:
items <- c("state_province",
           "yearly_income",
           "gender",
           "total_children",
           "num_children_at_home",
           "education",
           "occupation",
           "houseowner",
           "num_cars_owned")

fact_list <- list()
for(item in items){
    item_sql = paste("select distinct", item, "from customer;")
    fact_rows <- as.data.frame(sqlQuery(db, item_sql))
    facts <- as.vector(t(fact_rows)[1,])
    fact_list[[item]] <- facts
}

head(fact_list)

### 題目 4

請運用Weka探勘Foodmart資料庫中，顧客背景資料與其交易資料之間的關係(Quantitative Association Rules)。例如80%女性顧客常買保養品。請自行嘗試設定Minimum Support Minimum Confidence，找出10條你覺得有意義的Rules。請說明你的作法及相關參數設定。

In [160]:
db <- odbcDriverConnect("Driver={Microsoft Access Driver (*.mdb, *.accdb)};
                        DBQ=.\\foodmart2000.mdb")

sql4 <- "
select
    trim(str(ft.customer_id)) & '-' & trim(str(ft.time_id)) & '-' & trim(str(ft.store_id)) as tid,
    ct.yearly_income,
    ct.gender,
    ct.occupation,
    pc.product_category as item
from sales_fact_1998 as ft, product as pd, product_class as pc, customer as ct
where pd.product_id = ft.product_id
and pd.product_class_id = pc.product_class_id
and ft.customer_id = ct.customer_id
union all
select
    trim(str(ft.customer_id)) & '-' & trim(str(ft.time_id)) & '-' & trim(str(ft.store_id)) as tid,
    ct.yearly_income,
    ct.gender,
    ct.occupation,
    pc.product_category as item
from sales_fact_dec_1998 as ft, product as pd, product_class as pc, customer as ct
where pd.product_id = ft.product_id
and pd.product_class_id = pc.product_class_id
and ft.customer_id = ct.customer_id;"

max_columns <- 5

ft <- as.data.frame(sqlQuery(db, sql4))

for(ci in 1:max_columns){
   ft[,ci] <- as.character(ft[,ci])
}

head(ft)

Unnamed: 0_level_0,tid,yearly_income,gender,occupation,item
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
1,3-819-15,$50K - $70K,F,Professional,Magazines
2,3-838-15,$50K - $70K,F,Professional,Dairy
3,3-838-15,$50K - $70K,F,Professional,Dairy
4,3-838-15,$50K - $70K,F,Professional,Candy
5,3-838-15,$50K - $70K,F,Professional,Starchy Foods
6,3-838-15,$50K - $70K,F,Professional,Kitchen Products


In [161]:
columns <- c("tid")

cate_sql <- "select distinct product_category from product_class;"

cust_attrs <- c("yearly_income", "gender", "education", "occupation")

attr_sqls = paste("select distinct", cust_attrs, "from customer;")

all_attr_sqls = c(attr_sqls, cate_sql)

for( sql in all_attr_sqls){
  pc <- as.data.frame(sqlQuery(db, sql))
  pc[,1] <- as.character(pc[,1])
  columns <- c(columns, as.vector(t(pc)[1,]))
}

len <- length(columns)
# tids <- head(unique(ft$tid), n=5000)
tids <- unique(ft$tid)
df <- as.data.frame(matrix(ncol = length(columns), 
                           nrow=0, 
                           dimnames = list(NULL,columns)))

ridx <- 1
for(tid in tids){
  df[ridx, 1] = tid
  df[ridx, 2:len] = 0
  
  cust_indices <- 2:(length(cust_attrs) + 1)
  for(cust_idx in cust_indices){
    ca <- unique(as.vector(ft[ft$tid==tid, cust_idx]))
    df[ridx, ca] = 1
  }
  
  prod_idx <- max_columns
  pcs <- unique(as.vector(ft[ft$tid==tid, prod_idx]))    
  for(pc in pcs){
    df[ridx, pc] = 1
  }
  
  ridx <- ridx + 1
}

for (idx in 1:length(columns)){
  df[,idx] <- as.factor(df[,idx])
}

head(df)

write.arff(df[,2:len], file = 'q4_customer_tx.arff')
write.table(
  df[,2:len],
  'q4_customer_tx.csv',
  row.names = FALSE,
  col.names = TRUE,
  sep = ",",
  quote = FALSE)

Unnamed: 0_level_0,tid,$10K - $30K,$110K - $130K,$130K - $150K,$150K +,$30K - $50K,$50K - $70K,$70K - $90K,$90K - $110K,F,...,Paper Products,Pizza,Plastic Products,Pure Juice Beverages,Seafood,Side Dishes,Snack Foods,Specialty,Starchy Foods,Vegetables
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,...,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,3-819-15,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,3-838-15,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,6-907-15,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,6-747-15,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
5,8-984-21,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
6,8-1062-21,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 題目 5

在美國由於聖誕節，12月是購物的旺季。請探勘分析比較12月與1~11月的顧客購物行為。有哪些相似的地方，有哪些差異的地方？ 

In [14]:
db <- odbcDriverConnect("Driver={Microsoft Access Driver (*.mdb, *.accdb)};
                        DBQ=.\\foodmart2000.mdb")

sql5_xmas <- "
select
    trim(str(ft.customer_id)) & '-' & trim(str(ft.time_id)) & '-' & trim(str(ft.store_id)) as tid,
    pc.product_category as item
from sales_fact_1998 as ft, product as pd, product_class as pc, time_by_day as dt
where pd.product_id = ft.product_id
and pd.product_class_id = pc.product_class_id
and ft.time_id = dt.time_id
and dt.month_of_year = 12;"


sql5_normal <- "
select
    trim(str(ft.customer_id)) & '-' & trim(str(ft.time_id)) & '-' & trim(str(ft.store_id)) as tid,
    pc.product_category as item
from sales_fact_1998 as ft, product as pd, product_class as pc, time_by_day as dt
where pd.product_id = ft.product_id
and pd.product_class_id = pc.product_class_id
and ft.time_id = dt.time_id
and dt.month_of_year <> 12;"


In [15]:
export_factor_table <- function(db, sql, file_name){
    max_columns <- 2

    ft <- as.data.frame(sqlQuery(db, sql))

    for(ci in 1:max_columns){
       ft[,ci] <- as.character(ft[,ci])
    }

    head(ft)

    columns <- c("tid")

    cate_sql <- "select distinct product_category from product_class;"
    pc <- as.data.frame(sqlQuery(db, cate_sql))
    pc[,1] <- as.character(pc[,1])
    columns <- c(columns, as.vector(t(pc)[1,]))

    len <- length(columns)

    # tids <- head(unique(ft$tid), n=5000)
    tids <- unique(ft$tid)

    df <- as.data.frame(matrix(ncol = length(columns), 
                               nrow=0, 
                               dimnames = list(NULL,columns)))

    ridx <- 1
    for(tid in tids){
        df[ridx, 1] = tid
        df[ridx, 2:len] = 0

        prod_idx <- max_columns
        pcs <- unique(as.vector(ft[ft$tid==tid, prod_idx]))    
        for(pc in pcs){
            df[ridx, pc] = 1
        }

        ridx <- ridx + 1
    }

    for (idx in 1:length(columns)){
        df[,idx] <- as.factor(df[,idx])
    }

    head(df)

    write.arff(df[,2:len], file = paste(file_name, '.arff', sep = ""))
    write.table(
        df[,2:len],
        paste(file_name, '.csv', sep = ""),
        row.names = FALSE,
        col.names = TRUE,
        sep = ",",
        quote = FALSE)
}

In [17]:
export_factor_table(db, sql5_xmas, 'tx_xmas_fp')

In [16]:
export_factor_table(db, sql5_normal, 'tx_normal_fp')