#### FP-Growth Algorithm Model on 30% of the Data

In [7]:
# imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from fpgrowth_py import fpgrowth
import pyfpgrowth
import random
import matplotlib.pyplot as plt

In [21]:
# Read a random ~30% sample of the prior-orders rows.
# A dedicated generator with a fixed (arbitrary) seed makes the sample, and
# therefore everything derived from it, reproducible from run to run; change
# the seed to draw a different sample.
# NOTE(review): rows are sampled independently of one another, so an order
# that spans several rows is only partially retained — confirm this is
# acceptable for the per-order grouping done later in this file.
p = 0.30
prior_rng = random.Random(42)
order_prior = pd.read_csv(
    'data/order_products__prior.csv',
    header=0,
    # row 0 is the header and is always kept; every other row is skipped
    # with probability 1 - p
    skiprows=lambda i: i > 0 and prior_rng.random() > p,
)

In [22]:
# Read a random ~30% sample of the train-orders rows.
# A dedicated generator with a fixed (arbitrary) seed makes the sample, and
# therefore everything derived from it, reproducible from run to run; change
# the seed to draw a different sample.
# NOTE(review): rows are sampled independently of one another, so an order
# that spans several rows is only partially retained — confirm this is
# acceptable for the per-order grouping done later in this file.
p = 0.30
train_rng = random.Random(43)
order_train = pd.read_csv(
    'data/order_products__train.csv',
    header=0,
    # row 0 is the header and is always kept; every other row is skipped
    # with probability 1 - p
    skiprows=lambda i: i > 0 and train_rng.random() > p,
)

In [23]:
# Load the product table; it is joined to the order rows on product_id
# further down.
products_csv = 'data/products.csv'
products = pd.read_csv(products_csv)

**Merge products and prior Orders**

In [58]:
# Merge products with the sampled prior orders on product_id.
# No `how` is given, so pandas' default join type applies.
prod_orders_prior = products.merge(order_prior, on='product_id')

**Merge products and train Orders**

In [59]:
# Merge products with the sampled train orders on product_id.
# No `how` is given, so pandas' default join type applies.
prod_orders_train = products.merge(order_train, on='product_id')

**Concatenate and preprocess prior and train Orders**

In [26]:
# Stack the train rows and the prior rows into a single table.
# ignore_index=True renumbers the rows from 0; sort=False leaves the columns
# unsorted.
order_frames = [prod_orders_train, prod_orders_prior]
orders_train_prior = pd.concat(order_frames, sort=False, ignore_index=True)

In [27]:
# Normalise product names to lower case.
lowered_names = orders_train_prior['product_name'].str.lower()
orders_train_prior['product_name'] = lowered_names

In [28]:
# Remove commas from product names: the names of each order are later joined
# with ',' and split on ',' again, so a comma inside a name would break it
# into two items.
names_without_commas = orders_train_prior['product_name'].str.replace(",", "")
orders_train_prior['product_name'] = names_without_commas

In [29]:
# Collapse the table to one row per order_id: the product names of each order
# are concatenated into one comma-separated string in column 'product_name'.
names_per_order = orders_train_prior.groupby('order_id')['product_name']
df_all_orders = names_per_order.agg(','.join).reset_index()

In [30]:
# Use order_id as the row index instead of a regular column.
df_all_orders = df_all_orders.set_index('order_id')

In [31]:
# Turn each order's comma-separated string back into a list of product names.
df_orders_split = df_all_orders['product_name'].str.split(',')

In [32]:
# Wrap the Series of product lists in a single-column DataFrame.
df_orders = df_orders_split.to_frame()

In [33]:
# Mine frequent itemsets with FP-growth, using each order's product list as
# one transaction.
# 1000 is the support threshold handed to pyfpgrowth (presumably an absolute
# number of transactions — confirm against the pyfpgrowth documentation).
support_threshold = 1000
transactions = df_orders['product_name']
patterns_all = pyfpgrowth.find_frequent_patterns(transactions, support_threshold)

In [37]:
# Derive association rules from the frequent itemsets, passing 0.05 as the
# confidence threshold.
confidence_threshold = 0.05
rules = pyfpgrowth.generate_association_rules(patterns_all, confidence_threshold)

In [46]:
# Load the rules into a DataFrame and transpose it.
# NOTE(review): presumably `rules` is a dict, in which case its keys end up in
# the index and the elements of each value in columns 0 and 1 — confirm.
rules_as_columns = pd.DataFrame(rules)
df_rules = rules_as_columns.transpose()

In [48]:
# Order the rules by column 1, largest value first (column 1 is renamed to
# 'confidence_level' further down).
df_rules_sorted = df_rules.sort_values(
    by=1,
    ascending=False,
)

In [49]:
# Move the first index level into a regular column so it can be displayed and
# saved alongside the other columns.
df_rules_sorted = df_rules_sorted.reset_index(level=0)

In [53]:
# Give the three columns descriptive names: the former index level
# ('level_0'), column 0 and column 1.
# NOTE(review): pyfpgrowth's generate_association_rules appears to key its
# result by the antecedent and to store (consequent, confidence) as the value.
# That would make 'level_0' the antecedent and column 0 the consequent, i.e.
# the two labels below look swapped — verify against the pyfpgrowth docs
# before relying on these column names.
column_names = {'level_0':'consequent', 0:'antecedent', 1: 'confidence_level'}
df_rules_sorted = df_rules_sorted.rename(columns=column_names)

In [54]:
# Take out brackets and commas from the string form of the 'antecedent' column.
# regex=True is passed explicitly because the pattern is a regular expression:
# pandas 2.0 switched the default of Series.str.replace to literal matching,
# under which this pattern would never match and nothing would be removed.
bracket_or_comma = r'\(|\)|,'
df_rules_sorted['antecedent'] = (
    df_rules_sorted['antecedent']
    .astype(str)
    .str.replace(bracket_or_comma, '', regex=True)
)
# Put the columns in a fixed order.
cols = ['antecedent', 'consequent', 'confidence_level']
df_rules_sorted = df_rules_sorted.reindex(columns=cols)

In [61]:
# Preview the first two rows of the rules table.
df_rules_sorted.iloc[:2]

Unnamed: 0,antecedent,consequent,confidence_level
0,'banana',bartlett pears,0.123199
1,'banana',broccoli crown,0.103162


In [56]:
# Write the rules table to a CSV file in the data folder, leaving out the
# row index.
rules_csv = 'data/rules_30_percent.csv'
df_rules_sorted.to_csv(rules_csv, index=False)