#### FP-Growth Algorithm Model on 50% of the Data

In [1]:
# imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from fpgrowth_py import fpgrowth
import pyfpgrowth
import random
import matplotlib.pyplot as plt

In [2]:
# Read a random ~50% sample of the prior-orders rows.
# The header (row 0) is always kept; each later row is skipped with
# probability 1 - p, so the sample size is approximate rather than exact.
# NOTE(review): rows are sampled independently of order_id, so an order may
# keep only part of its products - confirm this is acceptable for the mining step.
p = 0.50
random.seed(42)  # fixed seed so the sampled rows (and the rules mined from them) are reproducible
order_prior = pd.read_csv(
    'data/order_products__prior.csv',
    header=0,
    skiprows=lambda i: i > 0 and random.random() > p,
)

In [3]:
# Read a random ~50% sample of the train-orders rows.
# The header (row 0) is always kept; each later row is skipped with
# probability 1 - p, so the sample size is approximate rather than exact.
# NOTE(review): rows are sampled independently of order_id, so an order may
# keep only part of its products - confirm this is acceptable for the mining step.
p = 0.50
random.seed(42)  # fixed seed so the sampled rows (and the rules mined from them) are reproducible
order_train = pd.read_csv(
    'data/order_products__train.csv',
    header=0,
    skiprows=lambda i: i > 0 and random.random() > p,
)

In [4]:
# load the full products table (no sampling here)
products = pd.read_csv(filepath_or_buffer='data/products.csv')

**Merge products and prior Orders**

In [5]:
# join products to the sampled prior-order rows on product_id (pandas' default inner join)
prod_orders_prior = products.merge(order_prior, on='product_id')

**Merge products and train Orders**

In [6]:
# join products to the sampled train-order rows on product_id (pandas' default inner join)
prod_orders_train = products.merge(order_train, on='product_id')

**Concatenate and preprocess prior and train Orders**

In [7]:
# stack the train rows on top of the prior rows and renumber the index from 0
orders_train_prior = pd.concat(
    objs=[prod_orders_train, prod_orders_prior],
    sort=False,
    ignore_index=True,
)

In [8]:
# normalise product names to lower case
orders_train_prior['product_name'] = orders_train_prior.product_name.str.lower()

In [9]:
# drop commas from product names; a later step joins names with ',' and
# splits on ',' again, so a comma inside a name would break a product in two
orders_train_prior['product_name'] = orders_train_prior.product_name.str.replace(
    pat=",", repl="", regex=False
)

In [10]:
# collapse each order into a single comma-separated string of its product names;
# order_id comes back as a regular column after reset_index
df_all_orders = (
    orders_train_prior
    .groupby('order_id')['product_name']
    .agg(','.join)
    .reset_index()
)

In [11]:
# use order_id as the index
df_all_orders = df_all_orders.set_index('order_id')

In [12]:
# turn each comma-separated string back into a list of product names
df_orders_split = df_all_orders['product_name'].str.split(',')

In [13]:
# wrap the Series of product lists in a one-column dataframe
df_orders = df_orders_split.to_frame()

In [14]:
# Mine frequent itemsets from the per-order product lists,
# using 1000 as the support threshold passed to pyfpgrowth
patterns_all = pyfpgrowth.find_frequent_patterns(
    df_orders.product_name,
    1000,
)

In [35]:
# Derive association rules from the frequent itemsets,
# using 0.10 as the confidence threshold passed to pyfpgrowth
rules = pyfpgrowth.generate_association_rules(
    patterns_all,
    0.10,
)

In [37]:
# Build a dataframe from the rules dict and transpose it so that
# each rule key becomes a row instead of a column
df_rules = pd.DataFrame(rules).transpose()

In [45]:
# order the rules by column 1 (renamed to 'confidence_level' further down), highest first
df_rules_sorted = df_rules.sort_values(by=1, ascending=False)

In [46]:
# move the outermost index level into a regular column so the paired products are displayed
df_rules_sorted = df_rules_sorted.reset_index(level=0)

In [47]:
# second pass: move the remaining index level into a regular column as well
df_rules_sorted = df_rules_sorted.reset_index(level=0)

In [50]:
# give the two former index levels, the rule tuple column (0) and the
# confidence column (1) descriptive names
# NOTE(review): pyfpgrowth appears to key its rules dict by antecedent with
# (consequent, confidence) values; if so, the antecedent/consequent labels
# below are swapped - confirm before relying on the column names.
df_rules_sorted = df_rules_sorted.rename(
    columns={
        'index': 'consequent_1',
        'level_0': 'consequent_2',
        0: 'antecedent',
        1: 'confidence_level',
    }
)

In [51]:
# Strip parentheses and commas from the stringified tuples in 'antecedent'.
# regex=True is required: since pandas 2.0, str.replace treats the pattern as
# a literal string by default, which would leave every value unchanged.
df_rules_sorted['antecedent'] = (
    df_rules_sorted['antecedent']
    .astype(str)
    .str.replace(r'\(|\)|,', '', regex=True)
)
# reorder columns
cols = ['antecedent', 'consequent_1', 'consequent_2', 'confidence_level']
df_rules_sorted = df_rules_sorted.reindex(columns=cols)

In [53]:
# write the rules table to the data folder, without the row index
df_rules_sorted.to_csv(path_or_buf='data/rules_50_percent.csv', index=False)