Reference: [mlxtend user guide — Apriori (frequent itemset mining)](http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/)

## Import Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from mlxtend.frequent_patterns import apriori
%matplotlib inline

# Default size (width, height) in inches for every matplotlib figure in this notebook.
plt.rcParams["figure.figsize"] = (15,10)

## Import Data

In [2]:
# Load the transaction data through the public pandas entry point.
# ACCT_NUM, MO_NUM and ZIP_CODE are read as strings so they are handled as
# labels rather than numbers (a numeric parse would also drop leading zeros).
df = pd.read_csv(
    'Schwab.csv',
    dtype={'ACCT_NUM': 'str', 'MO_NUM': 'str', 'ZIP_CODE': 'str'},
)
df.head()

Unnamed: 0,ACCT_NUM,MO_NUM,ORG_NAME,SHARE_CLASS,TRANSACTION_CLASS,COMP_TYPE,TRANSACTION_TYPE,AMOUNT,ZIP_CODE
0,1000129,201701,"Charles Schwab & Co., Inc",A,Retail,Commissions,Redemption,44339861,60565
1,1000128,201701,"Charles Schwab & Co., Inc",F-1,Retail,Performance Based,Exchange Out,94732923,60565
2,1000128,201701,"Charles Schwab & Co., Inc",U1,Retail,Commissions,Sales,34144231,60565
3,1000123,201701,"Charles Schwab & Co., Inc",R-2,Insurance,Percent AUM,Exchange In,66752374,90210
4,1000127,201701,"Charles Schwab & Co., Inc",F-2,Insurance,Fixed Fee,Exchange In,66375012,92620


## Exploratory Data Analysis

In [3]:
# Summary statistics for the numeric columns; the columns read as strings are not included.
df.describe()

Unnamed: 0,AMOUNT
count,2834.0
mean,50185130.0
std,28801060.0
min,1008682.0
25%,25912000.0
50%,49411980.0
75%,75242500.0
max,99997320.0


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(2834, 9)

In [5]:
# Column names of the loaded frame, in alphabetical order.
headers = sorted(df.columns)
headers

['ACCT_NUM',
 'AMOUNT',
 'COMP_TYPE',
 'MO_NUM',
 'ORG_NAME',
 'SHARE_CLASS',
 'TRANSACTION_CLASS',
 'TRANSACTION_TYPE',
 'ZIP_CODE']

## Data Preprocessing

**NOTE:** The `AMOUNT` and `ORG_NAME` columns are removed; only the columns listed in the next cell are kept for preprocessing.

In [6]:
# Columns kept for preprocessing (everything except AMOUNT and ORG_NAME).
preprocess_columns = [
    'SHARE_CLASS',
    'TRANSACTION_CLASS',
    'ZIP_CODE',
    'ACCT_NUM',
    'MO_NUM',
    'COMP_TYPE',
    'TRANSACTION_TYPE',
]

In [7]:
# Select only the preprocessing columns, in the order given by the list.
prep_df = df.loc[:, preprocess_columns]
prep_df.head()

Unnamed: 0,SHARE_CLASS,TRANSACTION_CLASS,ZIP_CODE,ACCT_NUM,MO_NUM,COMP_TYPE,TRANSACTION_TYPE
0,A,Retail,60565,1000129,201701,Commissions,Redemption
1,F-1,Retail,60565,1000128,201701,Performance Based,Exchange Out
2,U1,Retail,60565,1000128,201701,Commissions,Sales
3,R-2,Insurance,90210,1000123,201701,Percent AUM,Exchange In
4,F-2,Insurance,92620,1000127,201701,Fixed Fee,Exchange In


In [8]:
# Per-column dtype and non-null counts, to confirm there are no missing values before encoding.
prep_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 7 columns):
SHARE_CLASS          2834 non-null object
TRANSACTION_CLASS    2834 non-null object
ZIP_CODE             2834 non-null object
ACCT_NUM             2834 non-null object
MO_NUM               2834 non-null object
COMP_TYPE            2834 non-null object
TRANSACTION_TYPE     2834 non-null object
dtypes: object(7)
memory usage: 155.1+ KB


### Get Dummy Variables

In [9]:
# One-hot encode the selected columns: every distinct value of a column
# becomes its own indicator column named <COLUMN>_<value>.
prep_dummies_df = pd.get_dummies(data=prep_df)
prep_dummies_df.head()

Unnamed: 0,SHARE_CLASS_A,SHARE_CLASS_F-1,SHARE_CLASS_F-2,SHARE_CLASS_R-1,SHARE_CLASS_R-2,SHARE_CLASS_R-6,SHARE_CLASS_U1,SHARE_CLASS_U2,TRANSACTION_CLASS_Institutional,TRANSACTION_CLASS_Insurance,...,MO_NUM_201802,MO_NUM_201803,COMP_TYPE_Commissions,COMP_TYPE_Fixed Fee,COMP_TYPE_Percent AUM,COMP_TYPE_Performance Based,TRANSACTION_TYPE_Exchange In,TRANSACTION_TYPE_Exchange Out,TRANSACTION_TYPE_Redemption,TRANSACTION_TYPE_Sales
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0
4,0,0,1,0,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0


In [10]:
# Boolean constants; the print is a quick check of the truth value of x.
x, y = True, False
print(bool(x))

True


In [11]:
# Cast the one-hot indicator columns to real booleans, the input type
# mlxtend's apriori is documented to work with. An explicit cast guarantees
# a bool dtype on every column, does not depend on helper variables from
# another cell, and gives the same result if this cell is run again.
prep_dummies_df = prep_dummies_df.astype(bool)
prep_dummies_df.head()

Unnamed: 0,SHARE_CLASS_A,SHARE_CLASS_F-1,SHARE_CLASS_F-2,SHARE_CLASS_R-1,SHARE_CLASS_R-2,SHARE_CLASS_R-6,SHARE_CLASS_U1,SHARE_CLASS_U2,TRANSACTION_CLASS_Institutional,TRANSACTION_CLASS_Insurance,...,MO_NUM_201802,MO_NUM_201803,COMP_TYPE_Commissions,COMP_TYPE_Fixed Fee,COMP_TYPE_Percent AUM,COMP_TYPE_Performance Based,TRANSACTION_TYPE_Exchange In,TRANSACTION_TYPE_Exchange Out,TRANSACTION_TYPE_Redemption,TRANSACTION_TYPE_Sales
0,True,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
2,False,False,False,False,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,True
3,False,False,False,False,True,False,False,False,False,True,...,False,False,False,False,True,False,True,False,False,False
4,False,False,True,False,False,False,False,False,False,True,...,False,False,False,True,False,False,True,False,False,False


## Apriori / Association Rule Mining

In [12]:
# Mine frequent itemsets with a minimum support of 0.01, rank them from
# most to least frequent, and renumber the rows from zero.
apriori_df = (
    apriori(prep_dummies_df, min_support=0.01, use_colnames=True)
    .sort_values(by='support', ascending=False)
    .reset_index(drop=True)
)
# Number of items in each itemset.
apriori_df['length'] = apriori_df['itemsets'].apply(len)
apriori_df

Unnamed: 0,support,itemsets,length
0,0.337685,(TRANSACTION_CLASS_Insurance),1
1,0.334157,(TRANSACTION_CLASS_Institutional),1
2,0.328158,(TRANSACTION_CLASS_Retail),1
3,0.253705,(TRANSACTION_TYPE_Redemption),1
4,0.253352,(COMP_TYPE_Commissions),1
5,0.251588,(TRANSACTION_TYPE_Exchange In),1
6,0.250176,(COMP_TYPE_Percent AUM),1
7,0.249824,(COMP_TYPE_Performance Based),1
8,0.249118,(TRANSACTION_TYPE_Exchange Out),1
9,0.246648,(COMP_TYPE_Fixed Fee),1


In [13]:
# Keep only itemsets that contain at least two items and have support of
# at least 0.01, then renumber the remaining rows from zero.
filtered_apriori_df = (
    apriori_df
    .loc[(apriori_df['length'] >= 2) & (apriori_df['support'] >= 0.01)]
    .reset_index(drop=True)
)

In [15]:
# Write the filtered itemsets to an Excel workbook in the working directory.
filtered_apriori_df.to_excel(excel_writer='apriori_schwab_output.xlsx')