http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

### Import Packages

In [1]:
import pandas as pd
from sklearn import preprocessing
from mlxtend.frequent_patterns import apriori

### Import Data

In [2]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


### Exploratory Data Analysis

In [4]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,887.0,887.0,887.0,887.0,887.0,887.0
mean,0.385569,2.305524,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.25,0.0,0.0,7.925
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.1375
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [3]:
df.shape

(887, 8)

In [5]:
df.isnull().any()

Survived                   False
Pclass                     False
Name                       False
Sex                        False
Age                        False
Siblings/Spouses Aboard    False
Parents/Children Aboard    False
Fare                       False
dtype: bool

#### Age

In [6]:
df['Age'].head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [7]:
df['Age'].unique()

array([22.  , 38.  , 26.  , 35.  , 27.  , 54.  ,  2.  , 14.  ,  4.  ,
       58.  , 20.  , 39.  , 55.  , 23.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 24.  , 40.  , 48.  , 18.  , 66.  , 42.  , 21.  ,
        3.  , 30.  , 16.  ,  7.  , 49.  , 29.  , 65.  , 46.  , 28.5 ,
        5.  , 11.  , 45.  , 64.  , 17.  , 32.  , 25.  ,  0.83, 33.  ,
       59.  , 71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  ,
       36.5 , 51.  , 55.5 , 40.5 , 44.  ,  1.  , 60.  , 61.  , 56.  ,
       50.  , 36.  , 45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 57.  ,
       23.5 ,  0.92, 43.  , 10.  , 13.  ,  0.75, 69.  , 53.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [8]:
def age_imputer(x):
    if x < 20:
        return 'youth'
    elif 21 < x < 50:
        return 'adult'
    else:
        return 'elderly'

In [9]:
df['Age'] = df['Age'].apply(age_imputer)

In [10]:
Age_dummies = pd.get_dummies(df['Age'], prefix='Age', drop_first=True)
Age_dummies.head()

Unnamed: 0,Age_elderly,Age_youth
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


#### Sex

In [11]:
# Create an encoder
sex_encoder = preprocessing.LabelEncoder()

In [12]:
df['Sex'] = sex_encoder.fit_transform(df['Sex'])

In [13]:
df['Sex'].head()

0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int64

In [41]:
print("There are {}% males".format(df['Sex'].sum() / df['Sex'].count()))

There are 0.6459977452085682% males


#### Passenger Class

In [14]:
# Convert the Passenger class feature into dummy variables using one-hot and leave one first category
Pclass_dummies = pd.get_dummies(df['Pclass'], prefix='Pclass', drop_first=True)
Pclass_dummies.head()

Unnamed: 0,Pclass_2,Pclass_3
0,0,1
1,0,0
2,0,1
3,0,0
4,0,1


In [36]:
pclass_pivot = pd.pivot_table(df, index='Pclass', values='Name', aggfunc='count', margins=True, margins_name='Total')
pclass_pivot['Percent'] = pclass_pivot['Name'] / pclass_pivot.Name.Total
pclass_pivot

Unnamed: 0_level_0,Name,Percent
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,216,0.243517
2,184,0.207441
3,487,0.549042
Total,887,1.0


#### Survived

In [15]:
df['Survived'].unique()

array([0, 1], dtype=int64)

#### Siblings/Spouses

In [16]:
df['Siblings/Spouses Aboard'].unique()

array([1, 0, 3, 4, 2, 5, 8], dtype=int64)

#### Parents/Children Aboard

In [17]:
df['Parents/Children Aboard'].unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)

### Data Preprocessing

In [18]:
data = df[['Survived', 'Pclass', 'Sex', 'Age']]
data = data.rename(columns={'Survived':'Survived_yes', 'Sex':'Sex_male'})
data = pd.concat([data, Pclass_dummies, Age_dummies], axis=1)
data = data.drop(['Pclass', 'Age'], axis=1)
data.head()

Unnamed: 0,Survived_yes,Sex_male,Pclass_2,Pclass_3,Age_elderly,Age_youth
0,0,1,0,1,0,0
1,1,0,0,0,0,0
2,1,0,0,1,0,0
3,1,0,0,0,0,0
4,0,1,0,1,0,0


In [19]:
x = True
y = False
print(bool(x))

True


In [20]:
data = data.replace({1:x, 0:y})
data.head()

Unnamed: 0,Survived_yes,Sex_male,Pclass_2,Pclass_3,Age_elderly,Age_youth
0,False,True,False,True,False,False
1,True,False,False,False,False,False
2,True,False,False,True,False,False
3,True,False,False,False,False,False
4,False,True,False,True,False,False


In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 6 columns):
Survived_yes    887 non-null bool
Sex_male        887 non-null bool
Pclass_2        887 non-null bool
Pclass_3        887 non-null bool
Age_elderly     887 non-null bool
Age_youth       887 non-null bool
dtypes: bool(6)
memory usage: 5.3 KB


### Apriori / Association Rule Mining 

- **Support** is an indication of how frequently the itemset appears in the dataset.

In [22]:
apriori_df = apriori(data, min_support=0.1, use_colnames=True).sort_values(by='support', ascending=False).reset_index(drop=True)
apriori_df

Unnamed: 0,support,itemsets
0,0.645998,(Sex_male)
1,0.549042,(Pclass_3)
2,0.386697,"(Pclass_3, Sex_male)"
3,0.385569,(Survived_yes)
4,0.224352,(Age_youth)
5,0.207441,(Pclass_2)
6,0.16009,"(Pclass_3, Age_youth)"
7,0.157835,(Age_elderly)
8,0.13416,"(Pclass_3, Survived_yes)"
9,0.124014,"(Sex_male, Age_youth)"


#### Filtering Itemsets

In [42]:
apriori_df['length'] = apriori_df['itemsets'].apply(lambda x: len(x))
apriori_df[ (apriori_df['length'] >= 2) & (apriori_df['support'] >= 0.1) ]

Unnamed: 0,support,itemsets,length
2,0.386697,"(Pclass_3, Sex_male)",2
6,0.16009,"(Pclass_3, Age_youth)",2
8,0.13416,"(Pclass_3, Survived_yes)",2
9,0.124014,"(Sex_male, Age_youth)",2
10,0.122886,"(Survived_yes, Sex_male)",2
11,0.121759,"(Pclass_2, Sex_male)",2
12,0.114994,"(Age_elderly, Sex_male)",2
13,0.100338,"(Survived_yes, Age_youth)",2
