In [94]:
import numpy as np
import numpy.random as nrd
import pandas as pd
import matplotlib.pyplot as plt

import yaml

from sklearn import tree

from itertools import combinations

# Exercise 14.4

Cluster the demographic data of Table 14.1 using a classification
tree. Specifically, generate a reference sample of the same size of the training set, by randomly permuting the values within each feature. Build a
classification tree to the training sample (class 1) and the reference sample
(class 0) and describe the terminal nodes having highest estimated class 1
probability. Compare the results to the PRIM results near Table 14.1 and
also to the results of K-means clustering applied to the same data.

# Solution

Refer to [link](https://web.stanford.edu/~hastie/ElemStatLearn/datasets/marketing.info.txt) for a description of each feature and its values.

In [2]:
# Location of the raw marketing data
url_link = 'https://web.stanford.edu/~hastie/ElemStatLearn/datasets/marketing.data'

# A companion yaml file tags every feature as Ordinal or Categorical
# (the tag decides how the feature is transformed later on)
feature_file_yaml = 'feature_type.yaml'

with open(feature_file_yaml, 'r') as yaml_handle:
    features_type = yaml.safe_load(yaml_handle)

# Dictionaries keep insertion order (python >= 3.7), so the feature names
# come out in the order of the loaded mapping
features = list(features_type)

Load data from url, and remove observations with NA values,

In [3]:
# Read the space-separated file (it has no header row), drop every observation
# containing a missing value and renumber the remaining rows from 0
df = (
    pd.read_csv(url_link, sep=' ', header=None, names=features)
    .dropna()
    .reset_index(drop=True)
)

df.tail()

Unnamed: 0,Income,Sex,Marital status,Age,Education,Occupation,Years in Bay Area,Dual incomes,Number in household,Number of children,Householder status,Type of home,Ethnic classification,Language in home
6871,1,2,5.0,1,1.0,2.0,5.0,1,3.0,2,3.0,1.0,7.0,1.0
6872,2,1,5.0,2,4.0,1.0,5.0,1,4.0,0,3.0,1.0,7.0,1.0
6873,1,2,5.0,1,2.0,1.0,5.0,1,3.0,2,3.0,1.0,7.0,1.0
6874,4,1,1.0,6,4.0,3.0,5.0,2,3.0,1,2.0,3.0,7.0,1.0
6875,6,1,5.0,3,4.0,1.0,5.0,1,1.0,0,2.0,3.0,5.0,1.0


Let's find the median of the ordinal variables,

In [4]:
# Median of every feature tagged as 'Ordinal', keyed by feature name
median = {
    name: np.median(df[name])
    for name in features
    if features_type[name] == 'Ordinal'
}

for name, value in median.items():
    print('Feature = "{}" - Median value = {}'.format(name, value))

Feature = "Income" - Median value = 6.0
Feature = "Age" - Median value = 3.0
Feature = "Education" - Median value = 4.0
Feature = "Years in Bay Area" - Median value = 5.0
Feature = "Number in household" - Median value = 3.0
Feature = "Number of children" - Median value = 0.0


With the above information we can split each 'ordinal' feature in two at its median value (this has already been done in the yaml file associated with this notebook), and turn each 'categorical' feature into one dummy variable per category,

In [5]:
categorical_file_yaml = 'income_data_categorical.yaml'
ordinal_file_yaml = 'income_data_ordinal.yaml'

# Each yaml file holds the rules that turn the original features into dummy features
with open(categorical_file_yaml, 'r') as cat_handle:
    cat_transform = yaml.safe_load(cat_handle)

with open(ordinal_file_yaml, 'r') as ord_handle:
    ord_transform = yaml.safe_load(ord_handle)

# Column names of the dummy dataframe: first one "<feature> = <label>" entry
# per category of every categorical feature ...
dummy_features = [
    feature + ' = ' + label
    for feature, labels in cat_transform.items()
    for label in labels.values()
]

# ... then the dummy names listed under every ordinal feature
dummy_features += [
    dummy_name
    for groups in ord_transform.values()
    for dummy_name in groups
]

This function will be useful to make the dummy dataframe,

In [101]:
# Transform dataframe into dummy dataframe
def _set_dummy(dummy_df, dummy_name, row_mask):
    """Write 1 into column `dummy_name` of `dummy_df` on the rows where `row_mask` is True."""
    # `.loc` would silently create a missing column; keep the KeyError that
    # plain column access raises for an unknown dummy name.
    if dummy_name not in dummy_df.columns:
        raise KeyError(dummy_name)
    # A single `.loc` assignment writes into `dummy_df` itself. The chained form
    # `dummy_df[name][mask] = 1` may write into a temporary copy instead.
    dummy_df.loc[row_mask, dummy_name] = 1


def dummy_dataframe_transform(df,features_type,transform_rule,dummy_features):
    """Encode the features of `df` as 0/1 dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with one column per key of `features_type`.
    features_type : dict
        Maps a feature name to 'Categorical' or 'Ordinal'. Features with any
        other tag are ignored.
    transform_rule : tuple
        Pair `(cat_transform, ord_transform)`.
        `cat_transform[feature]` maps a raw value to a label; the dummy column
        '<feature> = <label>' is set to 1 where the feature equals that value.
        `ord_transform[feature]` maps a dummy column name to a collection of raw
        values; that column is set to 1 where the feature takes one of them.
    dummy_features : list
        Column names of the returned dataframe.

    Returns
    -------
    pandas.DataFrame
        One row per row of `df` (indexed 0..N-1), one column per entry of
        `dummy_features`, holding 0 or 1.

    Raises
    ------
    KeyError
        If a rule refers to a dummy column that is not in `dummy_features`.
    """
    N = df.shape[0]
    cat_transform, ord_transform = transform_rule

    # Create empty dataframe
    dummy_df = pd.DataFrame(0, index=np.arange(N), columns=dummy_features)

    # Fill the dataframe with transformation rules.
    # Masks are converted to plain arrays so that rows are matched by position:
    # `dummy_df` is indexed 0..N-1 whatever the index of `df` is.
    for f, kind in features_type.items():

        # Categorical variables: one dummy column per category
        if kind == 'Categorical':
            for key, label in cat_transform[f].items():
                _set_dummy(dummy_df, f + ' = ' + label, (df[f] == key).to_numpy())

        # Ordinal variables: one dummy column per set of values
        elif kind == 'Ordinal':
            for dummy_name, values in ord_transform[f].items():
                _set_dummy(dummy_df, dummy_name, df[f].isin(values).to_numpy())

    return dummy_df

# Randomise each column of the dataframe independently
def randomise_column_dataframe(df,features,random_seed=140590):
    """Permute, in place, the values of every listed column of `df`.

    A single generator seeded with `random_seed` is used for all columns, and a
    fresh permutation is drawn for each of them in the order given by
    `features`. Nothing is returned: `df` itself is modified.
    """
    generator = nrd.default_rng(seed=random_seed)

    for name in features:
        df[name] = generator.permutation(df[name].to_numpy())

# Compute the support of a set of rules
def support(df,rules):
    """Return the fraction of rows of `df` that satisfy every rule.

    `rules` is an iterable of `(column, value)` pairs; a row satisfies a pair
    when `df[column] == value` holds for it. With no rules every row counts.
    """
    n_rows, _ = df.shape
    satisfied = np.ones(n_rows, dtype=bool)

    for column, wanted in rules:
        matches = (df[column] == wanted).to_numpy()
        satisfied = satisfied & matches

    return np.mean(satisfied)

# Find association rule with highest confidence from an item set
def association_rule(df,rules):
    """Split an item set into the antecedent/consequent pair of highest confidence.

    Every non-empty proper subset of `rules` is tried as antecedent A, with the
    remaining items as consequent B. The confidence of A => B is
    support(rules) / support(A); the split with the largest confidence wins
    (the first one found in case of ties).

    Parameters
    ----------
    df : pandas.DataFrame
        Data over which supports are measured (passed on to `support`).
    rules : sequence of (column, value) pairs
        The item set. Needs at least two items.

    Returns
    -------
    tuple
        `(antecedent, consequent, confidence, lift)`, where `antecedent` is a
        tuple of items, `consequent` a list of items, and
        lift = confidence / support(consequent).

    Raises
    ------
    ValueError
        If fewer than two items are given, or if no row of `df` satisfies the
        whole item set (confidence and lift would be 0/0).
    """
    m = len(rules)
    if m < 2:
        raise ValueError('association_rule needs at least two items '
                         'to form an antecedent and a consequent')

    T_rule = support(df,rules)
    # `not ... > 0` also rejects NaN, which is what an empty dataframe yields
    if not T_rule > 0:
        raise ValueError('the item set has zero support in the data, '
                         'so no association rule can be built from it')

    max_confidence = 0
    opt_antecedent = None

    # Find optimal antecedent, measured in confidence
    for antecedent_size in range(1,m):
        for antecedent in combinations(rules,antecedent_size):
            # A subset of the item set is satisfied by at least the same rows,
            # so its support is >= T_rule > 0 and the division is safe
            confidence = T_rule/support(df,antecedent)
            if confidence > max_confidence:
                max_confidence = confidence
                opt_antecedent = antecedent

    # Get the optimal consequent: the items left out of the antecedent
    opt_consequent = [rule for rule in rules if rule not in opt_antecedent]

    # Compute the lift
    T_consequent = support(df,opt_consequent)
    lift = max_confidence/T_consequent

    return opt_antecedent,opt_consequent,max_confidence,lift

# Print the association rule and its confidence + lift
def print_association_rule(A,B,confidence,lift):
    """Print an association rule A => B in a readable form.

    The first line reports `confidence` and `lift` with two decimals and is
    followed by a blank line. The items of `A` are then listed under 'If,' and
    the items of `B` under 'Then,'. Each item is a `(name, value)` pair.
    """
    headline = 'Association Rule (confidence = {:.2f}, lift = {:.2f})\n'.format(confidence,lift)
    print(headline)

    for title, items in (('If,', A), ('Then,', B)):
        print(title)
        for item in items:
            print('\t- "{}" is {}'.format(*item))

### Training data and simulated data

Let's first create the dummy dataframe for our problem,

In [7]:
# Bundle the categorical and ordinal transformation rules, in that order
transform_rule = (cat_transform, ord_transform)

# Dummy-encode the observed data and show its last rows
dummy_df = dummy_dataframe_transform(
    df,
    features_type=features_type,
    transform_rule=transform_rule,
    dummy_features=dummy_features,
)
dummy_df.tail()

Unnamed: 0,Sex = Male,Sex = Female,Marital status = Married,"Marital status = Living together, not married",Marital status = Divorced or separated,Marital status = Widowed,"Marital status = Single, never married",Occupation = Professional/Managerial,Occupation = Sales Worker,Occupation = Factory Worker/Laborer/Driver,...,Age < 35,Age >= 35,Undergrad or less,Graduate,Years in Bay Area <= 10,Years in Bay Area > 10,Housemates <= 3,Housemates > 3,Without children,With children
6871,0,1,0,0,0,0,1,0,1,0,...,1,0,1,0,0,1,1,0,0,1
6872,1,0,0,0,0,0,1,1,0,0,...,1,0,1,0,0,1,0,1,1,0
6873,0,1,0,0,0,0,1,1,0,0,...,1,0,1,0,0,1,1,0,0,1
6874,1,0,1,0,0,0,0,0,0,1,...,0,1,1,0,0,1,1,0,0,1
6875,1,0,0,0,0,0,1,1,0,0,...,1,0,1,0,0,1,1,0,1,0


We can generate the simulated data (i.e. samples in which every feature follows its original marginal distribution but is independent of the others) by simply permuting each column of the original dataframe independently,

In [8]:
# Work on a copy so that the observed data stays untouched
df_ind = df.copy()

# Permute every column of the copy (done in place), then dummy-encode it
# exactly like the observed data
randomise_column_dataframe(df_ind, features=features)
dummy_df_ind = dummy_dataframe_transform(
    df_ind,
    features_type=features_type,
    transform_rule=transform_rule,
    dummy_features=dummy_features,
)

Now we can gather together the input data and the response,

In [55]:
# Stack the observed rows on top of the permuted rows
X = np.concatenate([dummy_df.to_numpy(), dummy_df_ind.to_numpy()], axis=0)
N = len(X)
# First half of the rows is labelled 1, second half is labelled 0
y = np.concatenate([np.ones(N//2), np.zeros(N//2)])

## Decision tree and Association Rules

Let's use a decision tree over the supervised learning problem we generated to perform Market Basket Analysis over the income dataset. This will find regions in feature space that have high support (they appear often in the observations).

In [118]:
# Fit a classification tree; a float min_samples_leaf is read by scikit-learn
# as a fraction of the samples, so every leaf keeps at least 10% of them
model = tree.DecisionTreeClassifier(random_state=3, min_samples_leaf=0.1)
model.fit(X, y)

# Text rendering of the fitted tree, with the per-class weights of each leaf
model_description = tree.export_text(model, show_weights=True, feature_names=dummy_features)
print(model_description)

|--- Income < $40k <= 0.50
|   |--- Householder status = Own <= 0.50
|   |   |--- weights: [1635.00, 921.00] class: 0.0
|   |--- Householder status = Own >  0.50
|   |   |--- weights: [961.00, 1675.00] class: 1.0
|--- Income < $40k >  0.50
|   |--- Householder status = Own <= 0.50
|   |   |--- Marital status = Married <= 0.50
|   |   |   |--- Type of home = Apartment <= 0.50
|   |   |   |   |--- weights: [1167.00, 1582.00] class: 1.0
|   |   |   |--- Type of home = Apartment >  0.50
|   |   |   |   |--- weights: [463.00, 1245.00] class: 1.0
|   |   |--- Marital status = Married >  0.50
|   |   |   |--- weights: [1027.00, 544.00] class: 0.0
|   |--- Householder status = Own >  0.50
|   |   |--- weights: [1623.00, 909.00] class: 0.0



### Summary of the rules we found

1. Support = 0.19 (random seed = 0)
    - Marital status = Single, never married
    - Age < 35
    - Householder status = Live with Parents/Family

In [102]:
# First item set, as (dummy feature, required value) pairs
rules1 = [
    ('Marital status = Single, never married', True),
    ('Age < 35', True),
    ('Householder status = Live with Parents/Family', True),
]

# Split the item set into the antecedent/consequent with the highest confidence
A, B, confidence, lift = association_rule(dummy_df, rules1)
print_association_rule(A, B, confidence, lift)

Association Rule (confidence = 0.99, lift = 1.68)

If,
	- "Marital status = Single, never married" is True
	- "Householder status = Live with Parents/Family" is True
Then,
	- "Age < 35" is True


2. Support = 0.18 (random seed = 3)
    - Income < $40k
    - Householder status != Own
    - Marital status != Married
    - Type of home = Apartment

In [104]:
# Second item set, as (dummy feature, required value) pairs
rules2 = [
    ('Income < $40k', True),
    ('Householder status = Own', False),
    ('Marital status = Married', False),
    ('Type of home = Apartment', True),
]

# Split the item set into the antecedent/consequent with the highest confidence
A, B, confidence, lift = association_rule(dummy_df, rules2)
print_association_rule(A, B, confidence, lift)

Association Rule (confidence = 0.99, lift = 1.59)

If,
	- "Income < $40k" is True
	- "Marital status = Married" is False
	- "Type of home = Apartment" is True
Then,
	- "Householder status = Own" is False


3. Support = 0.14 (random seed = 14)
    - Marital status != Living together, not married
    - Householder status != Rent
    - Type of home != Apartment
    - Years in Bay Area > 10
    - Age >= 35
    - Income >= $40k

In [116]:
# Third item set, as (dummy feature, required value) pairs
rules3 = [
    ('Marital status = Living together, not married', False),
    ('Householder status = Rent', False),
    ('Type of home = Apartment', False),
    ('Years in Bay Area > 10', True),
    ('Age >= 35', True),
    ('Income >= $40k', True),
]

# Split the item set into the antecedent/consequent with the highest confidence
A, B, confidence, lift = association_rule(dummy_df, rules3)
print_association_rule(A, B, confidence, lift)

Association Rule (confidence = 0.99, lift = 1.37)

If,
	- "Marital status = Living together, not married" is False
	- "Householder status = Rent" is False
	- "Years in Bay Area > 10" is True
	- "Age >= 35" is True
	- "Income >= $40k" is True
Then,
	- "Type of home = Apartment" is False
