# Prepare data

In [1]:
# Filesystem locations, relative to the notebook's working directory.
raw_input_dir = "."   # root under which the raw dataset folder is looked up when loading the CSV
input_dir = "./data"  # NOTE(review): not referenced by the cells visible here - confirm it is still needed

In [2]:
import pandas as pd
import numpy as np
import os

# Load the raw housing table from the dataset folder under `raw_input_dir`.
housing_csv_path = os.path.join(raw_input_dir, "housing-price-prediction/Housing.csv")
df_raw = pd.read_csv(housing_csv_path)

# Preview the first rows as the cell's displayed result.
df_raw.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
from fca_utils import common_prepare

# Column roles for the housing dataset, grouped by how they are handed to `common_prepare`.
housing_target = "price"
housing_cat_cols = ["furnishingstatus"]
housing_int_cols = ["area", "bedrooms", "bathrooms", "stories", "parking"]
# Columns holding "yes"/"no" strings in the raw file.
housing_bool_cols = [
    "mainroad",
    "guestroom",
    "basement",
    "hotwaterheating",
    "airconditioning",
    "prefarea",
]

# Build the working frame from the raw table via the project helper.
# "yes"/"no" values in the boolean columns are substituted with 1/0 through `bool_subs`.
# NOTE(review): the displayed rows show `price` as 1, so the helper presumably binarizes
# the target - confirm the thresholding rule in fca_utils.common_prepare.
df = common_prepare(
    df_raw,
    bool_columns=housing_bool_cols,
    categorical_cols=housing_cat_cols,
    ordinal_columns=housing_int_cols,
    y_target=housing_target,
    bool_subs={"yes": 1, "no": 0},
)

# Preview the prepared frame as the cell's displayed result.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,furnishingstatus,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea
0,1,7420,4,2,3,2,furnished,1,0,0,0,1,1
1,1,8960,4,4,4,3,furnished,1,0,0,0,1,0
2,1,9960,3,2,2,2,semi-furnished,1,0,1,0,0,1
3,1,7500,4,2,2,3,furnished,1,0,1,0,1,1
4,1,7420,4,1,2,2,furnished,1,1,1,0,1,0


In [4]:
# Sanity check on the prepared frame: print its column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   parking           545 non-null    int64 
 6   furnishingstatus  545 non-null    object
 7   mainroad          545 non-null    int64 
 8   guestroom         545 non-null    int64 
 9   basement          545 non-null    int64 
 10  hotwaterheating   545 non-null    int64 
 11  airconditioning   545 non-null    int64 
 12  prefarea          545 non-null    int64 
dtypes: int64(12), object(1)
memory usage: 55.5+ KB


# FCA

In [5]:
from itertools import product
import fcalc

In [6]:
from fca_utils import build_configs


# Grid 1: `area` is listed under 'factorize_cols' and only `furnishingstatus` under
# 'onehot_cols'; no quantile cuts. (How each key is applied is defined by fca_utils.)
param_grid1 = dict(
    qcut_cols=[{}],
    onehot_cols=[["furnishingstatus"]],
    factorize_cols=[["area"]],
    binarize_all=[True],
)

# Grid 2: nothing is factorized; `area` is either left as-is ({}) or given a 'qcut_cols'
# value of 2, 3, 4 or 10, combined with two alternative one-hot column sets.
param_grid2 = dict(
    qcut_cols=[{}] + [{"area": q} for q in (2, 3, 4, 10)],
    onehot_cols=[
        ["furnishingstatus", "bedrooms", "bathrooms", "stories", "parking"],
        ["furnishingstatus"],
    ],
    factorize_cols=[[]],
    binarize_all=[True],
)

# Search space for the binarized classifier: the configurations expanded from both grids,
# joined with `+` (presumably lists returned by build_configs - confirm in fca_utils).
configs = build_configs(param_grid1) + build_configs(param_grid2)

# Method names handed to `grid_search` through its `methods` argument;
# their meaning is defined by the classifier / fca_utils, not in this notebook.
methods = ["standard", "standard-support", "ratio-support"]

In [7]:
from fca_utils import grid_search

# Run the project's grid search for fcalc's BinarizedBinaryClassifier with n_folds=5,
# over every entry of `configs` and `methods`.
binary_search_outcome = grid_search(
    df,
    fcalc.classifier.BinarizedBinaryClassifier,
    configs=configs,
    methods=methods,
    n_folds=5,
    y_target=housing_target,
)

# The helper yields three values: the winning config, its intersections
# (indexed [0] / [1] by the reporting cell), and the winning score.
(
    best_binary_config,
    best_binary_intersecs,
    best_binary_score,
) = binary_search_outcome

Fitting 33 configurations
1
classifier=<class 'fcalc.classifier.BinarizedBinaryClassifier'>
config={'qcut_cols': {}, 'onehot_cols': ['furnishingstatus'], 'factorize_cols': ['area'], 'binarize_all': True}
method='standard'
f1_macro (mean)=0.7460871314709211
f1_cls_1=[0.796875, 0.7751937984496124, 0.7256637168141593, 0.8159999999999998, 0.7567567567567568]
f1_cls_0=[0.711111111111111, 0.6741573033707866, 0.7047619047619049, 0.7526881720430108, 0.7476635514018691]
f1_macro=[0.7539930555555555, 0.7246755509101995, 0.7152128107880321, 0.7843440860215053, 0.7522101540793129]

2
classifier=<class 'fcalc.classifier.BinarizedBinaryClassifier'>
config={'qcut_cols': {}, 'onehot_cols': ['furnishingstatus', 'bedrooms', 'bathrooms', 'stories', 'parking'], 'factorize_cols': [], 'binarize_all': True}
method='standard'
f1_macro (mean)=0.7460871314709211
f1_cls_1=[0.796875, 0.7751937984496124, 0.7256637168141593, 0.8159999999999998, 0.7567567567567568]
f1_cls_0=[0.711111111111111, 0.6741573033707866, 0.

In [8]:
from collections import Counter

# Report the winner of the binarized search. Each print emits one section; the trailing
# empty string reproduces the blank separator line between sections.
print("Best config for binary", best_binary_config, "", sep="\n")

# Ten highest-count intersections from entry [0] (labelled positive) and entry [1] (negative).
print(
    "Most important intersections (positive):",
    "\n".join(str(entry) for entry in best_binary_intersecs[0].most_common(10)),
    "",
    sep="\n",
)
print(
    "Most important intersections (negative):",
    "\n".join(str(entry) for entry in best_binary_intersecs[1].most_common(10)),
    "",
    sep="\n",
)

print("Best f1_macro score for binary", best_binary_score, sep="\n")

Best config for binary
('ratio-support', {'qcut_cols': {'area': 4}, 'onehot_cols': ['furnishingstatus', 'bedrooms', 'bathrooms', 'stories', 'parking'], 'factorize_cols': [], 'binarize_all': True})

Most important intersections (positive):
(('mainroad',), 4726)
(('mainroad', 'bedrooms_3'), 2228)
(('mainroad', 'bedrooms_3', 'bathrooms_1'), 2209)
(('mainroad', 'furnishingstatus_semi-furnished'), 1282)
(('mainroad', 'stories_2'), 1261)
(('mainroad', 'furnishingstatus_semi-furnished', 'bathrooms_1'), 1026)
(('mainroad', 'airconditioning'), 863)
(('mainroad', 'furnishingstatus_semi-furnished', 'bedrooms_3', 'bathrooms_1'), 760)
(('mainroad', 'bedrooms_3', 'parking_0'), 738)
(('mainroad', 'area_3'), 674)

Most important intersections (negative):
(('mainroad', 'bathrooms_1'), 4570)
(('mainroad', 'bathrooms_1', 'parking_0'), 3150)
(('bathrooms_1',), 2042)
(('mainroad', 'bathrooms_1', 'stories_1'), 2019)
(('bathrooms_1', 'parking_0'), 1919)
(('mainroad', 'bedrooms_2', 'bathrooms_1', 'stories_1')

## Pattern FCA

In [9]:
# Grid 1 (pattern search): `area` is listed under 'factorize_cols'; one-hot encoding is either
# skipped entirely or applied to `furnishingstatus` only. 'binarize_all' is off in both grids.
param_grid1 = dict(
    qcut_cols=[{}],
    onehot_cols=[[], ["furnishingstatus"]],
    factorize_cols=[["area"]],
    binarize_all=[False],
)

# Grid 2 (pattern search): nothing is factorized; `area` is either left as-is ({}) or given a
# 'qcut_cols' value of 2 or 4, combined with two alternative one-hot column sets.
param_grid2 = dict(
    qcut_cols=[{}] + [{"area": q} for q in (2, 4)],
    onehot_cols=[
        ["furnishingstatus", "bedrooms", "bathrooms", "stories", "parking"],
        ["furnishingstatus"],
    ],
    factorize_cols=[[]],
    binarize_all=[False],
)

# Search space for the pattern classifier: the configurations expanded from both grids,
# joined with `+` (presumably lists returned by build_configs - confirm in fca_utils).
# This rebinds `configs`, replacing the value used for the binarized search.
configs = build_configs(param_grid1) + build_configs(param_grid2)

# Method names handed to `grid_search`; the same three names as used for the binarized search.
methods = ["standard", "standard-support", "ratio-support"]

# Run the project's grid search for fcalc's PatternBinaryClassifier with n_folds=5,
# over every entry of `configs` and `methods`; the categorical column list is passed along too.
pattern_search_outcome = grid_search(
    df,
    fcalc.classifier.PatternBinaryClassifier,
    categorical=housing_cat_cols,
    configs=configs,
    methods=methods,
    n_folds=5,
    y_target=housing_target,
)

# NOTE(review): `best_pattern_intersectinos` is a misspelling of "intersections". The name is
# kept as-is because the reporting cell reads the result under exactly this spelling.
(
    best_pattern_config,
    best_pattern_intersectinos,
    best_pattern_score,
) = pattern_search_outcome

Fitting 24 configurations
1
classifier=<class 'fcalc.classifier.PatternBinaryClassifier'>
config={'qcut_cols': {}, 'onehot_cols': [], 'factorize_cols': ['area'], 'binarize_all': False}
method='standard'
f1_macro (mean)=0.8077622825165521
f1_cls_1=[0.8495575221238938, 0.8032786885245901, 0.7884615384615384, 0.8148148148148148, 0.7789473684210526]
f1_cls_0=[0.838095238095238, 0.7500000000000001, 0.8070175438596491, 0.8181818181818182, 0.8292682926829269]
f1_macro=[0.8438263801095659, 0.7766393442622951, 0.7977395411605938, 0.8164983164983165, 0.8041078305519898]

2
classifier=<class 'fcalc.classifier.PatternBinaryClassifier'>
config={'qcut_cols': {}, 'onehot_cols': ['furnishingstatus'], 'factorize_cols': ['area'], 'binarize_all': False}
method='standard'
f1_macro (mean)=0.8003398352786176
f1_cls_1=[0.8392857142857144, 0.8099173553719008, 0.75, 0.8148148148148148, 0.7741935483870969]
f1_cls_0=[0.8301886792452831, 0.7628865979381443, 0.7719298245614035, 0.8181818181818182, 0.832]
f1_macro=

In [10]:
# Report the winner of the pattern search (the grids with 'binarize_all' set to False).
# The headings say "pattern" so this report can be told apart from the binarized one,
# whose headings say "binary".
print("Best config for pattern")
print(best_pattern_config)
print()

# Ten highest-count intersections from entry [0] (labelled positive) and entry [1] (negative).
# The variable keeps the spelling under which the grid-search cell defines it.
print("Most important intersections (positive):")
print("\n".join(map(str, best_pattern_intersectinos[0].most_common(10))))
print()
print("Most important intersections (negative):")
print("\n".join(map(str, best_pattern_intersectinos[1].most_common(10))))
print()

print("Best f1_macro score for pattern")
print(best_pattern_score)

Best config for binary
('standard', {'qcut_cols': {}, 'onehot_cols': [], 'factorize_cols': ['area'], 'binarize_all': False})

Most important intersections (positive):
(('mainroad', 'guestroom', 'basement', 'hotwaterheating', 'prefarea'), 781)
(('mainroad', 'guestroom', 'basement', 'hotwaterheating'), 519)
(('bedrooms', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'prefarea'), 487)
(('mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea'), 485)
(('bedrooms', 'bathrooms', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'prefarea'), 388)
(('mainroad', 'hotwaterheating'), 378)
(('mainroad', 'guestroom', 'hotwaterheating'), 378)
(('bedrooms', 'bathrooms', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea'), 373)
(('mainroad', 'guestroom', 'basement', 'hotwaterheating', 'prefarea', 'furnishingstatus'), 370)
(('bedrooms', 'mainroad', 'guestroom', 'basement', 'hotwaterheating'), 364)

Most important intersecti