# Prepare data

In [1]:
# Directory under which the raw dataset folder
# ("mobile-price-classification/") is looked up when the CSV is loaded.
raw_input_dir = "."
# Data directory setting; it is not referenced by the cells visible in this
# part of the notebook.
input_dir = "./data"

In [2]:
import pandas as pd
import numpy as np
import os

# Build the working sample in one chain:
#   1. read the raw training table,
#   2. collapse the target to two classes (price_range >= 2 -> 1, else 0),
#   3. draw a fixed 512-row sample (seeded, so the draw is repeatable),
#   4. renumber the rows from 0.
df_raw = (
    pd.read_csv(os.path.join(raw_input_dir, "mobile-price-classification/train.csv"))
    .assign(price_range=lambda frame: (frame.price_range >= 2).astype(int))
    .sample(512, random_state=42)
    .reset_index(drop=True)
)
df_raw.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,1646,0,2.5,0,3,1,25,0.6,200,2,...,211,1608,686,8,6,11,1,1,0,0
1,1182,0,0.5,0,7,1,8,0.5,138,8,...,275,986,2563,19,17,19,1,0,0,1
2,1972,0,2.9,0,9,0,14,0.4,196,7,...,293,952,1316,8,1,8,1,1,0,0
3,989,1,2.0,0,4,0,17,0.2,166,3,...,256,1394,3892,18,7,19,1,1,0,1
4,615,1,0.5,1,7,0,58,0.5,130,5,...,1021,1958,1906,14,5,5,1,0,0,0


In [3]:
from fca_utils import common_prepare

# Column roles of the mobile-price table; later cells reuse these names.
mobile_target_variable = "price_range"
mobile_bool_columns = [
    "blue", "dual_sim", "four_g", "three_g", "touch_screen", "wifi",
]
mobile_float_columns = ["clock_speed", "m_dep"]
mobile_int_columns = [
    "battery_power", "fc", "int_memory", "mobile_wt", "n_cores", "pc",
    "px_height", "px_width", "ram", "sc_h", "sc_w", "talk_time",
]

# All numeric columns are passed as ordinal (floats first, then ints);
# no column is declared categorical.
df = common_prepare(
    df_raw,
    y_target=mobile_target_variable,
    bool_columns=mobile_bool_columns,
    ordinal_columns=[*mobile_float_columns, *mobile_int_columns],
    categorical_cols=[],
)

# Scale the float columns by 10 (e.g. clock_speed 2.5 -> 25) and then cast the
# whole frame to int.
# NOTE(review): astype(int) drops any fractional part that is left after the
# scaling; this presumably relies on the float columns having one decimal
# place - confirm against the raw data.
df[mobile_float_columns] = df[mobile_float_columns].mul(10)
df = df.astype(int)

df.head()

Unnamed: 0,price_range,clock_speed,m_dep,battery_power,fc,int_memory,mobile_wt,n_cores,pc,px_height,...,ram,sc_h,sc_w,talk_time,blue,dual_sim,four_g,three_g,touch_screen,wifi
0,0,25,6,1646,3,25,200,2,5,211,...,686,8,6,11,0,0,1,1,1,0
1,1,5,5,1182,7,8,138,8,16,275,...,2563,19,17,19,0,0,1,1,0,0
2,0,29,4,1972,9,14,196,7,18,293,...,1316,8,1,8,0,0,0,1,1,0
3,1,20,2,989,4,17,166,3,19,256,...,3892,18,7,19,1,0,0,1,1,0
4,0,5,5,615,7,58,130,5,8,1021,...,1906,14,5,5,1,1,0,1,0,0


In [4]:
# Print a summary of the prepared frame (row count, per-column non-null counts
# and dtypes) to check the result of the integer cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512 entries, 0 to 511
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   price_range    512 non-null    int64
 1   clock_speed    512 non-null    int64
 2   m_dep          512 non-null    int64
 3   battery_power  512 non-null    int64
 4   fc             512 non-null    int64
 5   int_memory     512 non-null    int64
 6   mobile_wt      512 non-null    int64
 7   n_cores        512 non-null    int64
 8   pc             512 non-null    int64
 9   px_height      512 non-null    int64
 10  px_width       512 non-null    int64
 11  ram            512 non-null    int64
 12  sc_h           512 non-null    int64
 13  sc_w           512 non-null    int64
 14  talk_time      512 non-null    int64
 15  blue           512 non-null    int64
 16  dual_sim       512 non-null    int64
 17  four_g         512 non-null    int64
 18  three_g        512 non-null    int64
 19  touch_sc

# Binary FCA 

In [5]:
from itertools import product
import fcalc

In [6]:
from fca_utils import build_configs

# Search space for the binarized classifier. 'qcut_cols' holds five options:
# an empty mapping, then every int and float column mapped to 2, 3, 4 and 10.
param_grid = {
    'onehot_cols': [[]],
    'factorize_cols': [[]],
    'qcut_cols': [{}] + [
        dict.fromkeys(mobile_int_columns + mobile_float_columns, n_bins)
        for n_bins in (2, 3, 4, 10)
    ],
    'binarize_all': [True],
}

configs = build_configs(param_grid)

# Method names handed to the grid search together with the configurations.
methods = ['standard', 'standard-support', 'ratio-support']

In [7]:
import os

# Set both CUDA-related environment variables for this process in one call.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': "1",
    'TORCH_USE_CUDA_DSA': '1',
})

In [8]:
from fca_utils import grid_search

# Run the grid search for the binarized classifier with 5 folds and keep the
# three returned values (best config, intersections, best score) for the
# reporting cell.
(
    best_binary_config,
    best_binary_intersecs,
    best_binary_score,
) = grid_search(
    df,
    fcalc.classifier.BinarizedBinaryClassifier,
    configs=configs,
    methods=methods,
    n_folds=5,
    y_target=mobile_target_variable,
)

Fitting 15 configurations
1
classifier=<class 'fcalc.classifier.BinarizedBinaryClassifier'>
config={'onehot_cols': [], 'factorize_cols': [], 'qcut_cols': {}, 'binarize_all': True}
method='standard'
f1_macro (mean)=0.5049748922174055
f1_cls_1=[0.5294117647058824, 0.48543689320388356, 0.5660377358490567, 0.4583333333333333, 0.5]
f1_cls_0=[0.5384615384615383, 0.48543689320388356, 0.5306122448979591, 0.5185185185185185, 0.43750000000000006]
f1_macro=[0.5339366515837103, 0.48543689320388356, 0.5483249903735079, 0.48842592592592593, 0.46875]

2
classifier=<class 'fcalc.classifier.BinarizedBinaryClassifier'>
config={'onehot_cols': [], 'factorize_cols': [], 'qcut_cols': {'battery_power': 2, 'fc': 2, 'int_memory': 2, 'mobile_wt': 2, 'n_cores': 2, 'pc': 2, 'px_height': 2, 'px_width': 2, 'ram': 2, 'sc_h': 2, 'sc_w': 2, 'talk_time': 2, 'clock_speed': 2, 'm_dep': 2}, 'binarize_all': True}
method='standard'
f1_macro (mean)=0.6583052616913353
f1_cls_1=[0.736, 0.7761194029850746, 0.7175572519083969, 0

In [9]:
from collections import Counter

# Header, the best (method, config) pair, then a blank separator line.
print("Best config for binary", best_binary_config, "", sep="\n")

# Ten most common intersections from the first counter, one per line.
print("Most important intersections (positive):")
print(*best_binary_intersecs[0].most_common(10), sep="\n")
print()

# Same listing for the second counter.
print("Most important intersections (negative):")
print(*best_binary_intersecs[1].most_common(10), sep="\n")
print()

print("Best f1_macro score for binary", best_binary_score, sep="\n")

Best config for binary
('standard-support', {'onehot_cols': [], 'factorize_cols': [], 'qcut_cols': {'battery_power': 4, 'fc': 4, 'int_memory': 4, 'mobile_wt': 4, 'n_cores': 4, 'pc': 4, 'px_height': 4, 'px_width': 4, 'ram': 4, 'sc_h': 4, 'sc_w': 4, 'talk_time': 4, 'clock_speed': 4, 'm_dep': 4}, 'binarize_all': True})

Most important intersections (positive):
((), 226)
(('three_g',), 143)
(('four_g', 'three_g'), 127)
(('wifi',), 113)
(('three_g', 'wifi'), 81)
(('touch_screen',), 76)
(('four_g', 'three_g', 'wifi'), 69)
(('blue',), 69)
(('blue', 'four_g', 'three_g'), 69)
(('dual_sim',), 61)

Most important intersections (negative):
(('three_g',), 89)
(('four_g', 'three_g'), 73)
((), 67)
(('dual_sim',), 59)
(('four_g', 'three_g', 'touch_screen'), 54)
(('dual_sim', 'three_g'), 49)
(('dual_sim', 'four_g', 'three_g'), 47)
(('three_g', 'm_dep_1'), 40)
(('three_g', 'touch_screen'), 38)
(('m_dep_1',), 37)

Best f1_macro score for binary
0.8993604850821191


# Pattern FCA

In [10]:
# Search space for the pattern classifier: the same five 'qcut_cols' options
# (an empty mapping, then every int and float column mapped to 2, 3, 4 and
# 10), this time with 'binarize_all' switched off.
param_grid = {
    'onehot_cols': [[]],
    'factorize_cols': [[]],
    'qcut_cols': [{}] + [
        dict.fromkeys(mobile_int_columns + mobile_float_columns, n_bins)
        for n_bins in (2, 3, 4, 10)
    ],
    'binarize_all': [False],
}

configs = build_configs(param_grid)

methods = ['standard', 'standard-support', 'ratio-support']

# NOTE: `best_pattern_intersectinos` is misspelled, but the reporting cell
# reads exactly this name, so it is kept as is.
(
    best_pattern_config,
    best_pattern_intersectinos,
    best_pattern_score,
) = grid_search(
    df,
    fcalc.classifier.PatternBinaryClassifier,
    configs=configs,
    methods=methods,
    n_folds=5,
    y_target=mobile_target_variable,
    categorical=[],
)

Fitting 15 configurations
1
classifier=<class 'fcalc.classifier.PatternBinaryClassifier'>
config={'onehot_cols': [], 'factorize_cols': [], 'qcut_cols': {}, 'binarize_all': False}
method='standard'
f1_macro (mean)=0.35331712593517045
f1_cls_1=[0.04081632653061225, 0.10344827586206896, 0.04081632653061225, 0.0, 0.03225806451612903]
f1_cls_0=[0.7006369426751593, 0.6486486486486487, 0.6967741935483871, 0.6923076923076924, 0.5774647887323945]
f1_macro=[0.3707266346028858, 0.3760484622553588, 0.3687952600394997, 0.3461538461538462, 0.30486142662426174]

2
classifier=<class 'fcalc.classifier.PatternBinaryClassifier'>
config={'onehot_cols': [], 'factorize_cols': [], 'qcut_cols': {'battery_power': 2, 'fc': 2, 'int_memory': 2, 'mobile_wt': 2, 'n_cores': 2, 'pc': 2, 'px_height': 2, 'px_width': 2, 'ram': 2, 'sc_h': 2, 'sc_w': 2, 'talk_time': 2, 'clock_speed': 2, 'm_dep': 2}, 'binarize_all': False}
method='standard'
f1_macro (mean)=0.9095608456243618
f1_cls_1=[0.9400000000000001, 0.8952380952380952

In [11]:
# Report the grid-search result for the pattern classifier.
# The headings previously said "binary" (copied from the binary report) even
# though the values printed here are the pattern-classifier results.
print("Best config for pattern")
print(best_pattern_config)
print()
# Ten most common intersections from the first counter, one per line.
print("Most important intersections (positive):")
print("\n".join(map(str, best_pattern_intersectinos[0].most_common(10))))
print()
# Same listing for the second counter.
print("Most important intersections (negative):")
print("\n".join(map(str, best_pattern_intersectinos[1].most_common(10))))
print()
print("Best f1_macro score for pattern")
print(best_pattern_score)

Best config for binary
('ratio-support', {'onehot_cols': [], 'factorize_cols': [], 'qcut_cols': {'battery_power': 2, 'fc': 2, 'int_memory': 2, 'mobile_wt': 2, 'n_cores': 2, 'pc': 2, 'px_height': 2, 'px_width': 2, 'ram': 2, 'sc_h': 2, 'sc_w': 2, 'talk_time': 2, 'clock_speed': 2, 'm_dep': 2}, 'binarize_all': False})

Most important intersections (positive):
(('blue', 'touch_screen', 'wifi', 'battery_power', 'px_height', 'px_width', 'ram', 'sc_w', 'clock_speed', 'm_dep'), 6)
(('dual_sim', 'four_g', 'three_g', 'wifi', 'fc', 'int_memory', 'mobile_wt', 'pc', 'px_width', 'ram', 'sc_h', 'sc_w'), 6)
(('dual_sim', 'three_g', 'wifi', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'ram', 'm_dep'), 6)
(('blue', 'four_g', 'three_g', 'fc', 'int_memory', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time'), 6)
(('blue', 'four_g', 'three_g', 'battery_power', 'mobile_wt', 'n_cores', 'px_height', 'px_width', 'ram', 'sc_h', 'talk_time', 'm_dep'), 6)
(('dual_sim', 'wifi', 'battery_power', 'fc', '