In [1]:
# Run the helper scripts so that the names they define can be used by the
# cells below.
%run ./distance.py
%run ./hierarchical.py
%run ../lasso_plot/duck_db.py

# from cluster_features.distance import euclidean_distance_pl

# NOTE(review): `DuckDB` is not imported in any visible cell, so it presumably
# comes from ../lasso_plot/duck_db.py — confirm. "./data.duckdb" is a relative
# path.
db = DuckDB("./data.duckdb")

In [2]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the scikit-learn breast-cancer dataset; spaces in the feature names are
# replaced with underscores when the frame is built.
data = load_breast_cancer()
X = pd.DataFrame(
    data.data, columns=[col.replace(" ", "_") for col in data.feature_names]
)
y = pd.DataFrame(pd.Series(data.target), columns=["target"])

# Labels for the ten decile buckets: "Q01" (lowest) .. "Q10" (highest).
# Built once because they are the same for every column.
decile_labels = [f"Q{i:02d}" for i in range(1, 11)]

# Derive two extra features from each of these continuous columns:
#   is_<col>_gt_median  1 when the value is above the column median, else 0
#   <col>_decile        decile bucket label, stored as a plain string
for col in [
    "mean_radius",
    "mean_texture",
    "mean_perimeter",
    "mean_area",
    "mean_smoothness",
    "mean_compactness",
    "mean_concavity",
    "mean_concave_points",
    "mean_symmetry",
]:
    X[f"is_{col}_gt_median"] = (X[col] > X[col].median()).astype(int)
    X[f"{col}_decile"] = pd.qcut(X[col], 10, labels=decile_labels).astype(str)


# Persist both frames. The SQL refers to `X` and `y` by name (presumably
# resolved to the Python variables — confirm against DuckDB.write), so `y` is
# only flattened to a 1-D array after the write.
db.write("create schema if not exists cluster")
db.write("create or replace table cluster.X as (select * from X)")
db.write("create or replace table cluster.y as (select * from y)")

y = y.values.ravel()

X.shape, y.shape

((569, 48), (569,))

In [3]:
# Preview the first rows of the feature frame, including the derived columns.
X.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,is_mean_smoothness_gt_median,mean_smoothness_decile,is_mean_compactness_gt_median,mean_compactness_decile,is_mean_concavity_gt_median,mean_concavity_decile,is_mean_concave_points_gt_median,mean_concave_points_decile,is_mean_symmetry_gt_median,mean_symmetry_decile
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,1,Q10,1,Q10,1,Q10,1,Q10,1,Q10
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,0,Q03,0,Q04,1,Q07,1,Q08,1,Q06
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,1,Q09,1,Q09,1,Q09,1,Q10,1,Q09
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,1,Q10,1,Q10,1,Q10,1,Q10,1,Q10
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,1,Q07,1,Q08,1,Q09,1,Q10,1,Q06


In [4]:
# Read the first rows of cluster.X back through `db`.
db("from cluster.X").head()

mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,is_mean_radius_gt_median,mean_radius_decile,is_mean_texture_gt_median,mean_texture_decile,is_mean_perimeter_gt_median,mean_perimeter_decile,is_mean_area_gt_median,mean_area_decile,is_mean_smoothness_gt_median,mean_smoothness_decile,is_mean_compactness_gt_median,mean_compactness_decile,is_mean_concavity_gt_median,mean_concavity_decile,is_mean_concave_points_gt_median,mean_concave_points_decile,is_mean_symmetry_gt_median,mean_symmetry_decile
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,str,i64,str,i64,str,i64,str,i64,str,i64,str,i64,str,i64,str,i64,str
17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1,"""Q09""",0,"""Q01""",1,"""Q09""",1,"""Q09""",1,"""Q10""",1,"""Q10""",1,"""Q10""",1,"""Q10""",1,"""Q10"""
20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1,"""Q10""",0,"""Q04""",1,"""Q10""",1,"""Q10""",0,"""Q03""",0,"""Q04""",1,"""Q07""",1,"""Q08""",1,"""Q06"""
19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1,"""Q10""",1,"""Q07""",1,"""Q10""",1,"""Q10""",1,"""Q09""",1,"""Q09""",1,"""Q09""",1,"""Q10""",1,"""Q09"""
11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0,"""Q03""",1,"""Q07""",0,"""Q04""",0,"""Q02""",1,"""Q10""",1,"""Q10""",1,"""Q10""",1,"""Q10""",1,"""Q10"""
20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1,"""Q10""",0,"""Q02""",1,"""Q10""",1,"""Q10""",1,"""Q07""",1,"""Q08""",1,"""Q09""",1,"""Q10""",1,"""Q06"""


In [5]:
# Count distinct values per column: `columns(*)` applies the aggregate to every
# column of cluster.X, so the result is a single row with one count per column.
db("select count(distinct columns(*)) from cluster.X")

mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,is_mean_radius_gt_median,mean_radius_decile,is_mean_texture_gt_median,mean_texture_decile,is_mean_perimeter_gt_median,mean_perimeter_decile,is_mean_area_gt_median,mean_area_decile,is_mean_smoothness_gt_median,mean_smoothness_decile,is_mean_compactness_gt_median,mean_compactness_decile,is_mean_concavity_gt_median,mean_concavity_decile,is_mean_concave_points_gt_median,mean_concave_points_decile,is_mean_symmetry_gt_median,mean_symmetry_decile
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
456,479,522,539,474,537,537,542,432,499,540,519,533,528,547,541,533,507,498,545,457,511,514,544,411,529,539,492,500,535,2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10


In [6]:
import itertools
from cluster_features.queries import (
    cts_cts_query,
    binary_binary_query,
    binary_categorical_query,
)
from predictables.util import get_column_dtype

# Single-row frame holding the distinct-value count of every column in
# cluster.X; is_binary() below looks columns up in it.
n_distinct = db("select count(distinct columns(*)) from cluster.X")


def is_binary(col):
    """Return True when ``col`` should be treated as a binary feature.

    A column counts as binary if its name starts with ``"is_"`` or if the
    module-level ``n_distinct`` frame reports exactly two distinct values
    for it.
    """
    # `or` short-circuits, so `n_distinct` is only consulted for columns
    # whose name does not carry the "is_" prefix.
    return bool(col.startswith("is_") or n_distinct[col][0] == 2)


def is_continuous(col):
    """Return True when column ``col`` of ``X`` should be treated as continuous.

    Float columns are always continuous. Integer and boolean columns are
    continuous unless ``is_binary`` flags them. Any other dtype is not
    continuous.
    """
    dtype = X[col].dtype

    if dtype in ["float64", "float32"]:
        return True

    if dtype in ["int64", "int32", "int16", "int8", "bool"]:
        # Integer-like column: continuous exactly when it is not binary.
        return not is_binary(col)

    return False


def get_type(col):
    """Classify ``col`` as ``"continuous"``, ``"binary"`` or ``"categorical"``.

    The continuous check runs first; anything that is neither continuous nor
    binary is reported as categorical.
    """
    if is_continuous(col):
        return "continuous"
    return "binary" if is_binary(col) else "categorical"


binary_cols = [col for col in X.columns if is_binary(col)]

# Classify every column once up front instead of re-deriving both types for
# each of the column pairs visited below.
col_types = {col: get_type(col) for col in X.columns}

# Run one query per column pair, choosing the query builder from the pair's
# types. Pairs with no matching builder (e.g. continuous vs. categorical) are
# skipped.
#
# NOTE(review): the recorded output of this cell shows a ConversionException
# ("Could not convert string 'Q09' to INT64") raised by the SQL that
# binary_categorical_query generates. That SQL lives in cluster_features.queries
# and has to be fixed there; this cell only decides which builder to call.
with db as conn:
    for col1, col2 in itertools.combinations(X.columns.tolist(), 2):
        dtype1, dtype2 = col_types[col1], col_types[col2]

        if dtype1 == "continuous" and dtype2 == "continuous":
            conn.execute(cts_cts_query(col1, col2))
        elif dtype1 == "binary" and dtype2 == "binary":
            conn.execute(binary_binary_query(col1, col2))
        elif dtype1 == "binary" and dtype2 == "categorical":
            conn.execute(binary_categorical_query(col1, col2))
        elif dtype1 == "categorical" and dtype2 == "binary":
            # combinations() yields each pair once, in column order, so a
            # categorical column that precedes a binary one would otherwise be
            # skipped silently. Swap the arguments so the binary column is
            # still passed first, as in the branch above.
            conn.execute(binary_categorical_query(col2, col1))

ConversionException: Conversion Error: Could not convert string 'Q09' to INT64
LINE 13: ... if not exists bin_cat;
        create or replace table bin_cat.is_mean_radius...
                                                   ^

In [None]:
# Inspect the pair table for two binary flag columns (presumably created by
# binary_binary_query in the loop above — confirm).
db("from is_mean_area_gt_median__is_mean_compactness_gt_median")

is_mean_area_gt_median,is_mean_compactness_gt_median,_intersection,_union,jaccard_similarity,hamming_distance
i64,i64,f64,f64,f64,f64
1,1,389.0,569.0,0.683656,180.0
1,0,389.0,569.0,0.683656,180.0
1,1,389.0,569.0,0.683656,180.0
0,1,389.0,569.0,0.683656,180.0
1,1,389.0,569.0,0.683656,180.0
…,…,…,…,…,…
1,1,389.0,569.0,0.683656,180.0
1,1,389.0,569.0,0.683656,180.0
1,1,389.0,569.0,0.683656,180.0
1,1,389.0,569.0,0.683656,180.0


In [None]:
# Name of the first table listed in information_schema whose name starts with
# "is".
db(
    "select table_schema, table_name from information_schema.tables where starts_with(table_name, 'is')"
).to_pandas().table_name[0]

'is_mean_area_gt_median__is_mean_compactness_gt_median'

In [None]:
# Split every "<col1>__<col2>" table name back into its two column names.
# NOTE(review): `pl` is not imported in any visible cell — presumably it leaks
# in from one of the %run scripts; confirm.
name_parts = pl.col("name").str.split("__")

db("show tables").with_columns(
    [
        name_parts.list.get(0).alias("col1"),
        name_parts.list.get(1).alias("col2"),
    ]
)

name,col1,col2
str,str,str
"""area_error__compactness_error""","""area_error""","""compactness_error"""
"""area_error__concave_points_err…","""area_error""","""concave_points_error"""
"""area_error__concavity_error""","""area_error""","""concavity_error"""
"""area_error__fractal_dimension_…","""area_error""","""fractal_dimension_error"""
"""area_error__smoothness_error""","""area_error""","""smoothness_error"""
…,…,…
"""worst_texture__worst_concavity""","""worst_texture""","""worst_concavity"""
"""worst_texture__worst_fractal_d…","""worst_texture""","""worst_fractal_dimension"""
"""worst_texture__worst_perimeter""","""worst_texture""","""worst_perimeter"""
"""worst_texture__worst_smoothnes…","""worst_texture""","""worst_smoothness"""


In [None]:
from math import comb

# Number of unordered pairs among 30 columns (cluster.X has 30 float columns),
# i.e. how many continuous-continuous pairs the loop above visits.
comb(30, 2)

435