In [None]:
import numpy as np
import pandas as pd

from llm_elicited_priors.datasets import (
    load_fake_data, load_uti, load_breast_cancer, 
    load_california_housing, load_wine_quality, load_heart_disease
)

## Fake data:

In [2]:
# Load the fake dataset. The following cells read its `feature_names`,
# `target_names` and `frame` attributes.
# NOTE: `dataset` is rebound by every later section, so run cells in order.
dataset = load_fake_data()

In [3]:
# Names of the feature columns of the fake dataset.
dataset.feature_names

array(['feature 0', 'feature 1', 'feature 2'], dtype=object)

In [4]:
# Name of the target column of the fake dataset.
dataset.target_names

array(['target'], dtype='<U6')

In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
# of the dataset frame; the frame holds the features and the target.
dataset.frame.describe()

Unnamed: 0,feature 0,feature 1,feature 2,target
count,250.0,250.0,250.0,250.0
mean,-0.040335,-0.057954,-0.037685,-0.05938
std,1.020356,0.976924,0.959565,2.315106
min,-2.964529,-2.409922,-2.566658,-6.272688
25%,-0.719637,-0.814574,-0.686765,-1.667914
50%,-0.073336,-0.087925,0.091005,-0.081608
75%,0.634185,0.485746,0.551376,1.54342
max,2.327653,2.913862,2.905067,5.981182


## UTI data:

In [6]:
# Load the UTI (urinary tract infection) dataset.
# NOTE: this rebinds `dataset`; the inspection cells in this section only
# show UTI values if this cell was the most recent loader executed.
dataset = load_uti()

In [7]:
# Names of the UTI dataset's features.
dataset.feature_names

array(['bathroom frequency', 'bedroom frequency',
       'night time awake frequency', 'mean night time heart rate',
       'standard deviation of night time heart rate',
       'mean night time respiratory rate',
       'standard deviation of night time respiratory rate',
       'night time bathroom frequency', 'daytime bathroom frequency',
       'number of previous urinary tract infections',
       'sex (male = 0, female = 1)'], dtype='<U49')

In [8]:
# Class labels of the UTI target (two classes).
# Presumably ordered by target value (0, 1) — TODO confirm in the loader.
dataset.target_names

array(['no urinary tract infection', 'urinary tract infection'],
      dtype='<U26')

## Breast Cancer data:

In [9]:
# Load the breast cancer dataset via the project's own loader
# (imported from `llm_elicited_priors.datasets`).
# NOTE: this rebinds `dataset`; run the cells of this section in order.
dataset = load_breast_cancer()

In [10]:
# Names of the breast cancer dataset's features.
dataset.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [11]:
# Class labels of the breast cancer target (two classes).
# Presumably ordered by target value (0, 1) — TODO confirm in the loader.
dataset.target_names

array(['benign', 'malignant'], dtype='<U9')

In [12]:
# Per-column summary statistics of the frame (features plus `target`).
# The frame is wide, so pandas elides the middle columns ("...") in the
# rendered output.
dataset.frame.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.372583
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,0.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


## California Housing data:

In [13]:
# Load the California housing dataset via the project's own loader
# (imported from `llm_elicited_priors.datasets`).
# NOTE: this rebinds `dataset`; run the cells of this section in order.
dataset = load_california_housing()

In [14]:
# Names of the California housing dataset's features.
dataset.feature_names

array(['median income in block', 'median house age in block',
       'average number of rooms in block',
       'average number of bedrooms in block', 'block population',
       'average house occupancy in block', 'house block latitude',
       'house block longitude'], dtype='<U35')

In [15]:
# Name of the California housing target column.
# NOTE(review): the saved output reads "medium house value"; this is likely
# a typo for "median house value". The string comes from the loader, so any
# fix belongs there — confirm before changing, since the name may be used
# in prompts or as a column key.
dataset.target_names

array(['medium house value (in $100k)'], dtype='<U29')

In [16]:
# Per-column summary statistics of the frame. Here the target column
# carries its descriptive name rather than the generic `target`.
dataset.frame.describe()

Unnamed: 0,median income in block,median house age in block,average number of rooms in block,average number of bedrooms in block,block population,average house occupancy in block,house block latitude,house block longitude,medium house value (in $100k)
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


## Wine Quality data:

In [17]:
# Load the wine quality dataset.
# NOTE: this rebinds `dataset`; run the cells of this section in order.
dataset = load_wine_quality()

In [18]:
# Names of the wine quality dataset's features.
dataset.feature_names

array(['fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype=object)

In [19]:
# Class labels of the wine quality target (two classes).
# Presumably ordered by target value (0, 1) — TODO confirm in the loader.
dataset.target_names

array(['bad quality', 'good quality'], dtype='<U12')

In [20]:
# Per-column summary statistics of the frame (features plus `target`).
dataset.frame.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,7.215307,0.339666,0.318633,5.443235,0.056034,30.525319,115.744574,0.994697,3.218501,0.531268,10.491801,0.633061
std,1.296434,0.164636,0.145318,4.757804,0.035034,17.7494,56.521855,0.002999,0.160787,0.148806,1.192712,0.482007
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,0.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,0.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,1.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,1.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,1.0


## Heart Disease data:

In [21]:
# Load the heart disease dataset.
# NOTE: this rebinds `dataset`; run the cells of this section in order.
dataset = load_heart_disease()

In [22]:
# Names of the heart disease dataset's features. Several names embed the
# value encoding, e.g. "(1 = male, 0 = female)".
dataset.feature_names

array(['age', 'sex (1 = male, 0 = female)',
       'resting blood pressure (on admission to the hospital)',
       'serum cholestoral in mg/dl',
       'fasting blood sugar > 120 mg/dl (1 = true, 0 = false)',
       'resting electrocardiographic results (1 = abnormal, 0 = normal)',
       'maximum heart rate achieved', 'exercise induced angina',
       'ST depression induced by exercise relative to rest',
       'number of major vessels (0 - 3) colored by flourosopy'],
      dtype=object)

In [23]:
# Class labels of the heart disease target (two classes).
# Presumably ordered by target value (0, 1) — TODO confirm in the loader.
dataset.target_names

array(['no heart disease', 'heart disease'], dtype='<U16')

In [24]:
# Per-column summary statistics of the frame (features plus `target`).
dataset.frame.describe()

Unnamed: 0,age,"sex (1 = male, 0 = female)",resting blood pressure (on admission to the hospital),serum cholestoral in mg/dl,"fasting blood sugar > 120 mg/dl (1 = true, 0 = false)","resting electrocardiographic results (1 = abnormal, 0 = normal)",maximum heart rate achieved,exercise induced angina,ST depression induced by exercise relative to rest,number of major vessels (0 - 3) colored by flourosopy,target
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,54.528428,0.675585,131.668896,247.100334,0.147157,0.505017,149.505017,0.327759,1.051839,0.672241,0.461538
std,9.02095,0.468941,17.705668,51.914779,0.354856,0.500813,22.954927,0.470183,1.163809,0.937438,0.499354
min,29.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,0.0,0.0
50%,56.0,1.0,130.0,242.0,0.0,1.0,153.0,0.0,0.8,0.0,0.0
75%,61.0,1.0,140.0,275.5,0.0,1.0,165.5,1.0,1.6,1.0,1.0
max,77.0,1.0,200.0,564.0,1.0,1.0,202.0,1.0,6.2,3.0,1.0
