In [1]:
import openml
import pandas as pd
import numpy as np

In [2]:
# Fetch OpenML dataset 23 and unpack the four values returned by get_data:
# the data X, the target y (the dataset's default target attribute), the
# categorical indicator and the attribute (column) names.
dataset_id = 23
dataset = openml.datasets.get_dataset(dataset_id)
X, y, categorical, names = dataset.get_data(
    target=dataset.default_target_attribute,
    return_categorical_indicator=True,
    return_attribute_names=True)
# Display the categorical indicator (one boolean per column of X).
categorical

[False, True, True, False, True, True, True, True, True]

In [3]:
# Preview the data with labelled columns; .head(10) keeps the rendered
# output short instead of asking the notebook to render every row.
pd.DataFrame(X, columns=names).head(10)

Unnamed: 0,Wifes_age,Wifes_education,Husbands_education,Number_of_children_ever_born,Wifes_religion,Wifes_now_working%3F,Husbands_occupation,Standard-of-living_index,Media_exposure
0,24.0,1.0,2.0,3.0,1.0,1.0,1.0,2.0,0.0
1,45.0,0.0,2.0,10.0,1.0,1.0,2.0,3.0,0.0
2,43.0,1.0,2.0,7.0,1.0,1.0,2.0,3.0,0.0
3,42.0,2.0,1.0,9.0,1.0,1.0,2.0,2.0,0.0
4,36.0,2.0,2.0,8.0,1.0,1.0,2.0,1.0,0.0
5,19.0,3.0,3.0,0.0,1.0,1.0,2.0,2.0,0.0
6,38.0,1.0,2.0,6.0,1.0,1.0,2.0,1.0,0.0
7,21.0,2.0,2.0,1.0,1.0,0.0,2.0,1.0,0.0
8,27.0,1.0,2.0,3.0,1.0,1.0,2.0,3.0,0.0
9,45.0,0.0,0.0,8.0,1.0,1.0,1.0,1.0,1.0


In [4]:
def logical_filter(l, logicals):
    """
    Return a list of the items of 'l' whose paired entry in 'logicals'
    is truthy. Pairing stops at the end of the shorter input.
    """
    kept = []
    for item, flag in zip(l, logicals):
        if flag:
            kept.append(item)
    return kept

In [5]:
# Sanity check: only positions 0 and 3 are flagged True, so expect [0, 3].
print(logical_filter(range(5), [True, False, False, True, False]))

[0, 3]


In [6]:
def separate_categorical(X, categorical, column_names=None):
    """
    Separate the categorical variables from the non-categorical variables
    as specified by the logical 'categorical' list.

    Parameters
    ----------
    X : 2-D numpy array
        Data with one column per variable.
    categorical : sequence of bool (or 0/1 flags)
        One flag per column of X; truthy marks the column as categorical.
    column_names : sequence, optional
        One name per column of X. When given (even if empty), each returned
        array is paired with the names of its columns.

    Returns
    -------
    (X_categ, X_non_categ) when column_names is None, otherwise
    ((X_categ, names_categ), (X_non_categ, names_non_categ)).
    """
    # Coerce to a boolean mask so that both halves of the split are selected
    # the same way, whether the flags arrive as bools or as 0/1 integers.
    categorical = np.asarray(categorical, dtype=bool)
    non_categorical = np.logical_not(categorical)

    # Compare against None instead of relying on truthiness: an empty name
    # list must still yield (array, names) pairs, and a numpy array of names
    # has no unambiguous truth value.
    if column_names is not None:
        names_categ = logical_filter(column_names, categorical)
        names_non_categ = logical_filter(column_names, non_categorical)
        return ((X[:, categorical], names_categ),
                (X[:, non_categorical], names_non_categ))

    return X[:, categorical], X[:, non_categorical]

In [7]:
# Without column names: the call returns just the two column subsets.
X_categ, X_non = separate_categorical(X, categorical)
print(X_categ.shape, X_non.shape)

# With column names: each subset comes paired with its matching names.
(X_categ, names_categ), (X_non, names_non) = separate_categorical(X, categorical, names)
print(X_categ.shape, names_categ)
print(X_non.shape, names_non)

(1473, 7) (1473, 2)
(1473, 7) ['Wifes_education', 'Husbands_education', 'Wifes_religion', 'Wifes_now_working%3F', 'Husbands_occupation', 'Standard-of-living_index', 'Media_exposure']
(1473, 2) ['Wifes_age', 'Number_of_children_ever_born']


In [8]:
# Toy 5x5 matrix for exercising the splitters: columns 0 and 2 hold only
# integer values, columns 1, 3 and 4 each contain a non-integer value.
x = np.zeros((5, 5))
x[:, 0] = np.arange(5)
x[:, [1, 4]] = (3.3, 4.5)
x[2, 3] = 1.2
x

array([[ 0. ,  3.3,  0. ,  0. ,  4.5],
       [ 1. ,  3.3,  0. ,  0. ,  4.5],
       [ 2. ,  3.3,  0. ,  1.2,  4.5],
       [ 3. ,  3.3,  0. ,  0. ,  4.5],
       [ 4. ,  3.3,  0. ,  0. ,  4.5]])

In [9]:
def separate_discrete(X, column_names=None):
    """
    Separate the discrete variables from the continuous variables
    (assuming all discrete variables are integers and
    everything else is a continuous variable).

    A column counts as discrete when every entry is a finite,
    integer-valued number; a column containing NaN or +/-inf is treated
    as continuous. An X with zero columns yields two empty selections.

    Parameters
    ----------
    X : 2-D numeric numpy array
        Data with one column per variable.
    column_names : sequence, optional
        One name per column of X. When given (even if empty), each returned
        array is paired with the names of its columns.

    Returns
    -------
    (X_discrete, X_continuous) when column_names is None, otherwise
    ((X_discrete, names_discrete), (X_continuous, names_continuous)).
    """
    X = np.asarray(X)

    # Vectorised integrality test: floor(v) == v holds for integer-valued
    # entries. The isfinite guard excludes +/-inf, for which floor(v) == v
    # would also hold; NaN already fails the equality.
    integer_valued = np.isfinite(X) & (np.floor(X) == X)
    # Reducing over rows also works when X has zero columns, where
    # np.apply_along_axis would raise.
    discrete = np.all(integer_valued, axis=0)
    continuous = np.logical_not(discrete)

    # Compare against None instead of relying on truthiness: an empty name
    # list must still yield (array, names) pairs, and a numpy array of names
    # has no unambiguous truth value.
    if column_names is not None:
        names_discrete = logical_filter(column_names, discrete)
        names_continuous = logical_filter(column_names, continuous)
        return ((X[:, discrete], names_discrete),
                (X[:, continuous], names_continuous))

    return X[:, discrete], X[:, continuous]

In [10]:
# Without names: columns 0 and 2 of the toy matrix hold only integer values,
# so they should come back as the discrete part.
discrete, continuous = separate_discrete(x)
print(discrete)
print(continuous)

# With names: five placeholder labels, one per column of x.
x_names = 'how now brown cow ?'.split()
(X_d, names_d), (X_c, names_c) = separate_discrete(x, x_names)
print(X_d, names_d)
print(X_c, names_c)

[[ 0.  0.]
 [ 1.  0.]
 [ 2.  0.]
 [ 3.  0.]
 [ 4.  0.]]
[[ 3.3  0.   4.5]
 [ 3.3  0.   4.5]
 [ 3.3  1.2  4.5]
 [ 3.3  0.   4.5]
 [ 3.3  0.   4.5]]
[[ 0.  0.]
 [ 1.  0.]
 [ 2.  0.]
 [ 3.  0.]
 [ 4.  0.]] ['how', 'brown']
[[ 3.3  0.   4.5]
 [ 3.3  0.   4.5]
 [ 3.3  1.2  4.5]
 [ 3.3  0.   4.5]
 [ 3.3  0.   4.5]] ['now', 'cow', '?']


In [11]:
def separate_features(X, categorical, column_names=None):
    """
    Split the columns of X three ways: categorical, discrete and continuous.

    Columns flagged in 'categorical' are split off first by
    separate_categorical; the remaining columns are then divided by
    separate_discrete (which assumes discrete variables are integers).

    Returns (X_categ, X_discrete, X_continuous) when no column names are
    supplied; otherwise returns three (array, names) pairs in that order.
    """
    if not column_names:
        categ_part, remainder = separate_categorical(X, categorical)
        discrete_part, continuous_part = separate_discrete(remainder)
        return categ_part, discrete_part, continuous_part

    # Named variant: thread the surviving names through the second split.
    (categ_part, categ_names), (remainder, remainder_names) = \
        separate_categorical(X, categorical, column_names)
    (discrete_part, discrete_names), (continuous_part, continuous_names) = \
        separate_discrete(remainder, remainder_names)
    return ((categ_part, categ_names),
            (discrete_part, discrete_names),
            (continuous_part, continuous_names))

In [12]:
# Mark only column 2 as categorical; of the remaining columns, column 0 is
# the only one holding integer values everywhere.
X_categ, X_discrete, X_continuous = separate_features(x,
                                        [False, False, True, False, False])
print(x)
print(X_categ)
print(X_discrete)
print(X_continuous)

[[ 0.   3.3  0.   0.   4.5]
 [ 1.   3.3  0.   0.   4.5]
 [ 2.   3.3  0.   1.2  4.5]
 [ 3.   3.3  0.   0.   4.5]
 [ 4.   3.3  0.   0.   4.5]]
[[ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]]
[[ 0.]
 [ 1.]
 [ 2.]
 [ 3.]
 [ 4.]]
[[ 3.3  0.   4.5]
 [ 3.3  0.   4.5]
 [ 3.3  1.2  4.5]
 [ 3.3  0.   4.5]
 [ 3.3  0.   4.5]]


In [13]:
# Edge case: empty categorical set (no column is flagged as categorical).
# NOTE: the empty selection keeps its row dimension, so its shape is (5, 0)
# rather than (0,). That looks slightly odd, but it doesn't really matter
# and can be easily fixed by the caller if needed.
X_categ, X_discrete, X_continuous = separate_features(x,
                                        [False, False, False, False, False])
print(x)
print(X_categ, 'of shape:', X_categ.shape)
print(X_discrete)
print(X_continuous)

[[ 0.   3.3  0.   0.   4.5]
 [ 1.   3.3  0.   0.   4.5]
 [ 2.   3.3  0.   1.2  4.5]
 [ 3.   3.3  0.   0.   4.5]
 [ 4.   3.3  0.   0.   4.5]]
[] of shape: (5, 0)
[[ 0.  0.]
 [ 1.  0.]
 [ 2.  0.]
 [ 3.  0.]
 [ 4.  0.]]
[[ 3.3  0.   4.5]
 [ 3.3  0.   4.5]
 [ 3.3  1.2  4.5]
 [ 3.3  0.   4.5]
 [ 3.3  0.   4.5]]


In [14]:
# Edge case: empty discrete set. Columns 0 and 2 (the only integer-valued
# columns of x) are flagged as categorical, so nothing is left to be
# classified as discrete.
X_categ, X_discrete, X_continuous = separate_features(x,
                                        [True, False, True, False, False])
print(x)
print(X_categ)
print(X_discrete)
print(X_continuous)

[[ 0.   3.3  0.   0.   4.5]
 [ 1.   3.3  0.   0.   4.5]
 [ 2.   3.3  0.   1.2  4.5]
 [ 3.   3.3  0.   0.   4.5]
 [ 4.   3.3  0.   0.   4.5]]
[[ 0.  0.]
 [ 1.  0.]
 [ 2.  0.]
 [ 3.  0.]
 [ 4.  0.]]
[]
[[ 3.3  0.   4.5]
 [ 3.3  0.   4.5]
 [ 3.3  1.2  4.5]
 [ 3.3  0.   4.5]
 [ 3.3  0.   4.5]]


In [15]:
# Edge case: empty discrete set again, this time passing column names so
# each part comes back as an (array, names) pair.
((X_categ, names_categ), (X_d, names_d), (X_cont, names_cont)) = \
    separate_features(x, [True, False, True, False, False], x_names)
print(X_categ, names_categ)
print(X_d, names_d)
print(X_cont, names_cont)

[[ 0.  0.]
 [ 1.  0.]
 [ 2.  0.]
 [ 3.  0.]
 [ 4.  0.]] ['how', 'brown']
[] []
[[ 3.3  0.   4.5]
 [ 3.3  0.   4.5]
 [ 3.3  1.2  4.5]
 [ 3.3  0.   4.5]
 [ 3.3  0.   4.5]] ['now', 'cow', '?']


In [16]:
# Test on real data: run the three-way split on the OpenML data loaded at
# the top of the notebook and print only the column names of each part.
((X_categ, names_categ), (X_d, names_d), (X_cont, names_cont)) = \
    separate_features(X, categorical, names)
print(names_categ)
print(names_d)
print(names_cont)

['Wifes_education', 'Husbands_education', 'Wifes_religion', 'Wifes_now_working%3F', 'Husbands_occupation', 'Standard-of-living_index', 'Media_exposure']
['Wifes_age', 'Number_of_children_ever_born']
[]
