In [2]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression

In [5]:
from scipy.io.arff import loadarff

# One-off conversion: read the houses ARFF file and store it as CSV next to it,
# which is the file the dataset loader reads for id '823'.
HOUSES_ARFF_PATH = 'datasets/houses/dataset_823_houses.arff'
HOUSES_CSV_PATH = 'datasets/houses/dataset_823_houses.csv'

# loadarff returns a pair; its first element holds the records.
raw_data = loadarff(HOUSES_ARFF_PATH)
df_data = pd.DataFrame(raw_data[0])
df_data.to_csv(HOUSES_CSV_PATH, index=False)

In [6]:
# Display the frame built from the ARFF file (pandas elides the middle rows of a long frame).
df_data

Unnamed: 0,median_house_value,median_income,housing_median_age,total_rooms,total_bedrooms,population,households,latitude,binaryClass
0,452600.0,8.3252,41.0,880.0,129.0,322.0,126.0,37.88,b'P'
1,358500.0,8.3014,21.0,7099.0,1106.0,2401.0,1138.0,37.86,b'P'
2,352100.0,7.2574,52.0,1467.0,190.0,496.0,177.0,37.85,b'P'
3,341300.0,5.6431,52.0,1274.0,235.0,558.0,219.0,37.85,b'P'
4,342200.0,3.8462,52.0,1627.0,280.0,565.0,259.0,37.85,b'P'
...,...,...,...,...,...,...,...,...,...
20635,78100.0,1.5603,25.0,1665.0,374.0,845.0,330.0,39.48,b'P'
20636,77100.0,2.5568,18.0,697.0,150.0,356.0,114.0,39.49,b'P'
20637,92300.0,1.7000,17.0,2254.0,485.0,1007.0,433.0,39.43,b'P'
20638,84700.0,1.8672,18.0,1860.0,409.0,741.0,349.0,39.43,b'P'


In [9]:
# List the column names; 'binaryClass' is the label, the rest are features.
df_data.columns

Index(['median_house_value', 'median_income', 'housing_median_age',
       'total_rooms', 'total_bedrooms', 'population', 'households', 'latitude',
       'binaryClass'],
      dtype='object')

In [8]:
# Class balance of the label column (values are still bytes objects at this point).
df_data['binaryClass'].value_counts()

b'N'    11726
b'P'     8914
Name: binaryClass, dtype: int64

In [14]:
def get_flow(openml_id, numerical_columns, categorical_columns):
    """Build the scikit-learn pipeline associated with an OpenML flow id.

    Parameters
    ----------
    openml_id : str
        Flow id as a string (e.g. '5055'). Ids are compared as strings.
    numerical_columns : list
        Column selection handed to the numerical branch of a ColumnTransformer.
    categorical_columns : list
        Column selection handed to the categorical branch of a ColumnTransformer.

    Returns
    -------
    tuple
        ``(flow, applicable_on_dataframe)``. ``flow`` is an unfitted Pipeline.
        ``applicable_on_dataframe`` is True when the pipeline starts with a
        ColumnTransformer built from the given column selections (the caller
        then fits it on the DataFrame); it is False for flows without a
        ColumnTransformer, which the caller fits on a plain array.

    Raises
    ------
    ValueError
        If ``openml_id`` is not one of the known flow ids.
    """

    def impute_and_scale(imputer_name, **imputer_kwargs):
        # Numerical branch shared by several flows: SimpleImputer, then StandardScaler.
        return Pipeline([(imputer_name, SimpleImputer(**imputer_kwargs)),
                         ('standardscaler', StandardScaler())])

    def on_columns(transformers, estimator_name, estimator):
        # Column-wise preprocessing followed by one final estimator.
        return Pipeline([('columntransformer', ColumnTransformer(transformers)),
                         (estimator_name, estimator)])

    # --- Flows without a ColumnTransformer (applicable_on_dataframe = False) ---
    if openml_id == '5055':
        return Pipeline([('scale', RobustScaler()), ('rf', RandomForestClassifier())]), False

    if openml_id == '17355':
        return Pipeline([('imputer', SimpleImputer()), ('classifier', SVC())]), False

    if openml_id == '17400':
        # NOTE(review): the step is named 'standardscaler' but wraps SimpleImputer, so no
        # scaling happens in this flow. Kept as-is; confirm against the OpenML flow
        # definition before changing it.
        return Pipeline([('standardscaler', SimpleImputer()), ('svc', SVC())]), False

    if openml_id == '17496':
        return Pipeline([('simpleimputer', SimpleImputer()),
                         ('decisiontreeclassifier', DecisionTreeClassifier())]), False

    if openml_id == '18922':
        return Pipeline([('imputer', SimpleImputer()),
                         ('estimator', DecisionTreeClassifier())]), False

    # --- Flows built around a ColumnTransformer (applicable_on_dataframe = True) ---
    if openml_id == '8774':
        num_pipe = impute_and_scale('imputer', add_indicator=True)
        cat_pipe = Pipeline([('simpleimputer', SimpleImputer(strategy='most_frequent')),
                             ('onehotencoder', OneHotEncoder())])
        flow = on_columns([('num', num_pipe, numerical_columns),
                           ('cat', cat_pipe, categorical_columns)],
                          'decisiontreeclassifier', DecisionTreeClassifier())
        return flow, True

    if openml_id == '17315':
        # Only the numerical columns are transformed (branch name 'cont').
        flow = on_columns([('cont', impute_and_scale('simpleimputer'), numerical_columns)],
                          'decisiontreeclassifier', DecisionTreeClassifier())
        return flow, True

    if openml_id == '17326':
        flow = on_columns(
            [('num', Pipeline([('standardscaler', StandardScaler())]), numerical_columns),
             ('cat', Pipeline([('onehotencoder', OneHotEncoder())]), categorical_columns)],
            'logisticregression', LogisticRegression(solver='liblinear'))
        return flow, True

    if openml_id == '17322':

        def another_imputer(df_with_categorical_columns):
            # Replace missing categorical values with an explicit placeholder category.
            return df_with_categorical_columns.fillna('__missing__')

        num_pipe = impute_and_scale('imputer')
        cat_pipe = Pipeline([('anothersimpleimputer', FunctionTransformer(another_imputer)),
                             ('onehotencoder', OneHotEncoder())])
        flow = on_columns([('num', num_pipe, numerical_columns),
                           ('cat', cat_pipe, categorical_columns)],
                          'decisiontreeclassifier', DecisionTreeClassifier())
        return flow, True

    if openml_id == '17337' or openml_id == '17655' or openml_id == '18576':
        # NOTE(review): categorical columns are one-hot encoded without any imputation
        # in these flows (unlike flow 8774) -- confirm this matches the OpenML flows.
        transformers = [('num', impute_and_scale('simpleimputer'), numerical_columns),
                        ('cat', OneHotEncoder(), categorical_columns)]
        if openml_id == '17337':
            return on_columns(transformers, 'svc', SVC()), True
        return on_columns(transformers, 'randomforestclassifier', RandomForestClassifier()), True

    if openml_id == '18720':
        flow = on_columns([('num', StandardScaler(), numerical_columns),
                           ('cat', OneHotEncoder(), categorical_columns)],
                          'svc', SVC())
        return flow, True

    raise ValueError(f"Invalid flow id: {openml_id}!")
    

In [15]:
def get_dataset(openml_id, seed):
    """Load a locally stored OpenML dataset and split it into train and test parts.

    Parameters
    ----------
    openml_id : str
        Dataset id as a string: '44' (Spambase), '246' (BNG labor),
        '823' (Houses) or '1461' (Bankmarketing).
    seed : int
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(train, train_labels, test, test_labels, numerical_columns,
        categorical_columns)``. ``train``/``test`` are DataFrames holding the
        numerical columns followed by the categorical ones (80/20 split);
        the label arrays are boolean NumPy arrays that are True where the
        label column equals the dataset's positive class.

    Raises
    ------
    ValueError
        If ``openml_id`` is not one of the known dataset ids.
    """
    # Labels are compared against strings such as "b'P'" -- presumably because the
    # CSV files were exported from ARFF byte strings (verify for each dataset).
    if openml_id == '44':
        # Spambase dataset (44): every non-label column is a numerical feature.
        csv_path = 'datasets/spambase/dataset_44_spambase.csv'
        label_column = 'class'
        positive_label = "b'1'"
        numerical_columns = None  # derived from the CSV header once it is loaded
        categorical_columns = []

    elif openml_id == '246':
        # BNG labor dataset (246)
        csv_path = 'datasets/labor/dataset_246_labor.csv'
        label_column = 'class'
        positive_label = "b'good'"
        numerical_columns = ['duration', 'wage-increase-first-year', 'wage-increase-second-year',
                             'wage-increase-third-year', 'working-hours', 'standby-pay',
                             'shift-differential', 'statutory-holidays']
        categorical_columns = ['cost-of-living-adjustment', 'pension', 'education-allowance', 'vacation',
                               'longterm-disability-assistance', 'contribution-to-dental-plan',
                               'bereavement-assistance', 'contribution-to-health-plan']

    elif openml_id == '823':
        # Houses dataset (823)
        csv_path = 'datasets/houses/dataset_823_houses.csv'
        label_column = 'binaryClass'
        positive_label = "b'P'"
        numerical_columns = ['median_house_value', 'median_income', 'housing_median_age',
                             'total_rooms', 'total_bedrooms', 'population', 'households', 'latitude']
        categorical_columns = []

    elif openml_id == '1461':
        # Bankmarketing dataset (1461)
        csv_path = 'datasets/bankmarketing/dataset_1461_bankmarketing.csv'
        label_column = 'Class'
        positive_label = "b'2'"
        numerical_columns = ['V1', 'V6', 'V10', 'V12', 'V13', 'V14', 'V15']
        categorical_columns = ['V2', 'V3', 'V4', 'V5', 'V7', 'V8', 'V9', 'V11', 'V16']

    else:
        raise ValueError("Invalid dataset id!")

    data = pd.read_csv(csv_path)
    if numerical_columns is None:
        # Use the frame that was just loaded (not some other notebook variable).
        numerical_columns = [column for column in data.columns if column != label_column]

    data_train, data_test = train_test_split(data, test_size=0.2, random_state=seed)

    feature_columns = numerical_columns + categorical_columns

    train = data_train[feature_columns]
    train_labels = np.array(data_train[label_column] == positive_label)

    test = data_test[feature_columns]
    test_labels = np.array(data_test[label_column] == positive_label)

    return train, train_labels, test, test_labels, numerical_columns, categorical_columns

In [17]:
# Fix the global NumPy seed as well as the split seed so repeated runs are comparable.
seed = 1234
np.random.seed(seed)

dataset_ids = ['823']  # alternative selection: ['1461', '44']
flow_ids = ['5055', '8774', '17315', '17322', '17326', '17337', '17355', '17400', '17496',
            '17655', '18720', '18576', '18922']

# Fit every flow on every selected dataset and report its score on the held-out part.
for dataset_id in dataset_ids:
    print('Dataset: ', dataset_id)
    (train, train_labels, test, test_labels,
     numerical_columns, categorical_columns) = get_dataset(dataset_id, seed)

    for flow_id in flow_ids:
        print('  Flow: ', flow_id)
        flow, applicable_on_df = get_flow(flow_id, numerical_columns, categorical_columns)

        if applicable_on_df:
            # ColumnTransformer-based flows select their columns from the DataFrame.
            train_inputs, test_inputs = train, test
        else:
            # Remaining flows get a plain array holding only the numerical columns.
            train_inputs = train[numerical_columns].values
            test_inputs = test[numerical_columns].values

        model = flow.fit(train_inputs, train_labels)
        print('    Score: ', model.score(test_inputs, test_labels))

Dataset:  823
  Flow:  5055
    Score:  0.9825581395348837
  Flow:  8774
    Score:  0.9745639534883721
  Flow:  17315
    Score:  0.9733527131782945
  Flow:  17322
    Score:  0.9743217054263565
  Flow:  17326
    Score:  0.9590600775193798
  Flow:  17337
    Score:  0.9660852713178295
  Flow:  17355
    Score:  0.5763081395348837
  Flow:  17400
    Score:  0.5763081395348837
  Flow:  17496
    Score:  0.9735949612403101
  Flow:  17655
    Score:  0.9835271317829457
  Flow:  18720
    Score:  0.9660852713178295
  Flow:  18576
    Score:  0.9832848837209303
  Flow:  18922
    Score:  0.9745639534883721
