In [1]:
import pandas as pd

## Utility Functions

In [38]:
def filter_question_mark(row):
    """Return True when no element of ``row`` equals the string '?'.

    Intended as a row predicate for ``DataFrame.apply(..., axis=1)`` so that
    only rows without any '?' cell are kept. An empty ``row`` yields True.
    """
    for cell in row:
        if cell != '?':
            continue
        # The first '?' encountered rejects the whole row.
        return False
    return True

In [39]:
def encode_value(x):
    """Translate a single-character token into a number.

    'n' -> 0, 'y' -> 1 and '?' -> 0.5; any other value is returned unchanged.
    """
    for token, code in (('n', 0), ('y', 1), ('?', 0.5)):
        if x == token:
            return code
    # Not one of the known tokens: pass the value through untouched.
    return x

## Wine

In [18]:
# Column labels for the wine table: the class label first, then 13 features.
# NOTE(review): 'nonflavanoids_phenols' may be a misspelling of
# 'nonflavanoid_phenols'; kept as-is because it becomes a CSV header.
column_name = [
    'label',
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoids_phenols',
    'proanthocyanins',
    'color_intensity',
    'hue',
    'OD280_or_OD315_of_diluted_wines',
    'proline',
]

In [19]:
# Load the wine data. Because names= is given and header= is not, pandas
# treats the first line of the file as data rather than as a header.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    names=column_name,
)

In [20]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

label                              0
alcohol                            0
malic_acid                         0
ash                                0
alcalinity_of_ash                  0
magnesium                          0
total_phenols                      0
flavanoids                         0
nonflavanoids_phenols              0
proanthocyanins                    0
color_intensity                    0
hue                                0
OD280_or_OD315_of_diluted_wines    0
proline                            0
dtype: int64

In [21]:
# Write the wine table to wine.csv, leaving out the row index.
df.to_csv(path_or_buf='wine.csv', index=False)

## Breast Cancer Coimbra

In [22]:
# Load the Coimbra table. No names= is passed, so the file's first line is
# used as the header; the labels are overwritten in the following cell.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv',
)

In [23]:
# Replace the column labels positionally; the list must have exactly one
# entry per existing column.
# NOTE(review): 'resitin' is presumably meant to be 'resistin'; kept as-is
# because it becomes a CSV header.
df.columns = [
    'age',
    'bmi',
    'glucose',
    'insulin',
    'HOMA',
    'leptin',
    'adiponectin',
    'resitin',
    'MCP_1',
    'label',
]

In [24]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

age            0
bmi            0
glucose        0
insulin        0
HOMA           0
leptin         0
adiponectin    0
resitin        0
MCP_1          0
label          0
dtype: int64

In [25]:
# Write the Coimbra table to breast_cancer_coimbra.csv, leaving out the row index.
df.to_csv(path_or_buf='breast_cancer_coimbra.csv', index=False)

## Iris

In [26]:
# Load the iris data with explicit column labels. Because names= is given and
# header= is not, pandas treats the first line of the file as data.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    names=[
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
        'label',
    ],
)

In [27]:
# Write the iris table to iris.csv, leaving out the row index.
df.to_csv(path_or_buf='iris.csv', index=False)

## Hepatitis

In [4]:
# Column labels for the hepatitis table: the class label first, then 19 attributes.
# NOTE(review): 'fatigure', 'alk_phospates' and 'albumim' look like typos of
# 'fatigue', 'alk_phosphate' and 'albumin'; kept as-is because they become
# CSV headers.
column_name = [
    'label',
    'age',
    'sex',
    'steroid',
    'antivirals',
    'fatigure',
    'malaise',
    'anorexia',
    'liver_big',
    'liver_firm',
    'spleen_palpable',
    'spiders',
    'ascites',
    'varices',
    'bilirubin',
    'alk_phospates',
    'sgot',
    'albumim',
    'protime',
    'histology',
]

In [5]:
# Load the hepatitis data. Because names= is given and header= is not, pandas
# treats the first line of the file as data rather than as a header.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/hepatitis.data',
    names=column_name,
)

In [7]:
# Keep only the rows in which no cell equals '?'.
rows_without_question_mark = df.apply(filter_question_mark, axis=1)
df = df[rows_without_question_mark]

In [8]:
# Write the filtered hepatitis table to hepatitis.csv, leaving out the row index.
df.to_csv(path_or_buf='hepatitis.csv', index=False)

## House Votes

In [3]:
# Column labels for the house-votes table: the class label first, then one
# column per vote (16 in total).
column_name = [
    'label',
    'handicapped-infants',
    'water-project-cost-sharing',
    'adoption-of-the-budget-resolution',
    'physician-fee-freeze',
    'el-salvador-aid',
    'religious-groups-in-schools',
    'anti-satellite-test-ban',
    'aid-to-nicaraguan-contras',
    'mx-missile',
    'immigration',
    'synfuels-corporation-cutback',
    'education-spending',
    'superfund-right-to-sue',
    'crime',
    'duty-free-exports',
    'export-administration-act-south-africa',
]

In [4]:
# Load the house-votes data. Because names= is given and header= is not,
# pandas treats the first line of the file as data rather than as a header.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data',
    names=column_name,
)

In [7]:
# Encode every cell with encode_value ('n' -> 0, 'y' -> 1, '?' -> 0.5, any
# other value unchanged). DataFrame.applymap is deprecated since pandas 2.1
# in favour of DataFrame.map, so use the new method when this pandas provides
# it and fall back to applymap on older releases.
if hasattr(pd.DataFrame, 'map'):
    df = df.map(encode_value)
else:
    df = df.applymap(encode_value)

In [8]:
# Write the encoded votes table to voting.csv, leaving out the row index.
df.to_csv(path_or_buf='voting.csv', index=False)

## Zoo Animal

In [11]:
# Column labels for the zoo table: the animal name, 16 attributes, and the
# class label last.
column_name = [
    'animal_name',
    'hair',
    'feathers',
    'eggs',
    'milk',
    'airborne',
    'aquatic',
    'predator',
    'toothed',
    'backbone',
    'breathes',
    'venomous',
    'fins',
    'legs',
    'tail',
    'domestic',
    'catsize',
    'label',
]

In [12]:
# Load the zoo data. Because names= is given and header= is not, pandas
# treats the first line of the file as data rather than as a header.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/zoo/zoo.data',
    names=column_name,
)

In [15]:
# Write the zoo table to zoo.csv, leaving out the row index.
df.to_csv(path_or_buf='zoo.csv', index=False)

## Yeast

In [8]:
# Column labels for the yeast table: a sequence name, 8 attributes, and the
# class label last.
column_name = [
    'seq_name',
    'mcg',
    'gvh',
    'alm',
    'mit',
    'erl',
    'pox',
    'vac',
    'nuc',
    'label',
]

In [9]:
# Load the yeast data. Fields are split on runs of whitespace, and because
# names= is given and header= is not, the first line is read as data.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data',
    sep=r'\s+',
    names=column_name,
)

In [14]:
# Discard the seq_name column, then draw 120 rows with a fixed seed so the
# same sample is produced on every run.
df = df.drop(columns='seq_name').sample(n=120, random_state=42)

In [15]:
# Write the sampled yeast table to yeast.csv, leaving out the row index.
df.to_csv(path_or_buf='yeast.csv', index=False)

## E. Coli

In [16]:
# Column labels for the E. coli table: a sequence name, 7 attributes, and the
# class label last.
column_name = [
    'seq_name',
    'mcg',
    'gvh',
    'lip',
    'chg',
    'aac',
    'alm1',
    'alm2',
    'label',
]

In [17]:
# Load the E. coli data. Fields are split on runs of whitespace, and because
# names= is given and header= is not, the first line is read as data.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data',
    sep=r'\s+',
    names=column_name,
)

In [20]:
# Discard the seq_name column, then draw 120 rows with a fixed seed so the
# same sample is produced on every run.
df = df.drop(columns='seq_name').sample(n=120, random_state=42)

In [21]:
# Write the sampled E. coli table to ecoli.csv, leaving out the row index.
df.to_csv(path_or_buf='ecoli.csv', index=False)

## Echocardiogram

In [29]:
# Column labels for the echocardiogram table (13 columns; 'label' is second).
# NOTE(review): 'perircadial_effusion' and 'fractional_shortenting' look like
# typos of 'pericardial_effusion' and 'fractional_shortening'; kept as-is
# because they become CSV headers.
column_name = [
    'survival',
    'label',
    'age_at_heart_attack',
    'perircadial_effusion',
    'fractional_shortenting',
    'epss',
    'lvdd',
    'wall_motion_score',
    'wall_motion_index',
    'mult',
    'name',
    'group',
    'alive_at_1',
]

In [30]:
# Load the echocardiogram data from the local raw/ directory. Because names=
# is given and header= is not, the first line of the file is read as data.
df = pd.read_csv(
    filepath_or_buffer='raw/echocardiogram.data',
    names=column_name,
)

In [34]:
# Remove the derived attributes (see: raw/echocardiogram.names).
df = df.drop(columns=['mult', 'name', 'group', 'alive_at_1'])

In [40]:
# Keep only the rows in which no cell equals '?'.
rows_without_question_mark = df.apply(filter_question_mark, axis=1)
df = df[rows_without_question_mark]

In [42]:
# Write the filtered echocardiogram table to echocardiogram.csv, leaving out
# the row index.
df.to_csv(path_or_buf='echocardiogram.csv', index=False)