# Preparing the dataset

## Data initializing

The `Speaker_Number` column is of no use for our task, so we drop it from the dataset.

In [1]:
import sklearn as sk
import pandas as pd
import numpy as np

# Read the vowel dataset (comma-separated, column names in the first row) and
# discard the `Speaker_Number` column in the same expression.
data = pd.read_csv('../data/vowel.csv', sep=',', header=0).drop('Speaker_Number', axis=1)
data.head()

Unnamed: 0,Sex,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Class
0,Male,-3.639,0.418,-0.67,1.779,-0.168,1.627,-0.388,0.529,-0.874,-0.814,hid
1,Male,-3.327,0.496,-0.694,1.365,-0.265,1.933,-0.363,0.51,-0.621,-0.488,hId
2,Male,-2.12,0.894,-1.576,0.147,-0.707,1.559,-0.579,0.676,-0.809,-0.049,hEd
3,Male,-2.287,1.809,-1.498,1.012,-1.053,1.06,-0.567,0.235,-0.091,-0.795,hAd
4,Male,-2.598,1.938,-0.846,1.062,-1.633,0.764,0.394,-0.15,0.277,-0.396,hYd


## Filling in missing values

Let's classify each column of the dataset by its type. This will help us fill in the gaps and decide how to process each column.

In [2]:
# Summary of the object-typed (string) columns; its 'unique' row is used here
# and its 'top' row is used by later cells.
data_description = data.describe(include=[object])

# Split the columns into numerical and categorical by dtype name in one pass.
numerical_features = []
categorical_features = []
for column in data.columns:
    if data[column].dtype.name == 'object':
        categorical_features.append(column)
    else:
        numerical_features.append(column)

# Categorical columns are further split by their number of distinct values.
binary_categorical_features = [
    column for column in categorical_features
    if data_description.loc['unique', column] == 2
]
non_binary_categorical_features = [
    column for column in categorical_features
    if data_description.loc['unique', column] > 2
]
print(numerical_features, binary_categorical_features, non_binary_categorical_features, sep = '\n')

['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9']
['Sex']
['Class']


Now we can fill in the gaps if they exist.

In [3]:
# Look for missing values directly. Comparing the largest and smallest
# per-column non-null counts would report "no gaps" whenever every column
# happens to have the same number of missing entries.
if data.isnull().values.any():
    # Numerical columns: fill with the column median. `numeric_only=True` keeps
    # the string columns out of the median computation, which would otherwise
    # fail on them in recent pandas versions.
    data = data.fillna(data.median(axis = 0, numeric_only = True), axis = 0)
    # Categorical columns: fill with the most frequent value of the column.
    for f in categorical_features:
        data[f] = data[f].fillna(data_description[f]['top'])
    print('The gaps are filled in')
else:
    print('There are no gaps')

There are no gaps


## Normalization



In [4]:
# Standardize every numerical column: subtract the column mean, then divide
# by the column standard deviation.
for column in numerical_features:
    centered = data[column] - data[column].mean()
    data[column] = centered / data[column].std()
data.head()

Unnamed: 0,Sex,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Class
0,Male,-0.500881,-1.245468,-0.227868,1.66414,0.207128,1.65061,-0.830511,0.335683,-1.001513,-1.2296,hid
1,Male,-0.141843,-1.1791,-0.261578,1.118873,0.061176,2.157341,-0.77639,0.302541,-0.557779,-0.689852,hId
2,Male,1.24713,-0.840455,-1.500433,-0.485318,-0.603884,1.538003,-1.243996,0.592092,-0.88751,0.036987,hEd
3,Male,1.054953,-0.061912,-1.390874,0.653948,-1.124496,0.711668,-1.218018,-0.177136,0.371782,-1.198142,hAd
4,Male,0.697065,0.04785,-0.475077,0.719801,-1.997198,0.221497,0.862398,-0.848684,1.017213,-0.53753,hYd


## Vectorization

Binary features are easy to vectorize automatically: take the most frequent (top) value of the feature and assign `1` where the value equals it and `0` otherwise.

In [5]:
# Encode each binary categorical feature as an integer indicator:
# 1 where the value equals the feature's most frequent ("top") value, else 0.
for f in binary_categorical_features:
    # Skip columns that are no longer string-typed. Without this guard,
    # re-running the cell would compare the 0/1 codes with the original label
    # and silently turn the whole column into zeros.
    if data[f].dtype.name != 'object':
        continue
    # Building the column from the boolean mask gives it an integer dtype
    # instead of leaving 0/1 values inside an object-typed column.
    data[f] = (data[f] == data_description[f]['top']).astype(int)

At this stage we should decide on the method of vectorization for non-binary features. Let's clone the dataset and try two different methods:
1. Naive method: let's map each value to a number.
2. Smarter method: let's use [One-hot mapping](https://en.wikipedia.org/wiki/One-hot).

In [6]:
data_naive = data.copy()

# Naive encoding: every distinct value of a non-binary categorical feature is
# replaced by its position in order of first appearance within that feature.
mapping_naive = dict()
for f in non_binary_categorical_features:
    # Build the codes per feature. With a single lookup table shared by all
    # features, a value already seen in an earlier feature would keep its old
    # code and could collide with another value of the current feature.
    feature_mapping = {k: i for i, k in enumerate(data_naive[f].unique())}
    data_naive[f] = data_naive[f].map(feature_mapping)
    # Keep the flat value -> code dictionary: the first feature that contains
    # a value determines its entry.
    for k, i in feature_mapping.items():
        mapping_naive.setdefault(k, i)
data_naive.head()

Unnamed: 0,Sex,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Class
0,1,-0.500881,-1.245468,-0.227868,1.66414,0.207128,1.65061,-0.830511,0.335683,-1.001513,-1.2296,0
1,1,-0.141843,-1.1791,-0.261578,1.118873,0.061176,2.157341,-0.77639,0.302541,-0.557779,-0.689852,1
2,1,1.24713,-0.840455,-1.500433,-0.485318,-0.603884,1.538003,-1.243996,0.592092,-0.88751,0.036987,2
3,1,1.054953,-0.061912,-1.390874,0.653948,-1.124496,0.711668,-1.218018,-0.177136,0.371782,-1.198142,3
4,1,0.697065,0.04785,-0.475077,0.719801,-1.997198,0.221497,0.862398,-0.848684,1.017213,-0.53753,4


In [7]:
data_smarter = data.copy()

# One-hot encoding: each non-binary categorical column is removed and replaced
# by one indicator column per distinct value, appended on the right.
for feature in non_binary_categorical_features:
    indicators = pd.get_dummies(data_smarter[feature])
    remaining = data_smarter.drop(feature, axis=1)
    data_smarter = pd.concat([remaining, indicators], axis=1)
data_smarter.head()

Unnamed: 0,Sex,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,...,hEd,hId,hOd,hUd,hYd,had,hed,hid,hod,hud
0,1,-0.500881,-1.245468,-0.227868,1.66414,0.207128,1.65061,-0.830511,0.335683,-1.001513,...,0,0,0,0,0,0,0,1,0,0
1,1,-0.141843,-1.1791,-0.261578,1.118873,0.061176,2.157341,-0.77639,0.302541,-0.557779,...,0,1,0,0,0,0,0,0,0,0
2,1,1.24713,-0.840455,-1.500433,-0.485318,-0.603884,1.538003,-1.243996,0.592092,-0.88751,...,1,0,0,0,0,0,0,0,0,0
3,1,1.054953,-0.061912,-1.390874,0.653948,-1.124496,0.711668,-1.218018,-0.177136,0.371782,...,0,0,0,0,0,0,0,0,0,0
4,1,0.697065,0.04785,-0.475077,0.719801,-1.997198,0.221497,0.862398,-0.848684,1.017213,...,0,0,0,0,1,0,0,0,0,0
