In [1]:
import pandas as pd
import numpy as np

# Read the windsurfing go/no-go workbook into a DataFrame; the path is
# relative to the notebook's working directory.
workbook_path = 'gonogowindsurfing_clean.xlsx'
df = pd.read_excel(workbook_path)

In [2]:
# A negative age cannot be real, so overwrite such entries with the mean of
# the strictly positive ages. An age of exactly 0 is neither replaced nor
# counted in that mean.
age_column = df['age']
mean_positive_age = age_column[age_column > 0].mean()
df.loc[age_column < 0, 'age'] = mean_positive_age
df

Unnamed: 0,clientId,age,windSpeed,windDirection,temperature,goWindsurfing
0,0,16,13.0,N,11.0,1
1,1,38,16.0,NE,13.0,1
2,2,15,,N,15.0,0
3,3,45,27.0,NE,23.0,0
4,4,67,23.0,NE,17.0,0
5,5,39,25.0,N,32.0,1
6,6,35,12.0,SE,15.0,1
7,7,18,17.0,E,17.0,1
8,8,22,18.0,N,19.0,1
9,9,55,11.0,S,,1


In [3]:
# Feature matrix as a NumPy array: every column except the first (clientId)
# and the last (goWindsurfing, the label).
feature_frame = df.iloc[:, 1:-1]
descriptive = feature_frame.to_numpy()
descriptive

array([[16, 13.0, 'N', 11.0],
       [38, 16.0, 'NE', 13.0],
       [15, nan, 'N', 15.0],
       [45, 27.0, 'NE', 23.0],
       [67, 23.0, 'NE', 17.0],
       [39, 25.0, 'N', 32.0],
       [35, 12.0, 'SE', 15.0],
       [18, 17.0, 'E', 17.0],
       [22, 18.0, 'N', 19.0],
       [55, 11.0, 'S', nan]], dtype=object)

In [4]:
# Label vector: the last column (goWindsurfing) as a NumPy array.
label_series = df.iloc[:, -1]
target = label_series.to_numpy()
target

array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [5]:
from sklearn.impute import SimpleImputer
# Fit a mean imputer on the numeric feature columns only; column 2
# (windDirection) holds strings and is left out.
numeric_columns = [0, 1, 3]
simple_imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
simple_imputer.fit(descriptive[:, numeric_columns])


In [6]:
# Replace the NaNs in the numeric columns with the column means learned by
# the imputer, writing the result back into the same array.
numeric_idx = [0, 1, 3]
imputed_numeric = simple_imputer.transform(descriptive[:, numeric_idx])
descriptive[:, numeric_idx] = imputed_numeric

In [7]:
# Inspect the feature matrix after imputation: the former NaN entries now
# hold the mean of their column.
descriptive

array([[16.0, 13.0, 'N', 11.0],
       [38.0, 16.0, 'NE', 13.0],
       [15.0, 18.0, 'N', 15.0],
       [45.0, 27.0, 'NE', 23.0],
       [67.0, 23.0, 'NE', 17.0],
       [39.0, 25.0, 'N', 32.0],
       [35.0, 12.0, 'SE', 15.0],
       [18.0, 17.0, 'E', 17.0],
       [22.0, 18.0, 'N', 19.0],
       [55.0, 11.0, 'S', 18.0]], dtype=object)

In [8]:
# Show the DataFrame again for comparison: the imputation above wrote into
# the `descriptive` array, not into df.
df

Unnamed: 0,clientId,age,windSpeed,windDirection,temperature,goWindsurfing
0,0,16,13.0,N,11.0,1
1,1,38,16.0,NE,13.0,1
2,2,15,,N,15.0,0
3,3,45,27.0,NE,23.0,0
4,4,67,23.0,NE,17.0,0
5,5,39,25.0,N,32.0,1
6,6,35,12.0,SE,15.0,1
7,7,18,17.0,E,17.0,1
8,8,22,18.0,N,19.0,1
9,9,55,11.0,S,,1


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 2 (windDirection) and pass the other columns through
# unchanged; the encoded columns come first in the output.
# sparse_threshold=0 forces a dense result: with the default threshold (0.3) a
# feature with many categories could make fit_transform return a SciPy sparse
# matrix, which np.array() would wrap as a 0-d object array instead of a 2-D
# feature matrix.
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [2])],
    remainder='passthrough',
    sparse_threshold=0,
)

In [10]:
# Apply the one-hot encoding and rebind `descriptive` to the transformed matrix.
# Because the name is reused, this cell should only be run once per kernel
# session: a second run would encode the already-transformed array.
encoded_features = column_transformer.fit_transform(descriptive)
descriptive = np.array(encoded_features)
descriptive

array([[0.0, 1.0, 0.0, 0.0, 0.0, 16.0, 13.0, 11.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 38.0, 16.0, 13.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 15.0, 18.0, 15.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 45.0, 27.0, 23.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 67.0, 23.0, 17.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 39.0, 25.0, 32.0],
       [0.0, 0.0, 0.0, 0.0, 1.0, 35.0, 12.0, 15.0],
       [1.0, 0.0, 0.0, 0.0, 0.0, 18.0, 17.0, 17.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 22.0, 18.0, 19.0],
       [0.0, 0.0, 0.0, 1.0, 0.0, 55.0, 11.0, 18.0]], dtype=object)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
split_parts = train_test_split(
    descriptive,
    target,
    test_size=0.25,
    random_state=0,
)
descriptive_train, descriptive_test, target_train, target_test = split_parts

In [21]:
# Training features produced by the split above.
descriptive_train

array([[0.0, 0.0, 0.0, 1.0, 0.0, 55.0, 11.0, 18.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 38.0, 16.0, 13.0],
       [0.0, 0.0, 0.0, 0.0, 1.0, 35.0, 12.0, 15.0],
       [1.0, 0.0, 0.0, 0.0, 0.0, 18.0, 17.0, 17.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 45.0, 27.0, 23.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 16.0, 13.0, 11.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 39.0, 25.0, 32.0]], dtype=object)

In [20]:
# Held-out test features produced by the split above.
descriptive_test

array([[0.0, 1.0, 0.0, 0.0, 0.0, 15.0, 18.0, 15.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 22.0, 18.0, 19.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 67.0, 23.0, 17.0]], dtype=object)

In [22]:
# Labels for the training rows (same order as descriptive_train).
target_train

array([1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [23]:
# Labels for the held-out test rows (same order as descriptive_test).
target_test

array([0, 1, 0], dtype=int64)

## Normalization and standardization

### Normalization

```python
xNew = (x - xMin) / (xMax - xMin)
```

### Standardization

```python
xNew = (x - mean) / std
```

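Here `xMin` and `xMax` are the minimum and maximum values of the feature `x`, and `mean` and `std` are its mean and standard deviation. Normalization rescales the values into the range [0, 1]; standardization gives the feature zero mean and unit standard deviation.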

In [24]:
from sklearn.preprocessing import StandardScaler
# Standardize only the numeric features. The ColumnTransformer above puts the
# passthrough columns (age, windSpeed, temperature) last, so they are selected
# from the end of the array instead of by the hard-coded start index 5, which
# silently depends on windDirection having exactly five categories.
# The scaler is fitted on the training rows only and then reused for the test
# rows so that no test-set statistics leak into the transformation.
N_NUMERIC_FEATURES = 3
numeric_slice = slice(-N_NUMERIC_FEATURES, None)

standard_scaler = StandardScaler()
descriptive_train[:, numeric_slice] = standard_scaler.fit_transform(descriptive_train[:, numeric_slice])
descriptive_test[:, numeric_slice] = standard_scaler.transform(descriptive_test[:, numeric_slice])

In [25]:
descriptive_train

array([[0.0, 0.0, 0.0, 1.0, 0.0, 1.5327567925845271, -1.0703093619031436,
        -0.06506383000977495],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.2205405456956152, -0.2189269149347338,
        -0.8241418467904854],
       [0.0, 0.0, 0.0, 0.0, 1.0, -0.011027027284781006,
        -0.9000328725094616, -0.5205106400782012],
       [1.0, 0.0, 0.0, 0.0, 0.0, -1.3232432741736928,
        -0.04865042554105182, -0.21687943336591703],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.7608648826498731, 1.6541144683957678,
        0.6940141867709355],
       [0.0, 1.0, 0.0, 0.0, 0.0, -1.4776216561606237,
        -0.7297563831157797, -1.1277730535027697],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.2977297366890806, 1.313561489608404,
        2.0603546169762144]], dtype=object)

In [26]:
descriptive_test

array([[0.0, 1.0, 0.0, 0.0, 0.0, -1.5548108471540891,
        0.12162606385263015, -0.5205106400782012],
       [0.0, 1.0, 0.0, 0.0, 0.0, -1.0144865101998313,
        0.12162606385263015, 0.08675177334636713],
       [0.0, 0.0, 1.0, 0.0, 0.0, 2.4590270845061117, 0.97300851082104,
        -0.21687943336591703]], dtype=object)