In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Preprocessing

Which features should we use to train our models?

In [None]:
# Load the mall-customers dataset (path is relative to the working directory).
data = pd.read_csv('Mall_Customers.csv')
# Leave the preview as the cell's last expression so Jupyter renders it as a
# formatted table instead of the plain-text dump produced by print().
data.head()

In [3]:
# Columns used as model inputs, and the column we want to predict.
features = ['Gender', 'Age', 'Annual Income (k$)']
target = ['Spending Score (1-100)']

# Extract both selections as NumPy arrays. `target` is a one-element list,
# so y comes out as a 2-D column array rather than a flat vector.
X, y = data.loc[:, features].values, data.loc[:, target].values

## One-hot encoding

<img src="https://miro.medium.com/max/1879/1*O_pTwOZZLYZabRjw3Ga21A.png" height=200>

_* Image is taken from: https://morioh.com/p/811a5d22bbca_




In [None]:
# Take a copy of the gender column. A plain X[:, 0] slice is a view into X, and
# the label-encoding cell overwrites that column in place, which would silently
# change `genders` too (re-running the one-hot cell would then produce nothing).
genders = X[:, 0].copy()
genders

In [None]:
# One-hot vector for each known gender label, laid out as [is_male, is_female].
gender_to_one_hot = {'Male': [1, 0], 'Female': [0, 1]}

one_hot_encoding = []
for sample in genders:
  encoded = gender_to_one_hot.get(sample)
  # Labels other than 'Male' / 'Female' are skipped, so no row is emitted for them.
  if encoded is not None:
    one_hot_encoding.append(list(encoded))

one_hot_encoding[:10]

## Label encoding

In [6]:
# Encode gender as an integer label in place: Male -> 0, Female -> 1.
gender_column = X[:, 0]  # basic slice = view, so the writes below land in X itself
gender_column[gender_column == 'Male'] = 0
gender_column[gender_column == 'Female'] = 1

In [None]:
# Preview the first 10 rows of X; the first column should now hold the 0/1 gender labels.
X[:10]

## Normalization

Are the features on comparable scales, so that each one carries equal weight?

In [None]:
# Show the first rows of the raw data to compare the value ranges of the columns.
data.head()

> Normalization

$X_{norm} = \frac{X - X_{min}}{X_{max} - X_{min}}$ (Min-Max)

$X_{norm} = \frac{X}{max(X)}$ (Max)

$X_{norm} = \frac{X}{\sum_{i}{|X_i|}}$ (L1 Norm)

$X_{norm} = \frac{X}{\sqrt{\sum_{i}{X_i^2}}}$ (L2 Norm)

> Standardization

$X_{std} = \frac{X - mean(X)}{std(X)}$

In [9]:
# Min-max scaling: rescale every column of X to the [0, 1] range.
# X was extracted while the gender column still held strings, so it is an
# object array; cast to float to get a proper numeric array as the result.
X = X.astype(float)

minX = X.min(axis=0)
maxX = X.max(axis=0)

# A constant column has max == min; use 1 as its divisor to avoid dividing by zero
# (such a column simply becomes all zeros).
rangeX = maxX - minX
rangeX[rangeX == 0] = 1.0

X = (X - minX) / rangeX

# Alternative scaling: sklearn.preprocessing.normalize(X, norm='l2')

In [None]:
# Preview the first 10 rows of X after min-max scaling.
X[:10]

# Dealing with Empty values

_Which features should be excluded from this operation?_

In [None]:
# Load seaborn's bundled Titanic example dataset.
titanic_data = sns.load_dataset('titanic')

# Leave the preview as the cell's last expression so Jupyter renders it as a
# formatted table instead of the plain-text dump produced by print().
titanic_data.head()

In [12]:
## Dropping empty values
# titanic_data = titanic_data.dropna()

_Can missing features be replaced with the most common or expected (mean) values? What consequences might this have?_

In [13]:
## Replacing empty values with most-common or mean values
# Reload a fresh copy so this cell gives the same result every time it is run.
titanic_data = sns.load_dataset('titanic')

# Each filled column is assigned back explicitly. Calling
# titanic_data[col].fillna(..., inplace=True) is chained assignment: it emits a
# warning on pandas 2.x and leaves titanic_data unchanged under Copy-on-Write.

# Categorical columns: fill with the most frequent value.
most_common_deck = titanic_data["deck"].value_counts().idxmax()
titanic_data["deck"] = titanic_data["deck"].fillna(most_common_deck)

# Numeric column: fill with the mean.
mean_age = titanic_data["age"].mean()
titanic_data["age"] = titanic_data["age"].fillna(mean_age)

most_common_embarked = titanic_data["embarked"].value_counts().idxmax()
titanic_data["embarked"] = titanic_data["embarked"].fillna(most_common_embarked)

# Separate name so the 'embarked' value above is not overwritten by the town name.
most_common_embark_town = titanic_data["embark_town"].value_counts().idxmax()
titanic_data["embark_town"] = titanic_data["embark_town"].fillna(most_common_embark_town)

In [None]:
# Display the frame after filling the missing values.
titanic_data

_What disadvantages might there be in predicting empty values?_

In [15]:
# Training models for empty values
# Reload a fresh copy so earlier imputation cells do not affect this section.
titanic_data = sns.load_dataset('titanic')

# Fill (not remove) missing ages with the mean age, so the 'age' feature used
# below has no gaps. The result is assigned back explicitly because
# titanic_data["age"].fillna(..., inplace=True) is chained assignment: it warns
# on pandas 2.x and leaves titanic_data unchanged under Copy-on-Write.
mean_age = titanic_data["age"].mean()
titanic_data["age"] = titanic_data["age"].fillna(mean_age)
# Alternative: drop the rows without an age instead of imputing them.
# titanic_data = titanic_data[titanic_data["age"].notnull()]

# Input columns for the model, and the column whose missing values it will predict.
features = ["pclass", "age", "fare"]
label = "deck"

In [16]:
# Boolean row masks: rows whose deck is missing (to be predicted) and
# rows whose deck is known (used for training).
deck_column = titanic_data["deck"]
nan_idxs = deck_column.isna()
fill_idxs = ~nan_idxs

In [17]:
# Training set: feature values and deck labels of the rows with a known deck.
X = titanic_data.loc[fill_idxs, features].values
y = titanic_data.loc[fill_idxs, label].values

In [18]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the rows whose deck is known;
# fit() returns the estimator itself, so construction and fitting can be chained.
model = KNeighborsClassifier(n_neighbors=5).fit(X, y)

In [19]:
# Predict a deck for every row where it is missing, using the same feature columns.
missing_deck_features = titanic_data.loc[nan_idxs, features].values
predictions = model.predict(missing_deck_features)

In [20]:
# Write the predicted decks into the rows where deck was missing. Only the
# 'deck' column is targeted, instead of rebuilding and re-assigning every
# column of those rows through a masked frame assignment.
titanic_data.loc[nan_idxs, "deck"] = predictions

In [None]:
# Display the frame after the missing deck values were filled with model predictions.
titanic_data

# Splitting the dataset

In [22]:
from sklearn.model_selection import train_test_split

# First split: 80% Train, 20% held out (temporarily stored in X_test / y_test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

# Second split: halve the held-out 20% -> overall 80% Train, 10% Validation, 10% Test
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [None]:
# Report how many samples ended up in the full set and in each partition.
n_total = len(X)
n_train, n_valid, n_test = len(X_train), len(X_valid), len(X_test)

print(f'Total # of sample in whole dataset: {n_total}\n')
print(f'Total # of sample in train dataset: {n_train}')
print(f'Total # of sample in validation dataset: {n_valid}')
print(f'Total # of sample in test dataset: {n_test}')

# k-fold Cross validation

<img src="https://www.googleapis.com/download/storage/v1/b/kaggle-forum-message-attachments/o/inbox%2F4788946%2F82b5a41b6693a313b246f02d79e972d5%2FK%20FOLD.png?generation=1608195745131795&alt=media" height=300>

_* Image is taken from: https://www.kaggle.com/discussion/204878_