# Chapter 01
# Chapter 02
# Chapter 03

# Chapter 04 - Handling Numerical Data

## Rescaling a Feature

In [3]:
# Load libraries
import numpy as np
from sklearn import preprocessing

# Create feature
feature = np.array([[-500.5],
                    [-100.1],
                    [0],
                    [100.1],
                    [900.9]])

# Create scaler
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))

# Scale feature
scaled_feature = minmax_scale.fit_transform(feature)

# Show feature
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

It is called min-max scaling.

$x_i{new} = \frac{x_i - min(x)}{max(x) - min(x)}$	

## Standardizing a Feature

In [4]:
# Load libraries
import numpy as np
from sklearn import preprocessing

# Create feature
x = np.array([[-1000.1],
                    [-200.2],
                    [500.5],
                    [600.6],
                    [9000.9]])

# Create scaler
scaler = preprocessing.StandardScaler()

# Transform the feature
standardized = scaler.fit_transform(x)

# Show feature
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

Standardization is more used than min-max scaling. But it depends. For examle, PCA works better using standardization , while min-max works better with neural networks. It is a common alternative to min-max scaling. It becomes such that it has a zero(0) mean and one(1) standard deviation.

$x_{inew} = \frac{x_i - x_{hat}}{standard deviation}$

In [5]:
# print mean and standard deviation
print("Mean:", round(standardized.mean()))
print("Std:", round(standardized.std()))

Mean: 0.0
Std: 1.0


However, if we have outliers, it can badly effect on mean and std. To solve it, we can rescale the feature using the median and quartile range.

In [None]:
# Create scaler
robust_scaler = preprocessinge.RobustScaler()

# Transform feature
robust_scaler.fit_transform(x)

## Normalizing Observations

In [8]:
# Load libraries
import numpy as np
from sklearn.preprocessing import Normalizer

# Create feature
features = np.array([[0.5, 0.5],
                    [1.1, 3.4],
                    [1.5, 20.2],
                    [1.63, 34.4],
                    [10.9, 3.3]])

# Create normalizer
normalizer = Normalizer(norm="l2")

# Transform feature matrix
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

- Euclidean norm, often called L2
- Manhattan norm, often called L1

L2 norm is the distance between two point in New York for a bird, while L1 is the distance for human walking on the street, which is why it is calle Manhattan or Taxicab norm.

In [9]:
# Load libraries
import numpy as np
from sklearn.preprocessing import Normalizer

# Create feature
features = np.array([[0.5, 0.5],
                    [1.1, 3.4],
                    [1.5, 20.2],
                    [1.63, 34.4],
                    [10.9, 3.3]])

# Create normalizer
normalizer = Normalizer(norm="l1")

# Transform feature matrix
normalizer.transform(features)

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

Practically, L1 rescales an observation's value so they sum to 1.

## Generating Polynomial and Interaction Features

In [10]:
# Load libraries
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Create feature matrix
features = np.array([[2,3],
                     [2,3],
                     [2,3]])

# Create PolynomialFeatures object
polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)

# Create polynomial features
polynomial_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

## Transforming Features

In [11]:
# Load libraries
import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Create feature matrix
features = np.array([[2,3],
                     [2,3],
                     [2,3]])

def add_ten(x):
    return x + 10

# Create transformer
ten_transformer = FunctionTransformer(add_ten)

# Transform feature matrix
ten_transformer.transform(features)



array([[12, 13],
       [12, 13],
       [12, 13]])

In [13]:
# Alternatively
# Load library
import pandas as pd

# Create DataFrame
df = pd.DataFrame(features, columns=["feature_1", "feature_2"])

# Apply Function
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


## Detection Outliers

In [14]:
# Load libraries
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# Create simulated data
features, _ = make_blobs(n_samples=10,
                         n_features=2,
                         centers=1,
                         random_state=1)

features

array([[-1.83198811,  3.52863145],
       [-2.76017908,  5.55121358],
       [-1.61734616,  4.98930508],
       [-0.52579046,  3.3065986 ],
       [ 0.08525186,  3.64528297],
       [-0.79415228,  2.10495117],
       [-1.34052081,  4.15711949],
       [-1.98197711,  4.02243551],
       [-2.18773166,  3.33352125],
       [-0.19745197,  2.34634916]])

A common method is to assume the data is normally distributed and based on that assumption draw an ellipse around the data, classifying any observation inside the ellipse as an inlier(labeled as 1) and any observation outside the ellipse as an outlier (labeled as -1)

In [16]:
# Replace the first observation's values with extreme values
features[0,0] = 10000
features[0,1] = 10000

# Create detector
outlier_detector = EllipticEnvelope(contamination=.1)

# Fit detector
outlier_detector.fit(features)

# Predict outliers
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

If we expect our data to have few outliers, we can set contamination to something small.

## Handling Outliers

In [18]:
# Load libraries
import pandas as pd

# Create DataFrame
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

# Filter observations
houses[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [20]:
import numpy as np

# Create feature based on boolean condition
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1)

houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [22]:
# Log feature to dampen the effect of outliers
houses['Log_of_Square_Feet'] = [np.log(x) for x in houses["Square_Feet"]]

# Show data
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


If you suffer from outliers still, use RobustScaler.

## Discretizating Features

In [23]:
# Load libraries
import numpy as np
from sklearn.preprocessing import Binarizer

# Create feature
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])

# Create binarizer
binarizer = Binarizer(18)

# Transform feature
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [26]:
# Alternatively
# Bin feature
np.digitize(age, bins=[18], right=True)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [25]:
# alternatively we can put multiple thresholds
# Bin feature
np.digitize(age, bins=[20,30,64], right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]])

## Grouping Observations Using Clustering

In [27]:
# Using k-means clustering
# Load libraries
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Make simulated feature matrix
features, _ = make_blobs(n_samples=50,
                         n_features=2,
                         centers=3,
                         random_state=1)

# Create DataFrame
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])

# Make k-means clusterer
clusterer = KMeans(3, random_state=0)

# Fit clusterer
clusterer.fit(features)

# Predict values
dataframe["group"] = clusterer.predict(features)

# view first few observations
dataframe.head()

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2


## Deleting Observations with Missing Values

In [38]:
# Load library
import numpy as np

# Create feature matrix
features = np.array([[1.1, 11.1],
                     [2.2, 22.2],
                     [3.3, 33.3],
                     [4.4, 44.4],
                     [np.nan, 55]])

# Keep only observations that are not (denoted by ~) missing
features[~np.isnan(features).any(axis=1)] # row

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [40]:
# Alternatively we can drop missing values using pandas
# load library
import pandas as pd

# Load data
df = pd.DataFrame(features, columns=["feature_1", "feature_2"])

# Remove observations with missing values
df.dropna()

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


## Imputing Missing Values

In [1]:
# Load libraries
import numpy as np
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

# Make a simulated feature matrix
features, _ = make_blobs(n_samples=1000,
                         n_features=2,
                         random_state=1)

# Standardize the feature
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Replace the first feature's value with a missing value
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan

# Predict the missing values in the feature matrix
features_knn_imputed = KNN(k=5, verbose=0).complete(standardized_features)

# Compate true and imputed values
print("True Value:",true_value)
print("Imputed Value:",features_knn_imputed[0,0])

ImportError: cannot import name 'logsumexp'

In [46]:
# Alternatively, we can use scikit-learn's Imputer module
# Load library
from sklearn.preprocessing import Imputer

# Create imputer
mean_imputer = Imputer(strategy="mean", axis=0)

# Impute values
features_mean_imputed = mean_imputer.fit_transform(features)

# Compate true and imputed values
#print("True Value:", true_value)
print("Imputed Value:", features_mean_imputed[0,0])

Imputed Value: 1.1


KNN, k nearest observations to predict the missing value. In smaller dataset, it works well, but quickly becomes problematic if a dataset has millions of observations. 