In [53]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import Binarizer
from sklearn.cluster import KMeans
from fancyimpute import KNN
from sklearn.datasets import make_blobs # For Make simulated feature matrix

# **Rescaling a Feature**

In [54]:
# Build a single-column feature whose values span very different magnitudes
feature = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)

# Min-max scaler that maps the observed minimum to 0 and the maximum to 1
minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Learn the min/max from the feature and rescale it in one step
feature_minmax = minmax.fit_transform(feature)

# Display the rescaled feature
feature_minmax

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

### Min-Max Scaler Formula

$$
x' = \frac{x - x_{\text{min}}}{x_{\text{max}} - x_{\text{min}}} \cdot (b - a) + a
$$

- $x$ is the original value
- $x_{\text{min}}$ is the minimum value in the feature
- $x_{\text{max}}$ is the maximum value in the feature
- $a$ is the minimum of the target range (typically 0)
- $b$ is the maximum of the target range (typically 1)


# **Standardizing a Feature**

In [55]:
# Create feature (one column; the last value is far larger than the rest)
x = np.array(
    [
        [-1000.1],
        [-200.2],
        [500.5],
        [600.6],
        [9000.9],
    ]
)

# Create scaler (named in parallel with `robust_scaler` used further down)
standard_scaler = preprocessing.StandardScaler()
# Fit on x and rescale it to zero mean and unit standard deviation
standardized = standard_scaler.fit_transform(x)
# Show feature
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [56]:
# Sanity-check the standardization: the mean should round to 0 and the
# standard deviation should be 1
feature_mean = np.mean(standardized)
feature_std = np.std(standardized)
print("Mean:", round(feature_mean))
print("Standard deviation:", feature_std)

Mean: 0
Standard deviation: 1.0


### Standard Scaling Formula (Z-score Normalization)


$$
z = \frac{x - \mu}{\sigma}
$$

Where:

- $x$ is the original value
- $\mu$ is the mean of the feature
- $\sigma$ is the standard deviation of the feature
- $z$ is the standardized value (also called the Z-score)


In [57]:
# RobustScaler centers on the median and scales by the interquartile range,
# so it is the better choice when the data has significant outliers
robust_scaler = preprocessing.RobustScaler()

# Fit on x and display the rescaled feature
robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

# **Normalizing Observations**

In [58]:
# Feature matrix: one observation per row, two features per observation
features = np.array([
    [0.5, 0.5],
    [1.1, 3.4],
    [1.5, 20.2],
    [1.63, 34.4],
    [10.9, 3.3],
])

# L2 normalizer: rescales every row to unit Euclidean length
features_l2_norm = Normalizer(norm="l2")

# Normalizer is stateless, so transform() can be called without fit()
features_l2_norm.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [59]:
# L1 normalizer: rescales every row so that its absolute values sum to 1
features_l1_norm = Normalizer(norm="l1")

# Apply it to the same feature matrix and display the result
features_l1_norm.transform(features)

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

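Normalization Formulas
---

Each observation (row) $\mathbf{x}$ is divided by its norm, $\mathbf{x}' = \mathbf{x} / \|\mathbf{x}\|$, so the rescaled row has unit norm. The two norms used above are: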
**L2 Normalization (Euclidean Norm)**

$$
\|\mathbf{x}\|_2 = \sqrt{x_1^2 + x_2^2 + \cdots + x_n^2}
$$

---

**L1 Normalization (Manhattan Norm)**

$$
\|\mathbf{x}\|_1 = |x_1| + |x_2| + \cdots + |x_n|
$$

# **Generating Polynomial and Interaction Features**

In [60]:
# Feature matrix with two columns (x1, x2)
features = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])

# Generate every polynomial term up to degree 2; include_bias=False leaves
# out the constant column of ones
polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)

# Fit and display the expanded feature matrix
polynomial_interaction.fit_transform(features)

array([[ 1.,  2.,  1.,  2.,  4.],
       [ 2.,  3.,  4.,  6.,  9.],
       [ 3.,  4.,  9., 12., 16.],
       [ 4.,  5., 16., 20., 25.]])

With `degree = 2`, the generated features look like this:
$$
[x_1,\ x_2,\ x_1^2,\ x_1 x_2,\ x_2^2]
$$
With `degree = 3`, the generated features look like this:
$$
[x_1,\ x_2,\ x_1^2,\ x_1 x_2,\ x_2^2,\ x_1^3,\ x_1^2 x_2,\ x_1 x_2^2,\ x_2^3]
$$

- By default, `PolynomialFeatures` includes interaction features such as $x_1 x_2$.
- We can restrict the generated features to interaction features only by setting **`interaction_only=True`**.

In [61]:
# interaction_only=True keeps the original features and their product
# (x1 * x2) but drops the pure powers (x1^2, x2^2)
interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Fit and display the interaction features
interaction.fit_transform(features)

array([[ 1.,  2.,  2.],
       [ 2.,  3.,  6.],
       [ 3.,  4., 12.],
       [ 4.,  5., 20.]])

# **Transforming Features**

**We can apply a custom transformation to one or more features**

In [62]:
# Small integer feature matrix used to demonstrate a custom transformation
features = np.array([
    [1, 2],
    [2, 3],
    [3, 4],
])

# Custom transformation: shift every value down by two
def sub_two(x):
    """Return ``x`` with 2 subtracted (element-wise when ``x`` is an array)."""
    offset = 2
    return x - offset
# Wrap the function so it can be used like any other scikit-learn transformer
two_transformer = FunctionTransformer(func=sub_two)

# Apply the wrapped function to the feature matrix and display the result
two_transformer.transform(features)

array([[-1,  0],
       [ 0,  1],
       [ 1,  2]])

# **Detecting Outliers**

In [63]:
# Simulation settings
n_samples = 10
n_features = 2
center = np.array([5.0, 5.0])
std_dev = 1.0
random_state = 1

# Seed NumPy's global RNG so the simulated data is reproducible
np.random.seed(random_state)

# Standard-normal draws, scaled by std_dev and shifted to the center
features = center + std_dev * np.random.randn(n_samples, n_features)

# Turn the first observation into an obvious outlier in both features
features[0, :] = 10000

print(features)

[[1.00000000e+04 1.00000000e+04]
 [4.47182825e+00 3.92703138e+00]
 [5.86540763e+00 2.69846130e+00]
 [6.74481176e+00 4.23879310e+00]
 [5.31903910e+00 4.75062962e+00]
 [6.46210794e+00 2.93985929e+00]
 [4.67758280e+00 4.61594565e+00]
 [6.13376944e+00 3.90010873e+00]
 [4.82757179e+00 4.12214158e+00]
 [5.04221375e+00 5.58281521e+00]]


In [64]:
# Create detector; contamination is the expected proportion of outliers.
# random_state pins the randomized covariance estimate, so re-running this
# cell gives the same labels regardless of the global NumPy RNG state.
outlier_detector = EllipticEnvelope(
    contamination=0.1,
    random_state=random_state,
)
# Fit detector
outlier_detector.fit(features)
# Predict labels: -1 marks an outlier, 1 marks an inlier
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

- Any point inside the ellipse is considered an **inlier (labeled 1)**, and any point outside the ellipse is considered an **outlier (labeled -1)**
- Instead, we can look at **individual features** and identify extreme values in those features using the **interquartile range (IQR)**

In [65]:
# Use only the first column as a 1-D feature
feature = features[:, 0]

# Create a function to return index of outliers
def indicies_of_outliers(x, whisker=1.5):
    """Return the indices of values lying outside the IQR fences.

    A value is flagged when it is below ``q1 - whisker * iqr`` or above
    ``q3 + whisker * iqr``, where ``q1`` and ``q3`` are the 25th and 75th
    percentiles of ``x`` and ``iqr = q3 - q1``.

    Parameters
    ----------
    x : array-like
        Values to inspect. Converted with ``np.asarray``, so plain Python
        sequences are accepted as well as NumPy arrays.
    whisker : float, default 1.5
        Multiple of the IQR added beyond the quartiles to form the fences.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where`` on the outlier mask: one index array per
        dimension of ``x``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # No data means no outliers; skip the percentile call on an empty array
        return np.where(np.zeros(x.shape, dtype=bool))

    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    return np.where((x < lower_bound) | (x > upper_bound))

# Locate the outlying observations in the selected feature
indicies_of_outliers(feature)

(array([0]),)

# **Discretizing Features**

In [66]:
# Ages as a single-column feature
age = np.array([6, 12, 20, 36, 65]).reshape(-1, 1)

# Binarizer maps values above the threshold to 1 and all others to 0
binarizer = Binarizer(threshold=18)

# Apply the threshold and display the binary feature
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

**We can also use NumPy’s `digitize` to bin a feature with multiple thresholds**

In [67]:
# Bin ages by several edges: <20 -> 0, [20, 30) -> 1, [30, 64) -> 2, >=64 -> 3
np.digitize(age, bins=[20, 30, 64], right=False)

array([[0],
       [0],
       [1],
       [2],
       [3]])

In [68]:
# A single bin edge acts like a binarizer; with right=False the edge itself
# (an age of exactly 18) falls into bin 1
np.digitize(age, bins=[18], right=False)

array([[0],
       [0],
       [1],
       [1],
       [1]])

# **Grouping Observations Using Clustering**

In [69]:
# Simulate 50 two-dimensional observations scattered around 3 centers
features, _ = make_blobs(
    n_samples=50,
    n_features=2,
    centers=3,
    random_state=1,
)

# Wrap the matrix in a DataFrame with named columns and preview it
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
dataframe.head(5)

Unnamed: 0,feature_1,feature_2
0,-9.877554,-3.336145
1,-7.28721,-8.353986
2,-6.943061,-7.023744
3,-7.440167,-8.791959
4,-6.641388,-8.075888


In [70]:
# k-means with 3 clusters; the fixed seed keeps the assignment reproducible
clusterer = KMeans(n_clusters=3, random_state=0)

# Learn the cluster centers from the feature matrix
clusterer.fit(features)

# Store each observation's predicted cluster as a new column
dataframe["group"] = clusterer.predict(features)

# Preview the labeled observations
dataframe.head(5)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


# **Imputing Missing Values**

**If we have a small amount of data, predict the missing values using k-nearest neighbors (KNN)**

In [71]:
# Simulate 1000 two-dimensional observations
features, _ = make_blobs(n_samples=1000, n_features=2, random_state=1)

# Display the feature matrix
features

array([[-3.05837272,  4.48825769],
       [-8.60973869, -3.72714879],
       [ 1.37129721,  5.23107449],
       ...,
       [-1.91854276,  4.59578307],
       [-1.79600465,  4.28743568],
       [-6.97684609, -8.89498834]])

In [72]:
# Put both features on a common scale (zero mean, unit variance):
# fit the scaler first, then transform the same matrix
scaler = preprocessing.StandardScaler().fit(features)
standardized_features = scaler.transform(features)

In [73]:
# Replace the first feature's first value with a missing value.
# The true value is recomputed from the fitted scaler rather than read back
# from standardized_features, so re-running this cell after the entry has
# already been set to NaN still yields the real value instead of NaN.
true_value = scaler.transform(features[:1])[0, 0]
standardized_features[0, 0] = np.nan

In [74]:
# Fill the NaN entry with an estimate based on the 5 nearest neighbors
features_knn_imputed = KNN(
    k=5,
    verbose=0,
).fit_transform(standardized_features)



In [75]:
# Show the held-out value next to the KNN estimate for the same entry
imputed_value = features_knn_imputed[0, 0]
print("True Value:", true_value)
print("Imputed Value:", imputed_value)

True Value: 0.8730186113995938
Imputed Value: 1.0955332713113226
