# Transformations

A few common feature transformation scenarios:
* Scaling
* Log transform
* Polynomial expansion

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, root_mean_squared_error, r2_score

## Scaling

Scaling is useful for distance‑based algorithms.  Let's look at its impact on a dataset whose features are imbalanced in scale.

In [None]:
from sklearn.datasets import make_classification

# Synthetic two-class problem with two informative features.
x, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    flip_y=0,
    class_sep=1.0,
    random_state=0,
)

# Inflate the second feature by a factor of 1000 so the two columns live on
# very different scales.
x[:, 1] = x[:, 1] * 1000

# Stratified 80/20 split, so both subsets keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=0
)


In [None]:
# Colour each point by class: black for label 0, red for any other label.
point_colors = ['r' if label != 0 else 'k' for label in y]
plt.scatter(x[:, 0], x[:, 1], color=point_colors)

In [None]:
# Two K-NN classifiers (k=7) that differ only in whether the inputs are
# standardised before the neighbour search.
pipe_raw = Pipeline(steps=[
    ('knn', KNeighborsClassifier(n_neighbors=7)),
])
pipe_scaled = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=7)),
])

# Fit each variant on the training split and report held-out accuracy.
experiments = {'No scaling': pipe_raw, 'With scaling': pipe_scaled}
for name, pipe in experiments.items():
    pipe.fit(x_train, y_train)
    predictions = pipe.predict(x_test)
    acc = accuracy_score(y_test, predictions)
    print(f"{name:<12s}: test accuracy = {acc:.3f}")


Euclidean distance is dominated by whichever feature has the largest scale, so the remaining features are effectively ignored; scaling puts all features on an equal footing and fixes that.

In [None]:
# Same comparison as for K-NN, but with logistic regression as the estimator.
# The estimator step is named 'logreg' so the pipeline's step names describe
# what the step actually contains.
pipe_raw = Pipeline([('logreg', LogisticRegression())])

pipe_scaled = Pipeline([('scale', StandardScaler()),
                        ('logreg', LogisticRegression())])

# Fit each variant on the training split and report held-out accuracy.
for name, pipe in [('No scaling', pipe_raw), ('With scaling', pipe_scaled)]:
    pipe.fit(x_train, y_train)
    acc = accuracy_score(y_test, pipe.predict(x_test))
    print(f"{name:<12s}: test accuracy = {acc:.3f}")


## Log / power transforms

These transformations can act to linearize skewed data.  Let's look at an artificial dataset that's skewed.

$y = 10\log(1+x) + \text{noise}$

In [None]:
# Seeded generator so every run of the notebook produces the same dataset
# (and therefore the same metrics further down).
rng = np.random.default_rng(0)

n = 1000
# Right-skewed feature: exponential draws with scale 2, as a single column.
x = rng.exponential(scale=2, size=(n, 1))
# Gaussian noise with standard deviation 0.3.
e = rng.normal(0, 0.3, size=n)

# Target is linear in log(1 + x), not in x itself.
y = 10 * np.log1p(x[:, 0]) + e

# Hold out 20% of the samples for evaluation; fixed seed for a stable split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)


In [None]:
# Raw feature against the target, drawn as markers only (no connecting line).
plt.plot(x, y, marker='o', linestyle='none')

In [None]:
# Same scatter after applying log(1 + x) to the feature.
log_x = np.log1p(x)
plt.plot(log_x, y, marker='o', linestyle='none');

In [None]:
# Distribution of the raw feature, split into 30 bins.
n_bins = 30
plt.hist(x, bins=n_bins);

In [None]:
# Distribution of the feature after the log(1 + x) transform, in 30 bins.
log_feature = np.log1p(x)
plt.hist(log_feature, bins=30);

In [None]:
# Fit a plain linear regression on a manually log-transformed feature and
# report its RMSE on the held-out split.
model = LinearRegression()

# Apply log(1 + x) to both splits so train and test share the same feature space.
x_new = np.log1p(x_train)
x_new_test = np.log1p(x_test)

model.fit(x_new, y_train)

rmse = root_mean_squared_error(y_test, model.predict(x_new_test))
# Use an explicit label: this cell defines no `name`, and relying on a loop
# variable left over from another cell prints a wrong label or fails.
print(f"{'Log1p':<7s}: RMSE = {rmse:.3f}")


LinearRegression vs. log‑transformed LinearRegression

In [None]:
# Baseline: linear regression on the untransformed feature.
pipe_raw = Pipeline(steps=[
    ('lin', LinearRegression()),
])

# Variant: apply log(1 + x) inside the pipeline before the regression.
pipe_log = Pipeline(steps=[
    ('log1p', FunctionTransformer(np.log1p)),
    ('lin', LinearRegression()),
])

# Fit both on the training split and compare held-out RMSE.
candidates = {'No log': pipe_raw, 'Log1p': pipe_log}
for name, pipe in candidates.items():
    pipe.fit(x_train, y_train)
    predictions = pipe.predict(x_test)
    rmse = root_mean_squared_error(y_test, predictions)
    print(f"{name:<7s}: RMSE = {rmse:.3f}")


The log transform converts a curved relationship into one the linear model can capture almost perfectly.

## Polynomial features — adding interactions & curvature

$y = (x_1 - x_2)^2 + \text{noise}$

In [None]:
# Seeded generator so every run of the notebook produces the same dataset
# (and therefore the same metrics further down).
rng = np.random.default_rng(0)

m = 1200

# Two features drawn uniformly from [-3, 3).
x = rng.uniform(-3, 3, size=(m, 2))
# Gaussian noise with standard deviation 0.4.
noise = rng.normal(0, 0.4, size=m)
# Target is the squared difference of the two features, which expands to
# squared terms plus an interaction term.
y = (x[:, 0] - x[:, 1])**2 + noise

# Hold out 20% of the samples for evaluation; fixed seed for a stable split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)


In [None]:
# Plot the two features against each other, coloured by the target value.
first_feature, second_feature = x[:, 0], x[:, 1]
plt.scatter(first_feature, second_feature, c=y)

In [None]:
# Baseline: linear regression on the two raw features.
pipe_lin = Pipeline(steps=[
    ('lin', LinearRegression()),
])

# Variant: expand to degree-2 polynomial terms (no bias column) before the
# regression.
pipe_poly = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('lin', LinearRegression()),
])

# Fit both on the training split and compare held-out R2.
candidates = {'Linear': pipe_lin, 'Poly d=2': pipe_poly}
for name, pipe in candidates.items():
    pipe.fit(x_train, y_train)
    predictions = pipe.predict(x_test)
    r2 = r2_score(y_test, predictions)
    print(f"{name:<8s}: R2 = {r2:.3f}")


PolynomialFeatures supplies squared and interaction terms that turn a previously unlearnable pattern into a linear one.

## Transformation Results

| Transformation          | Model type helped                   | Metric gain (above demos) |
| ----------------------- | ----------------------------------- | ---------------------- |
| **StandardScaler**      | Distance‑based (K‑NN, SVM, K‑Means) | Accuracy 0.48 → 0.99   |
| **Log1p / power**       | Linear                              | RMSE 1.84 → 0.29       |
| **Polynomial features** | Any linear model                    | R2 0.000 → 0.997       |


Raw features rarely line up with an algorithm’s assumptions.

Clever application of transformations helps to illuminate signals, improve convergence, and improve generalization.