# Common pitfalls and recommended practices

<img src = 'https://media.istockphoto.com/photos/businessmen-of-the-pitfalls-and-miniature-picture-id657556628?b=1&k=20&m=657556628&s=170667a&w=0&h=vOvwJpfBZjBcgY0J67Fzr6-_bpRJc7V8kcsy_0-e8cQ='>

In [1]:
# Notebook display setup: widen the page, echo every expression, mute warnings.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = 'all'
import warnings
# Install a blanket "ignore" filter so no warnings are shown from here on.
warnings.filterwarnings(action='ignore')

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np

# 1. Inconsistent preprocessing

In [3]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# One seed shared by the data generator and the train/test split below.
random_state = 42
# Synthetic regression problem with a single feature and noise=1.
X, y = make_regression(random_state=random_state, n_features=1, noise=1)
# Hold out 40% of the samples as the test subset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=random_state)

## <font color='red'>Wrong</font>

In [4]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the train subset and train the model on the scaled features.
scaler = StandardScaler()
X_train_transformed = scaler.fit_transform(X_train)
model = LinearRegression().fit(X_train_transformed, y_train)
# Deliberately wrong (this is the pitfall being demonstrated): the model was
# trained on scaled features but is asked to predict on the raw, unscaled X_test.
mean_squared_error(y_test, model.predict(X_test))

62.80867119249539

## <font color='blue'>Right</font>

In [5]:
# Apply the scaler fitted on the train subset to the test subset
# (transform only, no refit) so both subsets get the same preprocessing.
X_test_transformed = scaler.transform(X_test)
# Evaluate the already-trained model on the consistently scaled test features.
mean_squared_error(y_test, model.predict(X_test_transformed))

0.902797546636954

## <font color='green'>pipeline</font>

In [6]:
from sklearn.pipeline import make_pipeline

# Bundle scaling and regression into one estimator; `model` is rebound to it.
model = make_pipeline(StandardScaler(), LinearRegression())
# Fitting on the raw train subset fits both pipeline steps.
model.fit(X_train, y_train)

# Raw X_test is passed here; the recorded score equals the manually
# transformed version, so the scaling step is applied inside predict.
mean_squared_error(y_test, model.predict(X_test))

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

0.902797546636954

# 2. Data leakage
## 2.1. Data leakage during pre-processing

In [10]:
# Random data for the leakage demo: features are standard-normal draws and
# labels are sampled from the same generator without reference to the features.
n_samples = 200
n_features = 1000
n_classes = 2
rng = np.random.RandomState(seed=42)
# Draw order matters for reproducibility: features first, then labels.
X = rng.standard_normal(size=(n_samples, n_features))
y = rng.choice(n_classes, size=n_samples)

## <font color='red'>Wrong</font>

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Incorrect preprocessing: the entire data is transformed
# (the selector is fitted on all of X and y, i.e. before the split below,
# so the rows that later become the test subset take part in choosing the
# 25 features -- this is the leak being demonstrated).
X_selected = SelectKBest(k=25).fit_transform(X, y)

# The split happens only after feature selection has already seen every row.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, random_state=42)
gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(X_train, y_train)

# Accuracy on the test subset of the already-selected features.
y_pred = gbc.predict(X_test)
accuracy_score(y_test, y_pred)

GradientBoostingClassifier(random_state=1)

0.64

## <font color='blue'>Right</font>

In [12]:
# Split first, on the full feature matrix, before any preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)

# Fit the selector on the train subset only.
select = SelectKBest(k=25)
X_train_selected = select.fit_transform(X_train, y_train)

gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(X_train_selected, y_train)

# The test subset is only transformed with the selector fitted on train.
X_test_selected = select.transform(X_test)
y_pred = gbc.predict(X_test_selected)
accuracy_score(y_test, y_pred)

GradientBoostingClassifier(random_state=1)

0.48

## <font color='green'>pipeline</font>

In [14]:
from sklearn.pipeline import make_pipeline
# Split the raw data first; the pipeline is then fitted on the train subset only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)
# Feature selection and the classifier are chained into a single estimator.
pipeline = make_pipeline(SelectKBest(k=25),
                         GradientBoostingClassifier(random_state=1))
pipeline.fit(X_train, y_train)

# Raw X_test goes in; the recorded accuracy matches the manual
# fit-on-train / transform-on-test version.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

Pipeline(steps=[('selectkbest', SelectKBest(k=25)),
                ('gradientboostingclassifier',
                 GradientBoostingClassifier(random_state=1))])

0.48

## 2.2. How to avoid data leakage

- Always split the data into train and test subsets first, particularly before any preprocessing steps (선교후처).
- Never include test data when using the fit and fit_transform methods. Using all the data, e.g., fit(X), can result in overly optimistic scores.
- Conversely, the transform method should be used on both train and test subsets as the same preprocessing should be applied to all the data. This can be achieved by using fit_transform on the train subset and transform on the test subset.
- The scikit-learn pipeline is a great way to prevent data leakage as it ensures that the appropriate method is performed on the correct data subset. The pipeline is ideal for use in cross-validation and hyper-parameter tuning functions.

# 3. Controlling randomness
## 3.1. Using None or RandomState instances, and repeated calls to fit and split
### 3.1.1. Estimators

In [28]:
# Create a RandomState seeded with 0; the cell only displays the object's repr.
np.random.RandomState(0)

RandomState(MT19937) at 0x29DCF862640

In [82]:
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification
import numpy as np

# A single RandomState instance is handed to both the data generator and
# the estimator.
rng = np.random.RandomState(0)
X, y = make_classification(n_features=5, random_state=rng)
sgd = SGDClassifier(random_state=rng)
# Fit the same estimator twice on identical data. The recorded coefficients
# differ between the two calls -- presumably because each fit draws from the
# shared RandomState instance and advances it.
sgd.fit(X, y).coef_
sgd.fit(X, y).coef_

array([[ 8.85418642,  4.79084103, -3.13077794,  8.11915045, -0.56479934]])

array([[ 6.70814003,  5.25291366, -7.55212743,  5.18197458,  1.37845099]])

### 3.1.2. CV splitters

In [43]:
from sklearn.model_selection import KFold

# X and y are the same array holding the integers 0..9.
X = y = np.arange(10)
rng = np.random.RandomState(0)
# Shuffled 2-fold splitter seeded with a RandomState instance, not an int.
cv = KFold(n_splits=2, shuffle=True, random_state=rng)

# First pass over the splitter.
for train, test in cv.split(X, y):
    print(train, test)
print('*'*100)
# Second pass over the same splitter: the recorded output shows different
# folds than the first pass.
for train, test in cv.split(X, y):
    print(train, test)

[0 3 5 6 7] [1 2 4 8 9]
[1 2 4 8 9] [0 3 5 6 7]
****************************************************************************************************
[0 4 6 7 8] [1 2 3 5 9]
[1 2 3 5 9] [0 4 6 7 8]


## 3.2. Common pitfalls and subtleties
### 3.2.1. Estimators

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
import numpy as np

X, y = make_classification(random_state=0)

# Estimator seeded with an integer.
rf_123 = RandomForestClassifier(random_state=123)
cross_val_score(rf_123, X, y)


# Estimator seeded with a RandomState instance.
# NOTE(review): per scikit-learn's guidance on randomness, an int seed should
# give every fold's clone the same RNG while an instance lets the RNG vary
# across folds -- confirm against the library documentation.
rf_inst = RandomForestClassifier(random_state=np.random.RandomState(0))
cross_val_score(rf_inst, X, y)

array([0.85, 0.95, 0.95, 0.9 , 0.9 ])

array([0.9 , 0.95, 0.95, 0.9 , 0.9 ])

In [55]:
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np

# One RandomState instance seeds both the data generator and the CV splitter.
rng = np.random.RandomState(0)
X, y = make_classification(random_state=rng)
cv = KFold(shuffle=True, random_state=rng)
lda = LinearDiscriminantAnalysis()
nb = GaussianNB()

# Score each estimator with the same splitter object.
# NOTE(review): the KFold cell earlier in this notebook shows that a splitter
# holding a RandomState instance yields different folds on each split() call,
# so the two estimators are presumably scored on different folds here -- confirm.
for est in (lda, nb):
    print(cross_val_score(est, X, y, cv=cv))

[0.8  0.75 0.75 0.7  0.85]
[0.85 0.95 0.95 0.85 0.95]


In [90]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np

# The same RandomState instance is passed to data generation, the estimator
# and the train/test split.
rng = np.random.RandomState(0)
X, y = make_classification(random_state=rng)
rf = RandomForestClassifier(random_state=rng)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=rng)
# Fit on the train subset and report the score on the held-out test subset.
rf.fit(X_train, y_train).score(X_test, y_test)

0.84

# End