## Load necessary modules

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline

## Load necessary data

In [None]:
# Read the raw CSV into a DataFrame, then preview its first five rows.
drugs = pd.read_csv(filepath_or_buffer="raw_data/drug-use-by-age.csv")
drugs.head(n=5)

## Clean `age` such that it is an integer

In [None]:
# Convert `age` to an integer by keeping the first run of digits in each label
# (presumably the lower bound for range-style labels -- confirm against the
# raw CSV). Casting to `str` first keeps this cell safe to re-run: once `age`
# is already an integer column, the bare `.str` accessor would raise.
# Extracting the digits, instead of slicing a fixed two characters, also copes
# with one-digit and three-digit labels.
drugs["age"] = (
    drugs["age"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
    .astype(int)
)

## Train Test Split Data

In [None]:
# Hold out 30% of the rows for testing. `age` is the target and every other
# column is kept on the feature side. The seed is fixed via `random_state`.
X_train, X_test, y_train, y_test = train_test_split(
    drugs.drop(columns=["age"]),
    drugs["age"],
    test_size=0.3,
    random_state=624,
)

### Note about EDA

Traditionally, it is best practice to visually explore the contents of `X_train` through a process known as Exploratory Data Analysis (EDA). EDA reveals the distributions, anomalies, and trends in your data. This will ultimately lead you to a checklist of preprocessing steps to complete before you begin modeling:

* outliers that need to be removed/imputed
* features that need to be created (i.e. feature engineering)
* numerical features that need to be scaled
* categorical features that need to be one-hot encoded (OHE)

*In this case, we're skipping EDA only for the purpose of jumping straight to `Pipeline` objects*

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

## Let's create a `mapper` object that stores our preprocessing steps

In [None]:
# Preprocessing spec: the `n` and `alcohol-use` columns are selected together
# and passed through a single StandardScaler.
mapper = DataFrameMapper(
    features=[
        (["n", "alcohol-use"], StandardScaler()),
    ],
)

## Let's create a `Pipeline` object that will apply our preprocessing steps before the data goes into our model

In [None]:
# Chain the preprocessing mapper and the linear regression so that both are
# fitted and applied as one estimator.
pipe = Pipeline(
    [("preprocessing", mapper), ("linear_reg", LinearRegression())]
)

## Let's fit our pipeline on `X_train` and `y_train`

In [None]:
# Fit every pipeline step on the training split.
pipe.fit(X=X_train, y=y_train)

## Store predictions

In [None]:
# Predict on the held-out features, then show the first five predictions.
y_pred = pipe.predict(X_test)
y_pred[:5]

In [None]:
# First five true targets (by position), to compare with the predictions.
y_test.iloc[:5]