<a href="https://colab.research.google.com/github/arkoleini/Kolini-ML/blob/main/ML_PipeLine_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Learn Pipeline
[Learning Course YouTube](https://www.youtube.com/watch?v=HZ9MUzCRlzI)



In [None]:
from sklearn.pipeline import Pipeline
# Feature scaling
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# Ordered (name, estimator) pairs: standardise the features, then classify.
steps = [
    ("standard_scaler", StandardScaler()),
    ("classifier", LogisticRegression()),
]

In [None]:
# Show the (name, estimator) pairs that will make up the pipeline.
steps

[('standard_scaler', StandardScaler()), ('classifier', LogisticRegression())]

# Advantages of using `Pipeline()`

Wrapping the preprocessing and modelling steps in a single `Pipeline` object has several advantages:
*  **Prevents Data Leakage**: Crucially ensures that data preprocessing steps (like scaling or imputation) learn their parameters only from the training data, preventing information from the test set from inadvertently influencing the model's training and leading to more realistic performance estimates.
*   **Simplifies Cross-Validation**: Seamlessly integrates with cross-validation utilities (e.g., GridSearchCV), automatically applying all preprocessing and modeling steps correctly within each fold, reducing manual error and ensuring robust evaluation.
*  **Enhances Code Conciseness and Readability**: Chains multiple sequential steps (preprocessing, modeling) into a single, clean object, making your code easier to write, understand, and maintain compared to manual step-by-step execution.
*  **Facilitates Model Persistence and Deployment**: Allows the entire workflow (from preprocessing to the final model) to be saved and loaded as a single unit, guaranteeing consistent application of the exact same transformations and model when making predictions on new data.

In [None]:
# Assemble the scaler and the classifier into a single estimator object.
pipe = Pipeline(steps=steps)

## Visual Pipeline

In [None]:
from sklearn import set_config

In [None]:
# Ask scikit-learn to display estimators as diagrams in notebook output.
set_config(display="diagram")

In [None]:
# Display the pipeline; with display="diagram" set, it is drawn as a diagram.
pipe

*   We have the option to create our own dataset or to use an existing one.

*   For our purposes, we create our own synthetic dataset.

In [None]:
# Create a synthetic classification dataset with 1000 samples.
# A fixed random_state makes the generated data -- and therefore every split,
# fit and prediction derived from it -- identical on each "Restart & Run All".
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, random_state=42)

In [None]:
# Sanity-check the dimensions of the feature matrix and the label vector.
print(
    f"Shape of X: {X.shape}",
    f"Shape of y: {y.shape}",
    sep="\n",
)


Shape of X: (1000, 20)
Shape of y: (1000,)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for evaluation; the fixed random_state keeps
# the split itself reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


In [None]:
# Preview the training features (NumPy abbreviates large arrays in the repr).
X_train

array([[-1.0223351 ,  0.69400551, -0.43813055, ...,  0.98762359,
         0.33359649,  0.73199734],
       [ 0.57529284, -0.63013334, -0.53399755, ...,  1.51422654,
        -2.03996079,  1.08284912],
       [ 2.10085675,  1.49389732,  0.02574664, ..., -0.54084343,
         1.64214729,  1.39172801],
       ...,
       [ 1.57304086, -0.8153434 , -0.17201214, ...,  0.84717704,
        -0.77038495,  1.35420858],
       [-0.80448144,  0.23155108,  0.44518324, ...,  0.22578615,
        -0.88176923,  1.24172   ],
       [ 1.0912274 , -0.06220616,  0.29511805, ..., -0.24882055,
        -0.1496322 , -0.96822927]])

In [None]:
# Fit every step on the training split only, so the scaler's statistics and
# the classifier's weights never see the test data.
pipe.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out split; the pipeline applies the fitted
# scaler to X_test before handing it to the classifier.
y_pred = pipe.predict(X_test)

In [None]:
# Inspect the predicted class labels for the test split.
y_pred

array([1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,

# Complex Pipeline


In [None]:
# Numerical sub-pipeline:
#   1. replace missing values with the column mean (SimpleImputer),
#   2. standardise the result (StandardScaler).
from sklearn.impute import SimpleImputer

numberic_processor = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [None]:
# Display the numerical sub-pipeline.
numberic_processor

In [None]:
# Categorical sub-pipeline:
#   1. replace missing values with the constant "missing" (SimpleImputer),
#   2. one-hot encode the categories (OneHotEncoder); handle_unknown="ignore"
#      means categories unseen during fit do not raise at transform time.
from sklearn.preprocessing import OneHotEncoder

categorical_processor = Pipeline(
    [
        ("imputation_constant", SimpleImputer(strategy="constant", fill_value="missing")),
        ("oneHot", OneHotEncoder(handle_unknown="ignore")),
    ]
)


## Combine both pipelines

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its own sub-pipeline. Columns are
# selected by name, and columns not listed here are dropped by default
# (remainder="drop").
# NOTE(review): "hieght" looks like a misspelling of "height" -- confirm it
# matches the column name of the DataFrame this will be fitted on.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_processor, ["gender", "city"]),
        ("numerical", numberic_processor, ["age", "hieght"]),
    ]
)

In [None]:
# Display the combined column-wise preprocessor.
preprocessor

## Build the full pipeline: preprocessing followed by the classifier

In [None]:
from sklearn.pipeline import make_pipeline

# Chain the column-wise preprocessor with a classifier. make_pipeline names
# the steps automatically, so no explicit (name, estimator) tuples are needed.
# Note: this rebinds `pipe`, replacing the pipeline fitted earlier in the notebook.
pipe= make_pipeline(preprocessor, LogisticRegression())

In [None]:
# Display the full pipeline: column-wise preprocessing followed by the classifier.
pipe