In [1]:
#Pipelines - Basic Template

#Import required Python packages
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Load the sample dataset from the working directory.
my_df = pd.read_csv("pipeline_data.csv")

# Separate the target column ("purchase") from the predictor columns.
y = my_df["purchase"]
X = my_df.drop(columns=["purchase"])

# Split the data into training and test sets (20% held out for testing).
# Stratifying on y keeps the class proportions of the target the same in the
# train and test splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [3]:
# Column groups: each list is routed to its own preprocessing branch.
categorical_features = [
    "gender",
]
numeric_features = [
    "age",
    "credit_score",
]

In [4]:
# Numeric branch, applied in order:
#   1. "imputer" - fill missing values with SimpleImputer's default strategy
#                  (the column mean, per the scikit-learn documentation);
#   2. "scaler"  - standardise each column with StandardScaler.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer()),
        ("scaler", StandardScaler()),
    ]
)


In [5]:
# Categorical branch, applied in order:
#   1. "imputer" - replace missing values with the constant "U" (unknown);
#   2. "ohe"     - one-hot encode. With handle_unknown="ignore", a category that
#                  was not seen during fitting (e.g. a gender value other than
#                  those in the training data) is encoded as all zeros instead
#                  of raising an error.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(fill_value="U", strategy="constant")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)


In [6]:
# Preprocessing pipeline: each transformer is applied only to the columns
# named in its own feature list, and the results are combined.
preprocessing_pipeline = ColumnTransformer(
    [
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)


In [7]:
# Apply the pipeline - model 1: logistic regression.
# Preprocessing and classifier are chained so both are fitted together.
clf = Pipeline(
    [
        ("preprocessing_pipeline", preprocessing_pipeline),
        ("classifier", LogisticRegression(random_state=42)),
    ]
)

# Train the full pipeline on the training split.
clf.fit(X_train, y_train)

In [8]:
# Predict classes for the held-out test split and report the accuracy.
y_pred_class = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.85

In [9]:
# Model 2: random forest, built on the same preprocessing steps.
# NOTE: this rebinds `clf`, so the logistic regression pipeline fitted earlier
# is no longer reachable under that name; `clf` is the random forest pipeline
# from here on.
clf = Pipeline(
    [
        ("preprocessing_pipeline", preprocessing_pipeline),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

# Train on the training split.
clf.fit(X_train, y_train)

# Predict classes for the test split and report the accuracy.
y_pred_class = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.85

In [10]:
# Save the pipeline currently bound to `clf` to disk with joblib.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# will only work where that directory exists. Consider a path relative to the
# notebook, and keep it in sync with the cell that loads the model.
model_path = r"C:\Users\19144\OneDrive\Desktop\Data Science Infinity\Python Fundamentals\model.joblib"
joblib.dump(clf, model_path)

['C:\\Users\\19144\\OneDrive\\Desktop\\Data Science Infinity\\Python Fundamentals\\model.joblib']

In [12]:
# Load the saved pipeline object back from disk so it can predict on new data.
# NOTE(review): absolute, machine-specific path; it must match the location
# the model was saved to.
saved_model_path = "C:\\Users\\19144\\OneDrive\\Desktop\\Data Science Infinity\\Python Fundamentals\\model.joblib"
clf = joblib.load(saved_model_path)


In [13]:
# Build three unseen rows to score. Two values are deliberately missing
# (the second row's age and the third row's gender).
new_data = pd.DataFrame(
    {
        "age": [25, np.nan, 50],
        "gender": ["M", "F", np.nan],
        "credit_score": [200, 100, 500],
    }
)

In [14]:
# Score the new rows with the loaded pipeline; the result holds one predicted
# class per input row.
clf.predict(X=new_data)

array([1, 0, 0], dtype=int64)

In [None]:
#Therefore, the model predicts that a 25-year-old male with a credit score of 200 will make a purchase (class 1), while the other two rows are predicted not to purchase (class 0)