# Pipeline


In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.datasets import fetch_california_housing

# Load the California Housing dataset.
housing = fetch_california_housing(as_frame = True)

# Define the numeric features and categorical features.
numeric_features = housing.feature_names[:2]
categorical_features = housing.feature_names[2:]

# Define the preprocessing pipelines for the numeric features and the categorical features.
numeric_transformer = Pipeline(steps = [('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps = [('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

# Use ColumnTransformer to combine the numeric and categorical transformers.
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the pipeline with the preprocessor and the LinearRegression model
pipeline = Pipeline(steps = [('preprocessor', preprocessor),
                            ('regressor', LinearRegression())])

# Convert the data and target to pandas DataFrame.
x = pd.DataFrame(housing.data, columns = housing.feature_names)
y = pd.Series(housing.target)

# Fit the pipeline to the data.
pipeline.fit(x,y)

# Predict on new data.
x_new = x.iloc[:10]
y_pred = pipeline.predict(x_new)
print(y_pred)

[4.52601218 3.58499971 3.52099601 3.41299871 3.42199676 2.6970015
 2.99199919 2.41399946 2.26699906 2.61098686]


In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv("covid_toy.csv")

In [4]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
x = df.drop(columns = ['has_covid'], axis = 1)
y = df['has_covid']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state=42)

In [7]:
# Define the columns that need to be preprocessed
categorical_features = ['gender', 'city']
numeric_features = ['age', 'fever']

In [8]:
#Create transformers
numeric_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

categorical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

#combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num',numeric_transformer,numeric_features),
        ('cat',categorical_transformer,categorical_features)
    ])

# Create the pipeline
clf = Pipeline(steps = 
              [('preprocessor', preprocessor),
              ('classifier', LogisticRegression())])

# Train the model
clf.fit(x_train, y_train)

# Evaluate the model
y_pred = clf.predict(x_test)

In [9]:
acc = accuracy_score(y_test, y_pred)
print(acc)

0.65


In [10]:
import numpy as np
import pandas as pd

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [12]:
df = pd.read_csv("Social_Network_Ads.csv")

In [13]:
df = df.drop(columns = ['User ID', 'Gender'])


In [14]:
x = df.drop(columns = ['Purchased'], axis = 1)
y = df['Purchased']

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components = 2)),
    ('classifier', RandomForestClassifier(n_estimators = 100, random_state = 42))
])

In [16]:
pipe

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=2)),
                ('classifier', RandomForestClassifier(random_state=42))])

In [17]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=2)),
                ('classifier', RandomForestClassifier(random_state=42))])

In [18]:
y_pred = pipe.predict(x_test)

In [19]:
acc = accuracy_score(y_test, y_pred)
print(acc)

0.8875


In [20]:
df = pd.read_csv("tips.csv")

In [21]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [23]:
x = df.drop(columns = ['smoker'], axis = 1)
y = df['smoker']

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [25]:
categorical_features = ['sex','day','time']
numeric_features = ['total_bill','tip','size']

In [26]:
# Create Transformer
numeric_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# create the Pipeline
clf = Pipeline(steps = 
              [('preprocessor', preprocessor),
              ('classifier', LogisticRegression())])

# Train the model
clf.fit(x_train, y_train)

# Evaluate the model
y_pred = clf.predict(x_test)

In [27]:
accuracy_score(y_test, y_pred)

0.7346938775510204