In [1]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

In [5]:
# build a small 2x2 integer matrix to experiment with
X = np.asarray([[1, 2], [3, 4]])
print(X)

# wrap NumPy's log1p, i.e. log(1 + x), in a scikit-learn transformer
log_transform = FunctionTransformer(func=np.log1p)

# run the wrapped function on X; only transform() is called, fit() never is
X_transformed = log_transform.transform(X)

print(X_transformed)

[[1 2]
 [3 4]]
[[0.69314718 1.09861229]
 [1.38629436 1.60943791]]


In [8]:
# sample input: the integers 1..4 laid out as a 2x2 array
X = np.arange(1, 5).reshape(2, 2)
print(X)

# define a custom feature engineering function 
def squ(X):
    """Append the element-wise squares of ``X`` to ``X`` itself.

    The squared values are joined after the original ones with
    ``np.hstack``, so a 2-D array of shape (n, m) comes back as (n, 2 * m).
    """
    squared = X ** 2
    return np.hstack([X, squared])

# wrap squ in a FunctionTransformer so it is used through the transformer API
custom_transformer = FunctionTransformer(func=squ)

# run the wrapped function on X; only transform() is called, fit() never is
X_transformed = custom_transformer.transform(X)

print(X_transformed)

[[1 2]
 [3 4]]
[[ 1  2  1  4]
 [ 3  4  9 16]]


In [10]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# 2x2 integer array used as the input data
X = np.array([(1, 2), (3, 4)])

# define a custom scaling function
def my_scaling(X):
    """Divide ``X`` by its maximum value.

    ``np.max`` is called without an axis; for a NumPy array that is a single
    maximum over all elements, so the largest entry maps to 1.

    NOTE(review): a maximum of 0 makes this a division by zero — confirm the
    inputs always contain a positive value.
    """
    peak = np.max(X)
    return X / peak

# wrap my_scaling in a FunctionTransformer
custom_transformer = FunctionTransformer(func=my_scaling)

# apply the transformer to the input data
X_transformed = custom_transformer.transform(X)

# show the scaled values
print(X_transformed)



[[0.25 0.5 ]
 [0.75 1.  ]]


# Pipeline ---> a container that holds the processing steps and runs them sequentially.

# ColumnTransformer ---> the output of step 1 is NOT the input of step 2.
# Pipeline ---> the output of step 1 IS the input of step 2.

# Data ---> X, y ---> categorical data [] ---> encoding ---> normal distribution; numerical data [] ---> normal distribution


In [12]:
import pandas as pd

In [13]:
# load the CSV (relative path, resolved against the current working directory)
df = pd.read_csv(filepath_or_buffer="covid_toy.csv")
# preview the first two rows
df.head(n=2)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes


In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [16]:
# target is the has_covid column; features are every other column of df
y = df['has_covid']
X = df.drop(labels=['has_covid'], axis=1)

# hold out 20% of the rows for testing; random_state=42 keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [18]:
# Column names for each preprocessing branch, grouped by data type
numeric_features = ['age', 'fever']
categorical_features = ['gender', 'city']
# NOTE(review): the previewed data also has a 'cough' column that appears in
# neither list — confirm that leaving it out of preprocessing is intended.



In [19]:
# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' avoids an error on unseen categories.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Send each column group through its own branch ('num' first, then 'cat').
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a default LogisticRegression.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Fit the whole pipeline on the training split.
clf.fit(X_train, y_train)

# Predict labels for the test split (no metric is computed in this cell).
y_pred = clf.predict(X_test)


In [23]:
from sklearn.metrics import accuracy_score

In [24]:
# fraction of test labels that the pipeline predicted correctly
accuracy_score(y_true=y_test, y_pred=y_pred)

0.65