In [48]:
import pandas as pd
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression

## Question 1

In [3]:
# Load the iris dataset bundled with scikit-learn (see the sklearn.datasets import).
data = load_iris()

In [5]:
# List the fields available on the loaded dataset object. Uses the public
# mapping API rather than calling the __dir__ dunder directly; the result is
# the same dict_keys view.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [24]:
# Feature matrix with named columns, and the class labels as a one-column frame.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
Y = pd.DataFrame(data=data.target)

In [38]:
# Hold out 15% of the rows for testing; the fixed seed makes the shuffled
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    shuffle=True,
    random_state=29873,
)

In [None]:
# Standardise the features, project them onto two principal components,
# then classify with logistic regression.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("decomposition", PCA(n_components=2)),
        ("estimator", LogisticRegression()),
    ]
)

In [None]:
# y_train is a single-column frame, i.e. shape (n_samples, 1). scikit-learn
# classifiers expect a 1-D target and emit a DataConversionWarning for a
# column vector, so flatten it before fitting. Works for a Series as well.
pipeline.fit(x_train, y_train.to_numpy().ravel())

In [46]:
# Predict a class label for each held-out test row using the fitted pipeline.
pipeline.predict(x_test)

array([1, 0, 1, 1, 2, 2, 0, 1, 1, 0, 1, 2, 0, 1, 2, 1, 1, 2, 0, 1, 2, 0,
       0])

## Question 2

In [76]:
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived columns to a DataFrame.

    Every derived column is described by a dict with three keys:

    * ``"new_feature_name"`` (str): name of the column to add.
    * ``"columns"`` (list[str] | str): input column(s) selected from the
      frame and handed to ``calc_func``.
    * ``"calc_func"`` (Callable): receives ``df[columns]`` and returns the
      values assigned to the new column.

    Specs are applied in order on a working copy of the input, so a later
    spec can read a column created by an earlier one.
    """

    def __init__(self, feature_to_create: list[dict]) -> None:
        """Store the feature specifications.

        Args:
            feature_to_create (list[dict]): one dict per derived column, each
                holding 'new_feature_name: str', 'columns: list[str] | str'
                and 'calc_func: Callable'.
        """
        # Kept exactly as passed; the specs are only inspected in transform().
        self.feature_to_create = feature_to_create

    def fit(self, X, Y=None) -> "FeatureEngineeringTransformer":
        """No-op fit: nothing is learned from the data.

        Args:
            X: training data (unused).
            Y: targets (unused).

        Returns:
            FeatureEngineeringTransformer: this instance, so calls can be
            chained.
        """
        return self

    def transform(self, X) -> pd.DataFrame:
        """Return a copy of ``X`` with every configured feature appended.

        Args:
            X: input data; anything ``pd.DataFrame`` accepts. The data is
                copied before any column is added.

        Returns:
            pd.DataFrame: the copied input plus one column per spec. A spec
            whose name matches an existing column overwrites it in the copy.

        Raises:
            KeyError: if a spec lacks one of the required keys (the message
                names the offending entry), or if ``columns`` refers to a
                column that is not present in the frame.
        """
        df = pd.DataFrame(X).copy()

        for position, feature in enumerate(self.feature_to_create):
            try:
                new_column = feature["new_feature_name"]
                columns = feature["columns"]
                calc_func = feature["calc_func"]
            except KeyError as exc:
                # Same exception type as before, but say which entry is broken.
                raise KeyError(
                    f"feature_to_create[{position}] is missing required key {exc}"
                ) from exc
            # calc_func only sees the requested column(s), not the whole frame.
            df[new_column] = calc_func(df[columns])
        return df

In [77]:
# Preview the first rows; the column names shown here are the ones the
# feature specifications refer to.
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [78]:
# Specifications for the derived features: each entry names the new column,
# the input columns handed to the function, and the function that combines them.
feature_to_create = [
    # Sepal length plus sepal width.
    dict(
        new_feature_name="kindi",
        columns=["sepal length (cm)", "sepal width (cm)"],
        calc_func=lambda frame: frame["sepal length (cm)"] + frame["sepal width (cm)"],
    ),
    # Petal length plus petal width.
    dict(
        new_feature_name="kalam",
        columns=["petal length (cm)", "petal width (cm)"],
        calc_func=lambda frame: frame["petal length (cm)"] + frame["petal width (cm)"],
    ),
]

In [84]:
# Single-step pipeline wrapping the custom feature-engineering transformer.
question2_pipeline = Pipeline(
    [
        (
            "feature engineering",
            FeatureEngineeringTransformer(feature_to_create=feature_to_create),
        ),
    ]
)

In [85]:
# Run the transformer on the training split. y_train is passed along, but
# FeatureEngineeringTransformer.fit does not use its target argument.
transformed_df = question2_pipeline.fit_transform(x_train, y_train)

In [86]:
# FeatureEngineeringTransformer.transform already returns a DataFrame, so
# display it directly instead of wrapping it in another DataFrame.
transformed_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),kindi,kalam
113,5.7,2.5,5.0,2.0,8.2,7.0
116,6.5,3.0,5.5,1.8,9.5,7.3
83,6.0,2.7,5.1,1.6,8.7,6.7
53,5.5,2.3,4.0,1.3,7.8,5.3
119,6.0,2.2,5.0,1.5,8.2,6.5
...,...,...,...,...,...,...
149,5.9,3.0,5.1,1.8,8.9,6.9
44,5.1,3.8,1.9,0.4,8.9,2.3
65,6.7,3.1,4.4,1.4,9.8,5.8
33,5.5,4.2,1.4,0.2,9.7,1.6
