In [66]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

## Q1

In [51]:
data = sns.load_dataset('tips')
X = data.drop(labels=['time'], axis=1)
y = data.time
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0,
                                                 shuffle=True)

In [52]:
num_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
cat_cols = [col for col in X.columns if X[col].dtype == 'category']

In [53]:
cat_cols

['sex', 'smoker', 'day']

In [54]:
num_cols

['total_bill', 'tip', 'size']

In [55]:
# make pipelines

In [56]:
num_pipeline = Pipeline(steps=[('imputer',SimpleImputer(strategy='mean')),
                               ('scaler', StandardScaler()) 
                              ]
                       )

cat_pipeline = Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent')),
                               ('encoder',OneHotEncoder(handle_unknown='ignore'))
                              ]
                       )

In [57]:
preprocessor = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
        ])

In [58]:
classifier_pipeline = Pipeline(steps=[
('preprocess', preprocessor),
('classifier',RandomForestClassifier())
                                     ])

In [59]:
# fit the pipeline on training data

classifier_pipeline.fit(x_train,y_train)

In [69]:
accuracy=classifier_pipeline.score(x_test,y_test)
accuracy

0.8775510204081632

In [71]:
# feature selection
rf = RandomForestClassifier(random_state=42)
selector = SelectFromModel(estimator=rf)

feature_selection_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selector', selector)
                                            ]
                                     )

In [87]:
feature_selection_pipeline.fit(x_train,y_train)

In [104]:
important_features = feature_selection_pipeline.get_feature_names_out()
important_features

array(['cat__day_Sat', 'cat__day_Sun', 'cat__day_Thur'], dtype=object)

## Q2

In [105]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler

In [107]:
data = sns.load_dataset('iris')
X = data.drop(labels=['species'], axis=1)
y = data.species
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0,
                                                 shuffle=True)

# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', VotingClassifier(estimators=[
        ('rf', RandomForestClassifier()),
        ('lr', LogisticRegression())
    ]))
])

# Define hyperparameter grid for grid search
param_grid = {
    'clf__rf__n_estimators': [100, 200, 300],
    'clf__rf__max_depth': [5, 10, 15],
    'clf__lr__C': [0.1, 1, 10]
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

# Print best parameters and accuracy score
print("Best parameters:", grid_search.best_params_)
print("Accuracy:", grid_search.best_score_)


Best parameters: {'clf__lr__C': 1, 'clf__rf__max_depth': 5, 'clf__rf__n_estimators': 100}
Accuracy: 0.9416666666666667
