## Q1.

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

df = pd.read_csv('dataset.csv')
# X is the feature matrix, y is the target variable
X = df.drop("target", axis=1)
y = df["target"]

# Step 1: Automated feature selection
feature_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
X_selected = feature_selector.fit_transform(X, y)

# Step 2: Numerical pipeline
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Step 3: Categorical pipeline
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Step 4: Combine numerical and categorical pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, X.columns[X.dtypes != 'object']),
        ('cat', categorical_pipeline, X.columns[X.dtypes == 'object'])
    ])

# Step 5: Final pipeline with RandomForestClassifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train the model
pipeline.fit(X_train, y_train)

# Step 7: Evaluate the accuracy on the test dataset
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on the test dataset: {accuracy}')

Accuracy on the test dataset: 0.8360655737704918


## Interpretation and suggestions for improvements:
- Feature selection is important for model interpretability and reducing overfitting. You can experiment with different feature selection methods and hyperparameters.
- Adjust the hyperparameters of the RandomForestClassifier for better performance.
- Evaluate other imputation strategies and scaling methods.
- Consider hyperparameter tuning for the entire pipeline using techniques like GridSearchCV.
- Monitor other metrics (precision, recall, F1-score) for a more comprehensive evaluation.
- Experiment with different models and ensemble methods for comparison.
- Consider handling imbalanced classes if applicable to your dataset.

## Q2.

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='target')

# Step 1: Separate numerical and categorical features
numerical_features = X.columns
categorical_features = []

# Step 2: Numerical pipeline
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Step 3: Categorical pipeline (not applicable for Iris dataset)

# Step 4: Combine numerical and categorical pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', 'passthrough', categorical_features)
    ])

# Step 5: Build the ensemble pipeline
ensemble_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('lr', LogisticRegression(random_state=42))
    ], voting='hard'))
])

# Step 6: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train the pipeline
ensemble_pipeline.fit(X_train, y_train)

# Step 8: Evaluate the accuracy on the test dataset
y_pred = ensemble_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on the test dataset: {accuracy}')

Accuracy on the test dataset: 1.0
