# Q1. You are working on a machine learning project where you have a dataset containing numerical and categorical features. You have identified that some of the features are highly correlated and there are missing values in some of the columns. You want to build a pipeline that automates the feature engineering process and handles the missing values.

# Design a pipeline that includes the following steps: 
- Use an automated feature selection method to identify the important features in the dataset 
- Create a numerical pipeline that includes the following steps 
- Impute the missing values in the numerical columns using the mean of the column values 
- Scale the numerical columns using standardisation
- Create a categorical pipeline that includes the following steps 
- Impute the missing values in the categorical columns using the most frequent value of the column 
- One-hot encode the categorical columns 
- Combine the numerical and categorical pipelines using a ColumnTransformer 
- Use a Random Forest Classifier to build the final model 
- Evaluate the accuracy of the model on the test dataset. 

# Note: Your solution should include code snippets for each step of the pipeline, and a brief explanation of each step. You should also provide an interpretation of the results and suggest possible improvements for the pipeline.

# Import necessary libraries

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer ##handling missing values 
from sklearn.preprocessing import StandardScaler ## Feature scaling
from sklearn.preprocessing import OneHotEncoder ## categorical to numerical 
from sklearn.compose import ColumnTransformer

# Loading iris dataset

In [2]:
df = sns.load_dataset('titanic')

In [3]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# data Exploration

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [5]:
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [7]:
df = df.drop(['embarked','class','who','adult_male','alive','deck'],axis = 1)

In [8]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,3,male,22.0,1,0,7.2500,Southampton,False
1,1,1,female,38.0,1,0,71.2833,Cherbourg,False
2,1,3,female,26.0,0,0,7.9250,Southampton,True
3,1,1,female,35.0,1,0,53.1000,Southampton,False
4,0,3,male,35.0,0,0,8.0500,Southampton,True
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,Southampton,True
887,1,1,female,19.0,0,0,30.0000,Southampton,True
888,0,3,female,,1,2,23.4500,Southampton,False
889,1,1,male,26.0,0,0,30.0000,Cherbourg,True


## independent and dependent feature

In [9]:
## independent and dependent feature
X=df.drop(['survived'],axis=1)
y=df['survived']

In [10]:
# splitting the data in train data and test data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [11]:
categorical_cols = []
numerical_cols = []

for col in X.columns:
    if X[col].dtype == 'int64':
        numerical_cols.append(col)
    else:
        categorical_cols.append(col)


In [12]:
categorical_cols

['sex', 'age', 'fare', 'embark_town', 'alone']

In [13]:
numerical_cols

['pclass', 'sibsp', 'parch']

# feature engineering automation

In [14]:
## feature engineering automation 
##  numerical Pipeline 
num_pipeline = Pipeline(
steps = [
    ('imputer',SimpleImputer(strategy = 'mean')), ##Missing Values 
    ('scaler',StandardScaler()) ## feature Scaling
]
)
#categorical Pipeline
cat_pipeline=Pipeline(
                steps=[
                ('imputer',SimpleImputer(strategy='most_frequent')), ## handling Missing values
                ('onehotencoder',OneHotEncoder(handle_unknown='ignore')) ## Categorical features to numerical
                ]

            )


# combing the pipeline using column transformer

In [15]:
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

preprocessor

In [16]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
from sklearn.metrics import accuracy_score

In [19]:
clf = RandomForestClassifier()

In [20]:
clf.fit(X_train,y_train)

In [21]:
y_pred = clf.predict(X_test)

In [22]:
accuracy_score(y_test,y_pred)

0.8044692737430168

# Q2. Build a pipeline that includes a random forest classifier and a logistic regression classifier, and then use a voting classifier to combine their predictions. Train the pipeline on the iris dataset and evaluate its accuracy.

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


In [48]:
import seaborn as sns 
df = sns.load_dataset('iris')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [49]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [50]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [51]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['species'] = encoder.fit_transform(df['species'])

In [52]:
df.species.unique()

array([0, 1, 2])

In [53]:
## independent and dependent feature
X=df.drop('species',axis=1)
y=df['species']

In [54]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [55]:
# Define the classifiers
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)

# Define the pipeline with a StandardScaler (for logistic regression) and the classifiers
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ensemble', VotingClassifier(estimators=[
        ('rf', rf_classifier),
        ('lr', lr_classifier)
    ], voting='soft'))  # 'soft' for probability voting
])


In [56]:
# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

In [57]:
# Make predictions on the test set
predictions = pipeline.predict(X_test)

In [58]:
# Evaluate the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 1.0000
