In [140]:
import logging
logging.basicConfig(filename="15AprInfo.log", level=logging.INFO, format="%(asctime)s %(name)s %(message)s")

# answer 1
We have to design a pipeline that includes the following steps- 
- Use an automated feature selection method to identify the important features in the dataset
- Create a numerical pipeline that includes the following steps"
- Impute the missing values in the numerical columns using the mean of the column values
- Scale the numerical columns using standardisation
- Create a categorical pipeline that includes the following steps"
- Impute the missing values in the categorical columns using the most frequent value of the column
- One-hot encode the categorical columns
- Combine the numerical and categorical pipelines using a ColumnTransformer
- Use a Random Forest Classifier to build the final model
- Evaluate the accuracy of the model on the test dataset

In [141]:
# importing necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [142]:
# loading dataset
df=sns.load_dataset('tips')
df.head()

# encoding target variable 
encoder=LabelEncoder()
df['time']=encoder.fit_transform(df['time'])

# dependent and independent features
y = df.pop('time')
x = df

# splitting into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=52)

# specifying categorical and numerical features
categorical_cols = ['sex', 'smoker','day']
numerical_cols = ['total_bill', 'tip','size']

# numerical Pipelines
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='mean')), 
        ('scaler',StandardScaler())
    ]

)

# categorical Pipeline
cat_pipeline=Pipeline(
                steps=[
                ('imputer',SimpleImputer(strategy='most_frequent')),
                ('onehotencoder',OneHotEncoder())
                ]

            )  

# combine both the pipelines
preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)

])

# passing data through transformer
x_train=preprocessor.fit_transform(x_train)
x_test=preprocessor.transform(x_test)

# train the model
model = RandomForestClassifier()
model.fit(x_train,y_train)
y_test_pred =model.predict(x_test)

# evaluate model accuracy
test_model_score = accuracy_score(y_test,y_test_pred)
print(test_model_score)

0.9591836734693877


In [143]:
# answer 2
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# load the dataset
iris = load_iris()
X = iris.data
y = iris.target

In [144]:
# split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [145]:
# make the voting classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
log_reg_clf = LogisticRegression(random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('estimators', [('rf', rf_clf), ('log_reg', log_reg_clf)])
])

In [146]:
voting_clf = VotingClassifier(estimators=pipeline.named_steps['estimators'], voting='hard')

voting_clf.fit(X_train, y_train)

In [147]:
# evaluate model accuracy
from sklearn.metrics import accuracy_score

y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 1.0


In [149]:
# misc for answer
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Split the dataset into features (X) and target (y)
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the feature selection pipeline
select_pipeline = Pipeline([
    ('selector', SelectFromModel(RandomForestClassifier(random_state=42)))
])

# Define the numerical pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define the categorical pipeline
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the numerical and categorical pipelines using a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, X.select_dtypes(include=['int64', 'float64']).columns),
    ('cat', cat_pipeline, X.select_dtypes(include=['object']).columns)
])

# Combine the feature selection, preprocessing, and classification pipelines
pipeline = Pipeline([
    ('select', select_pipeline),
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model on the test dataset
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

ValueError: ignored