In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

In [3]:
df=pd.read_csv("data.csv")

In [4]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [6]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

## Column Transformer demo

In [25]:
x_train,x_test,y_train,y_test=train_test_split(df.drop("has_covid",axis=1),df["has_covid"],test_size=0.2,random_state=1)

In [26]:
trans=ColumnTransformer(transformers=[
    ("imputer",SimpleImputer(),["fever"]),
    ("encoding",OrdinalEncoder(),["cough"]),
    ("encoding2",OneHotEncoder(sparse_output=False,drop="first"),["gender","city"])
],remainder="passthrough")

In [27]:
x_train=pd.DataFrame(trans.fit_transform(x_train),columns=trans.get_feature_names_out())

In [28]:
x_train.head()

Unnamed: 0,imputer__fever,encoding__cough,encoding2__gender_Male,encoding2__city_Delhi,encoding2__city_Kolkata,encoding2__city_Mumbai,remainder__age
0,101.0,0.0,1.0,1.0,0.0,0.0,42.0
1,98.0,1.0,1.0,0.0,1.0,0.0,34.0
2,101.0,0.0,0.0,0.0,0.0,0.0,20.0
3,104.0,1.0,0.0,0.0,0.0,0.0,56.0
4,100.958333,1.0,0.0,0.0,0.0,0.0,42.0


In [21]:
x_test=pd.DataFrame(trans.transform(x_test),columns=trans.get_feature_names_out())

In [22]:
x_test.head()

Unnamed: 0,imputer__fever,encoding__cough,encoding2__gender_Male,encoding2__city_Delhi,encoding2__city_Kolkata,encoding2__city_Mumbai,remainder__age
0,99.0,0.0,0.0,0.0,0.0,1.0,14.0
1,98.0,1.0,0.0,0.0,0.0,1.0,69.0
2,98.0,0.0,0.0,0.0,1.0,0.0,26.0
3,99.0,0.0,1.0,1.0,0.0,0.0,65.0
4,100.0,0.0,1.0,0.0,1.0,0.0,27.0


In [None]:
#to make transformers provide output as pandas instead og numpy array use .set_output(transform="pandas")

## Pipeline 

In [50]:
#it chains together multiple steps so that output of each step is used as input to next step
#this makes it easy to apply same preprocessing steps on new test data
#we can take multiple transformers and at end estimator

In [3]:
df=pd.read_csv("titanic.csv")

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.drop(["PassengerId","Name","Ticket","Cabin"],axis=1,inplace=True)

In [54]:
x=df.drop("Survived",axis=1)
y=df["Survived"]

In [6]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)

In [15]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
301,3,male,,2,0,23.25,Q
309,1,female,30.0,0,0,56.9292,C
516,2,female,34.0,0,0,10.5,S
120,2,male,21.0,2,0,73.5,S
570,2,male,62.0,0,0,10.5,S


In [8]:
x_train["Parch"].unique()

array([0, 2, 1, 3, 6, 5, 4], dtype=int64)

In [57]:
x_train.isnull().sum()

Pclass        0
Sex           0
Age         144
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [33]:
#when making pipeline try to use column index that its name in column transformers here instead of ["Age"]-->[2]
t1=ColumnTransformer([
    ("impute_age",SimpleImputer(strategy="mean"),[2]),
    ("impute_emb",SimpleImputer(strategy="most_frequent"),[6])
],remainder="passthrough")

In [23]:
#u can use the pipeline without any model but use fit_transform
pipe=Pipeline([
    ("t1",t1)
])

In [22]:
pd.DataFrame(pipe.fit_transform(x_train),columns=pipe.get_feature_names_out())

Unnamed: 0,impute_age__Age,impute_emb__Embarked,remainder__Pclass,remainder__Sex,remainder__SibSp,remainder__Parch,remainder__Fare
0,30.166232,Q,3,male,2,0,23.25
1,30.0,C,1,female,0,0,56.9292
2,34.0,S,2,female,0,0,10.5
3,21.0,S,2,male,2,0,73.5
4,62.0,S,2,male,0,0,10.5
...,...,...,...,...,...,...,...
707,19.0,S,3,male,0,0,7.65
708,30.5,Q,3,female,0,0,7.75
709,21.0,S,2,male,0,0,73.5
710,30.166232,S,3,female,0,0,7.55


In [None]:
# as we can see order changes after transformation so it becomes difficult to know the column index

In [35]:
t1=ColumnTransformer([
    ("impute_age",SimpleImputer(strategy="mean"),[2]),
    ("impute_emb",SimpleImputer(strategy="most_frequent"),[6])
],remainder="passthrough")

In [36]:
t2=ColumnTransformer([("ohe",OneHotEncoder(sparse_output=False,handle_unknown="ignore"),[1,3])],remainder="passthrough")

In [41]:
t3=ColumnTransformer([("scaling",MinMaxScaler(),[5,7,8,9])],remainder="passthrough")

In [44]:
t4=SelectKBest(score_func=chi2,k=8)

In [45]:
t5=DecisionTreeClassifier()

In [46]:
pipe=Pipeline([
    ("t1",t1),
    ("t2",t2),
    ("t3",t3),
    ("t4",t4),
    ("t5",t5)
])

In [47]:
from sklearn import set_config
set_config(display="diagram")

In [48]:
pipe.fit(x_train,y_train)

In [49]:
pipe.score(x_test,y_test)

0.7821229050279329

In [50]:
y_pred=pipe.predict(x_test)

In [51]:
from sklearn.metrics import accuracy_score

In [52]:
accuracy_score(y_test,y_pred)

0.7821229050279329

In [None]:
#as we can see here both the .score and accuracy score are same
#score does find predicted internally and then calculates accuracy

## crossvalidations using pipeline

In [53]:
from sklearn.model_selection import cross_val_score

In [56]:
cross_val_score(pipe,x,y,scoring="accuracy",cv=10).mean()

0.7901622971285893

## hyperparameter tuning

In [None]:
#specify transformer name followed by the parameter.
#you can mix parameters of all different tranformers in single dictionary bcz anyhow we are giving its name also like t5

In [59]:
p={'t5__max_depth':[1,2,3,4,5,None]}

In [60]:
from sklearn.model_selection import GridSearchCV

In [61]:
gv=GridSearchCV(pipe,p,cv=5,scoring="accuracy")
gv.fit(x_train,y_train)

In [63]:
gv.best_params_

{'t5__max_depth': 3}

In [64]:
gv.best_score_

0.81185856397124