## In this Notebook We will use Column Transformer

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer

In [2]:
data=pd.read_csv('covid_toy.csv')

In [4]:
data.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [6]:
data.describe()

Unnamed: 0,age,fever
count,100.0,90.0
mean,44.22,100.844444
std,24.878931,2.054926
min,5.0,98.0
25%,20.0,99.0
50%,45.0,101.0
75%,66.5,102.75
max,84.0,104.0


In [11]:
data.drop_duplicates(inplace=True)

In [13]:
data.sample(9)

Unnamed: 0,age,gender,fever,cough,city,has_covid
94,79,Male,,Strong,Kolkata,Yes
66,51,Male,104.0,Mild,Kolkata,No
29,34,Female,,Strong,Mumbai,Yes
68,54,Female,104.0,Strong,Kolkata,No
58,23,Male,98.0,Strong,Mumbai,Yes
50,19,Male,101.0,Mild,Delhi,Yes
12,25,Female,99.0,Strong,Kolkata,No
85,16,Female,103.0,Mild,Bangalore,Yes
69,73,Female,103.0,Mild,Delhi,No


In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.model_selection import train_test_split

In [16]:
x_train,x_test,y_train,y_test=train_test_split(data.drop('has_covid',axis=1),data['has_covid'],test_size=0.3,random_state=42)


In [25]:
x_train.cough.unique()

array(['Mild', 'Strong'], dtype=object)

In [33]:
labelencode=LabelEncoder()
y_train_encoded=labelencode.fit_transform(y_train)
y_test_encoded=labelencode.transform(y_test)

## We are going to use simpleImputer,Ordinal encoder,One hot Encoder and Label Encoder


In [34]:
transformer=ColumnTransformer([
    ('trf1',SimpleImputer(),['fever']),
    ('trf2',OneHotEncoder(sparse_output=False,drop='first',handle_unknown='ignore'),['gender','city']),
    ('trf3',OrdinalEncoder(categories=[['Mild','Strong']]),['cough'])
]
,remainder='passthrough')

In [38]:
x_train_transform=transformer.fit_transform(x_train)

In [41]:
x_train_transform.shape

(69, 7)

In [39]:
x_test_transform=transformer.transform(x_test)

In [42]:
x_test_transform.shape

(30, 7)

In [37]:
from sklearn.tree import DecisionTreeClassifier

In [44]:
clf=DecisionTreeClassifier()
clf.fit(x_train_transform,y_train_encoded)

In [45]:
y_pred=clf.predict(x_test_transform)

In [47]:
from sklearn.metrics import accuracy_score

In [48]:
accuracy_score(y_test_encoded,y_pred)

0.4

In [49]:
from sklearn.model_selection import cross_val_score

In [53]:
cross_val_score(clf,x_train_transform,y_train_encoded,cv=5,scoring='accuracy')

array([0.64285714, 0.28571429, 0.5       , 0.64285714, 0.69230769])