In [115]:
import numpy as np
import pandas as pd

In [147]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [149]:
df = pd.read_csv('train.csv')

In [151]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [153]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [157]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

In [159]:

X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [161]:
y_train.sample(5)

81     1
710    1
775    0
427    1
57     0
Name: Survived, dtype: int64

In [163]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [167]:
# one hot encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [169]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

In [171]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=8)

In [173]:
# train the model
trf5 = DecisionTreeClassifier()

In [175]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

In [177]:
# train
pipe.fit(X_train,y_train)

# Practice

In [281]:
df = pd.read_csv('covid_toy.csv')

In [283]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [285]:
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [287]:
df.shape

(100, 6)

In [289]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [291]:
X = df.drop(columns=['has_covid'])

In [293]:
Y = df['has_covid']

In [295]:
le = LabelEncoder()
Y = le.fit_transform(Y)

In [297]:
Y = pd.DataFrame(Y, columns=['has_covid'])

In [299]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

X_train.shape, X_test.shape

((80, 5), (20, 5))

In [317]:
y_train

Unnamed: 0,has_covid
55,1
88,0
26,1
42,1
69,0
...,...
60,1
71,0
14,0
92,0


In [301]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier

In [303]:
# Missing Value imputation

trf1 = ColumnTransformer([
    ('impute_fever', SimpleImputer(), [2])
], remainder='passthrough')

In [305]:
# Applying OneHotEncoder on gender and city columns

trf2 = ColumnTransformer([
    ('gender_encoded', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1,4]),
    ('cough_encoded', OrdinalEncoder(categories=['Mild','Strong']), [3])
], remainder='passthrough')

In [307]:
# Scaling
trf3 = ColumnTransformer([
    ('scale', StandardScaler(), slice(0,6))
], remainder='passthrough')

In [309]:
trf4 = DecisionTreeClassifier()

In [311]:
pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4)
])

In [313]:
pipe.fit(X_train, y_train)

ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).