In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the toy COVID dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# fails on any other machine -- consider a path relative to the notebook.
df = pd.read_csv("C:\\Users\\91636\\OneDrive\\Desktop\\Regex ML\\Data\\covid_toy.csv")

In [4]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
# Count the missing values in each column.
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [6]:
from sklearn.impute import SimpleImputer # for missing values imputation(filling) 
from sklearn.preprocessing import OneHotEncoder # Category ==> Sub-category ==> column creation 
from sklearn.preprocessing import OrdinalEncoder # Sub-Category ==>Convert into  Numbers 

In [7]:
# Target: the `has_covid` label; features: every other column.
y = df['has_covid']
x = df.drop('has_covid', axis=1)

In [8]:
from sklearn.model_selection import train_test_split 

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Manual approach ==> transform each column by hand

In [9]:
# Step-1 ==> Impute missing values ==> Fever
# SimpleImputer's default strategy replaces each missing value with the column mean.

si = SimpleImputer()
# Learn the fill value from the training split only ...
x_train_fever = si.fit_transform(x_train[['fever']])
# ... and reuse that same value for the test split. Calling fit_transform here
# would re-fit the imputer on test data (leakage) and fill train and test with
# different values.
x_test_fever = si.transform(x_test[['fever']])

In [10]:
# Inspect the training features (pandas truncates the display of long frames).
x_train

Unnamed: 0,age,gender,fever,cough,city
55,81,Female,101.0,Mild,Mumbai
88,5,Female,100.0,Mild,Kolkata
26,19,Female,100.0,Mild,Kolkata
42,27,Male,100.0,Mild,Delhi
69,73,Female,103.0,Mild,Delhi
...,...,...,...,...,...
60,24,Female,102.0,Strong,Bangalore
71,75,Female,104.0,Strong,Delhi
14,51,Male,104.0,Mild,Bangalore
92,82,Female,102.0,Strong,Kolkata


In [11]:
# Sanity check: the imputed fever values form a single column, one row per training sample.
x_train_fever.shape

(80, 1)

# Step-2 ==> Ordinal Encoding ==> Cough

In [12]:
# Encode cough severity as an ordered number, in the order listed: 'Mild' first, then 'Strong'.
cough_levels = ['Mild', 'Strong']
oe = OrdinalEncoder(categories=[cough_levels])
x_train_cough = oe.fit_transform(x_train[['cough']])

In [14]:
# Sanity check: the encoded cough values form a single column.
x_train_cough.shape

(80, 1)

# Step-3 ==> OneHotEncoding ==> Gender, City

In [15]:
# One-hot encode the nominal columns; drop='first' removes one redundant
# indicator per feature, and a dense array is requested so the result can be
# concatenated with the other NumPy arrays.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the current keyword first and fall back to the old one.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only knows `sparse`
    ohe = OneHotEncoder(drop='first', sparse=False)
x_train_gender_city = ohe.fit_transform(x_train[['gender', 'city']])

In [16]:
# Sanity check: with drop='first', each encoded feature yields (number of categories - 1) columns.
x_train_gender_city.shape

(80, 4)

# Step-4 ==> Extracting Age

In [17]:
# Age needs no preprocessing. Select it explicitly (rather than by dropping every
# other column) so this step still yields only age if the feature set changes.
# The double brackets keep the result 2-D for the concatenation step.
x_train_age = x_train[['age']].values

In [18]:
# Sanity check: age is kept as a single 2-D column.
x_train_age.shape

(80, 1)

# Step-5 ==> Concatenate all the Data

In [19]:
# Stack the per-column results side by side into one feature matrix.
# Column order: age, fever, one-hot gender/city, cough.
x_train_transformed = np.hstack([
    x_train_age,
    x_train_fever,
    x_train_gender_city,
    x_train_cough,
])

In [20]:
# Sanity check: the column count is the sum of the pieces concatenated above.
x_train_transformed.shape

(80, 7)

# We can do this in one cell using Column Transformer 

In [21]:
from sklearn.compose import ColumnTransformer 

In [22]:
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so build the dense one-hot encoder in a way that works on both sides of the rename.
try:
    dense_ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2 only knows `sparse`
    dense_ohe = OneHotEncoder(sparse=False, drop='first')

# Each entry is (name, transformer, columns the transformer is applied to).
transformer = ColumnTransformer(transformers=[
    ('a', SimpleImputer(), ['fever']),                                  # fill missing fever values
    ('b', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),  # ordered cough severity
    ('c', dense_ohe, ['gender', 'city']),                               # one-hot, first category dropped
], remainder='passthrough')  # columns not listed above are passed through unchanged

In [23]:
# Fit every step on the training split and transform it in one call, then check the resulting shape.
transformer.fit_transform(x_train).shape

(80, 7)