In [81]:
import numpy as np
import pandas as pd


In [82]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [83]:
# Load the toy COVID dataset from the working directory into a DataFrame.
df = pd.read_csv("covid_toy.csv")

In [84]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [85]:
# Count the missing entries in the age column.
df["age"].isna().sum()

0

In [86]:
# Dummy-encode every categorical column of df, dropping the first level of
# each. Note that this also encodes the has_covid target column.
new_Data = pd.get_dummies(data=df, drop_first=True)

In [87]:
# Display the dummy-encoded frame built by pd.get_dummies.
new_Data

Unnamed: 0,age,fever,gender_Male,cough_Strong,city_Delhi,city_Kolkata,city_Mumbai,has_covid_Yes
0,60,103.0,1,0,0,1,0,0
1,27,100.0,1,0,1,0,0,1
2,42,101.0,1,0,1,0,0,0
3,31,98.0,0,0,0,1,0,0
4,65,101.0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...
95,12,104.0,0,0,0,0,0,0
96,51,101.0,0,1,0,1,0,1
97,20,101.0,0,0,0,0,0,0
98,5,98.0,0,1,0,0,1,0


# train test split

In [88]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; every column except has_covid is a
# feature and has_covid is the target.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same downstream outputs) on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [89]:
# Inspect the training features returned by train_test_split.
x_train

Unnamed: 0,age,gender,fever,cough,city
51,11,Female,100.0,Strong,Kolkata
52,47,Female,100.0,Strong,Bangalore
6,14,Male,101.0,Strong,Bangalore
42,27,Male,100.0,Mild,Delhi
61,81,Female,98.0,Strong,Mumbai
...,...,...,...,...,...
81,65,Male,99.0,Mild,Delhi
7,20,Female,,Strong,Mumbai
19,42,Female,,Strong,Bangalore
14,51,Male,104.0,Mild,Bangalore


In [90]:
# Inspect the training labels (the has_covid column of the training rows).
y_train

51    Yes
52    Yes
6      No
42    Yes
61     No
     ... 
81     No
7     Yes
19    Yes
14     No
68     No
Name: has_covid, Length: 80, dtype: object

# COLUMN TRANSFORMER USING SKLEARN CLASS

In [94]:
from sklearn.compose import ColumnTransformer

In [99]:
# Disabled first draft of the ColumnTransformer, kept only as a string literal:
# evaluating this cell just echoes the text, no transformer is built.
# NOTE(review): this draft differs from the working cell that follows in two
# ways: it passes the SimpleImputer class itself rather than an instance, and
# its OrdinalEncoder categories are lowercase ('mild', 'strong') while the
# cough values shown in the data are capitalised ('Mild', 'Strong').
'''transformer=ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer,['fever']),
    ('tnf2',OrdinalEncoder(categories=[['mild','strong']]),['cough']),
    ('tnf3',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
],remainder='passthrough')
'''

"transformer=ColumnTransformer(transformers=[\n    ('tnf1',SimpleImputer,['fever']),\n    ('tnf2',OrdinalEncoder(categories=[['mild','strong']]),['cough']),\n    ('tnf3',OneHotEncoder(sparse=False,drop='first'),['gender','city'])\n],remainder='passthrough')\n"

In [100]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Instantiate the transformers
# Fill missing fever readings with the most frequently observed value.
imputer = SimpleImputer(strategy='most_frequent')
# Encode cough severity with an explicit order: Mild -> 0, Strong -> 1.
ordinal_encoder = OrdinalEncoder(categories=[['Mild', 'Strong']])
# Dense one-hot output, dropping the first level of each feature.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, drop='first')

# Create the ColumnTransformer
# Output columns follow the order below: imputed fever, encoded cough,
# one-hot gender/city, then the remaining columns passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('tnf1', imputer, ['fever']),
        ('tnf2', ordinal_encoder, ['cough']),
        ('tnf3', one_hot_encoder, ['gender', 'city'])
    ],
    remainder='passthrough'
)


In [101]:
# Fit the column transformer on the training features and transform them in
# a single step; the result is the encoded training matrix.
transform_data = transformer.fit_transform(x_train)

In [102]:
# Apply the transformer that was already fitted on x_train. Calling
# fit_transform here would re-learn the imputation value and the category
# levels from the test split, leaking test information and overwriting the
# state fitted on the training data.
transformer.transform(x_test).shape

(20, 7)

In [103]:
transform_data

array([[100.,   1.,   0.,   0.,   1.,   0.,  11.],
       [100.,   1.,   0.,   0.,   0.,   0.,  47.],
       [101.,   1.,   1.,   0.,   0.,   0.,  14.],
       [100.,   0.,   1.,   1.,   0.,   0.,  27.],
       [ 98.,   1.,   0.,   0.,   0.,   1.,  81.],
       [ 98.,   1.,   0.,   0.,   0.,   1.,   5.],
       [100.,   1.,   0.,   0.,   0.,   0.,  19.],
       [103.,   0.,   1.,   0.,   1.,   0.,  83.],
       [ 98.,   0.,   0.,   0.,   1.,   0.,  31.],
       [101.,   0.,   1.,   0.,   1.,   0.,  82.],
       [102.,   0.,   0.,   0.,   0.,   0.,  69.],
       [101.,   0.,   0.,   0.,   0.,   1.,  65.],
       [101.,   0.,   1.,   0.,   0.,   1.,  23.],
       [100.,   0.,   1.,   0.,   1.,   0.,  27.],
       [ 98.,   1.,   1.,   0.,   0.,   0.,  12.],
       [104.,   0.,   1.,   0.,   0.,   1.,  44.],
       [ 99.,   1.,   0.,   1.,   0.,   0.,  59.],
       [ 99.,   0.,   0.,   0.,   0.,   0.,  22.],
       [101.,   0.,   1.,   1.,   0.,   0.,  15.],
       [102.,   0.,   0.,   1.,

In [104]:
# Wrap the transformed training matrix in a DataFrame for tabular display.
# NOTE(review): this rebinds `df`, so the raw frame loaded from the CSV is no
# longer reachable under that name; re-running earlier cells that read `df`
# (e.g. the train/test split) will fail until the CSV is reloaded.
df = pd.DataFrame(data=transform_data)

In [105]:
# Display the DataFrame that wraps the transformed training matrix.
df

Unnamed: 0,0,1,2,3,4,5,6
0,100.0,1.0,0.0,0.0,1.0,0.0,11.0
1,100.0,1.0,0.0,0.0,0.0,0.0,47.0
2,101.0,1.0,1.0,0.0,0.0,0.0,14.0
3,100.0,0.0,1.0,1.0,0.0,0.0,27.0
4,98.0,1.0,0.0,0.0,0.0,1.0,81.0
...,...,...,...,...,...,...,...
75,99.0,0.0,1.0,1.0,0.0,0.0,65.0
76,101.0,1.0,0.0,0.0,0.0,1.0,20.0
77,101.0,1.0,0.0,0.0,0.0,0.0,42.0
78,104.0,0.0,1.0,0.0,0.0,0.0,51.0
