<a href="https://colab.research.google.com/github/anilans029/FE_tips/blob/main/coloumn_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [None]:
# Load the toy COVID dataset directly from its public GitHub location and
# preview the first rows.
DATA_URL = "https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day28-column-transformer/covid_toy.csv"
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(100, 6)

In [None]:
# How many rows fall into each cough severity level.
df.cough.value_counts()

Mild      62
Strong    38
Name: cough, dtype: int64

In [None]:
# Number of missing values in every column.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [None]:
# Per-column dtype and non-null count summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [None]:
## age ==> no transformation (passed through as-is)
## gender ==> one-hot encoding
## fever ==> SimpleImputer (fill the missing values)
## cough ==> ordinal encoding
## city ==> one-hot encoding
## has_covid (target column) ==> label encoding

In [None]:
# Split the frame positionally: the first five columns are the features and
# the sixth column (has_covid) is the target.
x, y = df.iloc[:, :5], df.iloc[:, 5]
for part in (x, y):
    print(part.shape)

(100, 5)
(100,)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print(x_train.shape, y_train.shape)

(80, 5) (80,)


In [None]:
# Preview the first five training rows (still raw, untransformed features).
x_train.head(n=5)

Unnamed: 0,age,gender,fever,cough,city
55,81,Female,101.0,Mild,Mumbai
88,5,Female,100.0,Mild,Kolkata
26,19,Female,100.0,Mild,Kolkata
42,27,Male,100.0,Mild,Delhi
69,73,Female,103.0,Mild,Delhi


# **Manually transforming each column**

In [None]:
## fever ==> SimpleImputer to fill the missing values

# Learn the fill value from the training rows only, then apply that same
# fitted imputer to both splits.
si = SimpleImputer()
si.fit(x_train[["fever"]])
x_train_fever = si.transform(x_train[["fever"]])
x_test_fever = si.transform(x_test[["fever"]])
print(x_train_fever.shape)

(80, 1)


In [None]:
## cough ==> ordinal encoding

# The category order is given explicitly so the encoding follows severity:
# "Mild" is encoded before "Strong".
oe = OrdinalEncoder(categories=[["Mild", "Strong"]])
x_train_cough = oe.fit_transform(x_train[["cough"]])

# Only transform the test split: the encoder must be fitted on training data
# alone, exactly like the imputer. (Previously this called fit_transform,
# which re-fitted the encoder on the test rows.)
x_test_cough = oe.transform(x_test[["cough"]])

In [None]:
## gender, city ==> one-hot encoding

# drop="first" omits one indicator column per feature to avoid redundant
# (perfectly collinear) columns; sparse=False returns a dense ndarray.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` — confirm against the installed version.
ohe = OneHotEncoder(drop="first", sparse=False)

x_train_gender_city = ohe.fit_transform(x_train[["gender", "city"]])

# Reuse the categories learned from the training split. Re-fitting on the test
# split (the previous fit_transform call) lets the test rows decide which
# categories exist and which one is dropped, so the test columns could
# silently stop lining up with the training columns.
x_test_gender_city = ohe.transform(x_test[["gender", "city"]])


In [None]:
# `age` needs no preprocessing. Selecting with a list keeps it two-dimensional
# so it can be joined column-wise with the other transformed pieces.
x_train_age = x_train[["age"]].to_numpy()
x_test_age = x_test[["age"]].to_numpy()

In [None]:
# Stitch the individually transformed pieces back together column-wise, using
# the same column order for both splits: age, gender/city, cough, fever.
x_train_transformed = np.hstack(
    [x_train_age, x_train_gender_city, x_train_cough, x_train_fever]
)
x_test_transformed = np.hstack(
    [x_test_age, x_test_gender_city, x_test_cough, x_test_fever]
)


In [None]:
# Sanity check on the manual pipeline: age (1) + one-hot gender/city (4)
# + cough (1) + fever (1) columns.
x_train_transformed.shape

(80, 7)

# **We can see how complex it is to transform each column separately.**

# **Let's use ColumnTransformer**

In [None]:
# Each step is a (name, transformer, columns) triple. Columns that no step
# claims (here only `age`) are forwarded untouched because
# remainder='passthrough' is set.
impute_fever = ('tnf1', SimpleImputer(), ['fever'])
encode_cough = ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough'])
encode_gender_city = ('tnf3', OneHotEncoder(sparse=False, drop='first'), ['gender', 'city'])

transformer = ColumnTransformer(
    transformers=[impute_fever, encode_cough, encode_gender_city],
    remainder='passthrough',
)

In [None]:
# Fit every step of the ColumnTransformer on the training features and
# transform them in one call.
# NOTE(review): this rebinds `x_train` to the transformed output, so the raw
# training DataFrame is no longer available afterwards and re-running this
# cell presumably fails (the steps select columns by name) — confirm.
x_train= transformer.fit_transform(x_train)


In [None]:
# Shape of the transformed training features.
x_train.shape

(80, 7)

In [None]:
# Apply the transformer that was already fitted on the training split.
# Calling fit_transform here (as before) would re-learn the imputation value
# and the encoder categories from the test rows, leaking test information and
# making the test features inconsistent with the training features.
x_test = transformer.transform(x_test)

In [None]:
# Shape of the transformed test features.
x_test.shape

(20, 7)