In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    train_test_split,
)  # used to split data set into (x, y) train and (x, y) test
from sklearn.preprocessing import (
    OrdinalEncoder,
    OneHotEncoder,
    LabelEncoder,
    MaxAbsScaler,
    StandardScaler,
)
from sklearn.linear_model import LogisticRegression, LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Load the toy COVID data set; the target column (has_covid) makes this a
# classification problem.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs on other machines.
COVID_CSV_PATH = r"/home/ahmed/Feature-engineering/DataSets/covid_toy.csv"
df = pd.read_csv(COVID_CSV_PATH)
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
# Count the missing entries in each column to see which features need imputing.
missing_per_column = df.isna().sum(axis=0)
missing_per_column

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [5]:
# Separate the feature columns from the target column (has_covid).
x_train = df.drop(columns="has_covid")
y_train = df["has_covid"]

# First, impute the missing values in the fever column


In [6]:
# Replace missing fever readings with the column mean.
impute = SimpleImputer(strategy="mean")
fever_column = x_train[["fever"]]  # double brackets keep it a 2-D frame
x_train_fixed_fever = impute.fit(fever_column).transform(fever_column)
x_train_fixed_fever.shape

(100, 1)

# Encode the ordinal categories


In [7]:
# Inspect the distinct cough levels before choosing an encoding order.
cough_counts = x_train["cough"].value_counts()
cough_counts

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [8]:
# Ordinal-encode cough; the explicit category list fixes the level order
# instead of leaving it to be inferred from the data.
oe = OrdinalEncoder(categories=[["Mild", "Strong"]])
cough_column = x_train[["cough"]]
x_train_fixed_cough = oe.fit(cough_column).transform(cough_column)
x_train_fixed_cough.shape

(100, 1)

# Encode the nominal categories


In [9]:
# One-hot encode the unordered categoricals; drop="first" removes one level
# per feature and sparse_output=False returns a dense array.
ohc = OneHotEncoder(drop="first", sparse_output=False)
nominal_columns = x_train[["gender", "city"]]
x_train_fixed_gender_city = ohc.fit(nominal_columns).transform(nominal_columns)
x_train_fixed_gender_city.shape

(100, 4)

In [10]:
# Select the age column explicitly rather than by dropping every other
# column: selection-by-exclusion would silently pull in any additional
# column present in the frame. Double brackets keep the result 2-D so it can
# be concatenated column-wise with the other feature blocks.
x_train_age = x_train[["age"]].to_numpy()

x_train_age.shape

(100, 1)

# Group the transformed columns together


In [11]:
# Stack the feature blocks side by side, in this order: age, imputed fever,
# encoded cough, then the one-hot gender/city columns.
feature_blocks = [
    x_train_age,
    x_train_fixed_fever,
    x_train_fixed_cough,
    x_train_fixed_gender_city,
]
x_train_transformed = np.hstack(feature_blocks)
x_train_transformed.shape

(100, 7)

# Let's make it easier!


In [12]:
# Declare the imputing and encoding steps once, as (name, transformer, columns)
# triples, and let a single ColumnTransformer apply them.
column_steps = [
    ("impute missing values", SimpleImputer(strategy="mean"), ["fever"]),
    (
        "edit_ordinal_categories",
        OrdinalEncoder(categories=[["Mild", "Strong"]]),
        ["cough"],
    ),
    (
        "edit_naminal_categories",
        OneHotEncoder(drop="first", sparse_output=False),
        ["gender", "city"],
    ),
]

# remainder="passthrough" forwards every column that is not listed in
# `transformers` unchanged; remainder="drop" would discard those columns.
Transformer = ColumnTransformer(transformers=column_steps, remainder="passthrough")
x_train_transformed = Transformer.fit(x_train).transform(x_train)
x_train_transformed.shape

(100, 7)