In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:
penguins = sns.load_dataset('penguins')
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [3]:
penguins.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

Separate Numerical and Categorical Columns

In [4]:
num_cols = penguins.select_dtypes('number').columns
print(num_cols.tolist())

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']


In [5]:
cat_cols = penguins.select_dtypes('number').columns
print(num_cols.tolist())

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']


Handling Missing Values

In [6]:
from sklearn.impute import SimpleImputer

num_imp = SimpleImputer()
cat_imp = SimpleImputer(strategy='most_frequent')

In [7]:
penguins[num_cols] = num_imp.fit_transform(penguins[num_cols])
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.10000,18.70000,181.000000,3750.000000,Male
1,Adelie,Torgersen,39.50000,17.40000,186.000000,3800.000000,Female
2,Adelie,Torgersen,40.30000,18.00000,195.000000,3250.000000,Female
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,
4,Adelie,Torgersen,36.70000,19.30000,193.000000,3450.000000,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,43.92193,17.15117,200.915205,4201.754386,
340,Gentoo,Biscoe,46.80000,14.30000,215.000000,4850.000000,Female
341,Gentoo,Biscoe,50.40000,15.70000,222.000000,5750.000000,Male
342,Gentoo,Biscoe,45.20000,14.80000,212.000000,5200.000000,Female


In [8]:
penguins[['sex']] = cat_imp.fit_transform(penguins[['sex']])
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.10000,18.70000,181.000000,3750.000000,Male
1,Adelie,Torgersen,39.50000,17.40000,186.000000,3800.000000,Female
2,Adelie,Torgersen,40.30000,18.00000,195.000000,3250.000000,Female
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,Male
4,Adelie,Torgersen,36.70000,19.30000,193.000000,3450.000000,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,43.92193,17.15117,200.915205,4201.754386,Male
340,Gentoo,Biscoe,46.80000,14.30000,215.000000,4850.000000,Female
341,Gentoo,Biscoe,50.40000,15.70000,222.000000,5750.000000,Male
342,Gentoo,Biscoe,45.20000,14.80000,212.000000,5200.000000,Female


Encoding Categorical Values

drop = first, to avoid dummy variable trap.....jub hum Onehotencoding karte hai, to har category ke liya ek new column ban jata hai. agar aap n categories ko encode karo, to n columns sa banega. lekin inme se ek columns humesha baki columns ke combination se predict kiya ja sakta hai. Ise kehte hai dummy variable trap, aur ye multicollinearity create karte hai(jo linear models ke problem hai)

In [9]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

sex_enc = OneHotEncoder(drop='first')
sex_dummy = sex_enc.fit_transform(penguins[['sex']]).toarray()
sex_dummy_df = pd.DataFrame(sex_dummy)

In [10]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

# Load dataset
penguins = sns.load_dataset('penguins')

# Define categorical columns
cat_cols = ['species', 'island']

# OneHotEncoding
cat_enc = OneHotEncoder(drop='first')
dummy_cols = cat_enc.fit_transform(penguins[cat_cols]).toarray()

# Convert to DataFrame
dummy_df = pd.DataFrame(dummy_cols, columns=cat_enc.get_feature_names_out(cat_cols))


In [11]:
cat_cols = ['species','island']
cat_enc = OneHotEncoder(drop='first')
dummy_cols = cat_enc.fit_transform(penguins[cat_cols]).toarray()
dummy_df = pd.DataFrame(dummy_cols)

In [12]:
penguins['species'].value_counts()

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64

In [13]:
penguins['island'].value_counts

<bound method IndexOpsMixin.value_counts of 0      Torgersen
1      Torgersen
2      Torgersen
3      Torgersen
4      Torgersen
         ...    
339       Biscoe
340       Biscoe
341       Biscoe
342       Biscoe
343       Biscoe
Name: island, Length: 344, dtype: object>

In [14]:
clean_df = pd.concat([penguins, dummy_df, sex_dummy_df,], axis = 1).drop(columns = cat_cols+['sex'])
clean_df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,0,1,2,3,0.1
0,39.1,18.7,181.0,3750.0,0.0,0.0,0.0,1.0,1.0
1,39.5,17.4,186.0,3800.0,0.0,0.0,0.0,1.0,0.0
2,40.3,18.0,195.0,3250.0,0.0,0.0,0.0,1.0,0.0
3,,,,,0.0,0.0,0.0,1.0,1.0
4,36.7,19.3,193.0,3450.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
339,,,,,0.0,1.0,0.0,0.0,1.0
340,46.8,14.3,215.0,4850.0,0.0,1.0,0.0,0.0,0.0
341,50.4,15.7,222.0,5750.0,0.0,1.0,0.0,0.0,1.0
342,45.2,14.8,212.0,5200.0,0.0,1.0,0.0,0.0,0.0


Scaler Numeric Values

In [15]:
num_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']  # Example columns from penguins dataset

In [16]:
scaler = StandardScaler()
clean_df[num_cols]= scaler.fit_transform(clean_df[num_cols])
clean_df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,0,1,2,3,0.1
0,-0.884499,0.785449,-1.418347,-0.564142,0.0,0.0,0.0,1.0,1.0
1,-0.811126,0.126188,-1.062250,-0.501703,0.0,0.0,0.0,1.0,0.0
2,-0.664380,0.430462,-0.421277,-1.188532,0.0,0.0,0.0,1.0,0.0
3,,,,,0.0,0.0,0.0,1.0,1.0
4,-1.324737,1.089724,-0.563715,-0.938776,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
339,,,,,0.0,1.0,0.0,0.0,1.0
340,0.527932,-1.445897,1.003109,0.809516,0.0,1.0,0.0,0.0,0.0
341,1.188289,-0.735923,1.501644,1.933419,0.0,1.0,0.0,0.0,1.0
342,0.234440,-1.192335,0.789451,1.246590,0.0,1.0,0.0,0.0,0.0


Pipeline

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

df = sns.load_dataset('penguins')
num_cols = df.select_dtypes('number').columns
cat_cols = df.select_dtypes(exclude='number').columns

num_pipeline = Pipeline(
    steps =(
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler())
    )
)

In [18]:
cat_pipeline = Pipeline(
    steps = (
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first'))
    )
)    

In [19]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ]
)        

In [20]:
preprocessor.fit_transform(df)

array([[-0.88708123,  0.78774251, -1.42248782, ...,  0.        ,
         1.        ,  1.        ],
       [-0.81349399,  0.12655633, -1.06535169, ...,  0.        ,
         1.        ,  0.        ],
       [-0.66631952,  0.43171918, -0.42250666, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.1917582 , -0.73807176,  1.50602843, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.23512413, -1.19581604,  0.79175618, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.09977416, -0.53462985,  0.8631834 , ...,  0.        ,
         0.        ,  1.        ]], shape=(344, 9))

In [22]:
import gradio as gr