In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

## Numeric

In [3]:
data  = {
    'one': pd.Series([1,2,5], index=['a', 'b', 'e']),
    'two': pd.Series([1,2,3,4], index=['a', 'b', 'c', 'd']),
}

In [4]:
df = pd.DataFrame(data)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,,3.0
d,,4.0
e,5.0,


In [5]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(df)
df_1 = imputer.transform(df)
df_1

array([[1.        , 1.        ],
       [2.        , 2.        ],
       [2.66666667, 3.        ],
       [2.66666667, 4.        ],
       [5.        , 2.5       ]])

In [6]:
df_1 = np.round(df_1)
df_1

array([[1., 1.],
       [2., 2.],
       [3., 3.],
       [3., 4.],
       [5., 2.]])

In [7]:
mean_col_1 = df['one'].mean()
mean_col_2 = df['two'].mean()
mean_col_1, mean_col_2 = round(mean_col_1, 1), round(mean_col_2, 1)
df.two.fillna(mean_col_2, inplace=True)
df.one.fillna(mean_col_1, inplace=True)

In [8]:
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,2.7,3.0
d,2.7,4.0
e,5.0,2.5


## Non Numeric

In [9]:
df = pd.read_csv('Missing_Value_Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [10]:
mean_age  = df.Age.mean()
mean_salary  = df.Salary.mean()
mean_age, mean_salary = round(mean_age), round(mean_salary)
mean_age, mean_salary


(39, 63778)

In [11]:
mode_country = df.Country.mode()
mode_country[0]

'France'

In [12]:
mode_purchased = df.Purchased.mode()
mode_purchased[0]

'Yes'

In [13]:
df.Age.fillna(mean_age, inplace=True)
df.Salary.fillna(mean_salary, inplace=True)
df.Country.fillna(mode_country, inplace=True)
df.Purchased.fillna(mode_purchased, inplace=True)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,,38.0,61000.0,No
4,Germany,40.0,63778.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Label Encoder

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# df['Country'] = le.fit_transform(df.Country)

In [15]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,,38.0,61000.0,No
4,Germany,40.0,63778.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### One Hot Encoding

In [16]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder

In [17]:
one_hot_enc_data = pd.get_dummies(df, columns=['Country', 'Purchased'], dtype=int)

In [18]:
one_hot_enc_data

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain,Purchased_No,Purchased_Yes
0,44.0,72000.0,1,0,0,0,1
1,27.0,48000.0,0,0,1,0,1
2,30.0,54000.0,0,1,0,1,0
3,38.0,61000.0,0,0,0,1,0
4,40.0,63778.0,0,1,0,0,1
5,35.0,58000.0,1,0,0,0,1
6,39.0,52000.0,0,0,1,1,0
7,48.0,79000.0,1,0,0,0,1
8,50.0,83000.0,0,1,0,1,0
9,37.0,67000.0,1,0,0,0,1


In [19]:
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
categorical_columns

['Country', 'Purchased']

In [20]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df[categorical_columns])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))
df_encoded = pd.concat([df, one_hot_df], axis=1)
df_encoded = df_encoded.drop(categorical_columns, axis=1)
print(f"Encoded Employee data : \n{df_encoded}")

Encoded Employee data : 
    Age   Salary  Country_France  Country_Germany  Country_Spain  Country_nan  \
0  44.0  72000.0             1.0              0.0            0.0          0.0   
1  27.0  48000.0             0.0              0.0            1.0          0.0   
2  30.0  54000.0             0.0              1.0            0.0          0.0   
3  38.0  61000.0             0.0              0.0            0.0          1.0   
4  40.0  63778.0             0.0              1.0            0.0          0.0   
5  35.0  58000.0             1.0              0.0            0.0          0.0   
6  39.0  52000.0             0.0              0.0            1.0          0.0   
7  48.0  79000.0             1.0              0.0            0.0          0.0   
8  50.0  83000.0             0.0              1.0            0.0          0.0   
9  37.0  67000.0             1.0              0.0            0.0          0.0   

   Purchased_No  Purchased_Yes  
0           0.0            1.0  
1           0.0  