In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Dealing with missing data

In [2]:
from io import StringIO

# Small in-memory CSV with two gaps: the second data row has no value for C
# and the third has none for D. The blank first line (right after the opening
# quotes) is skipped by read_csv.
csv_data = """
A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,
"""

# Wrap the text in a file-like buffer so read_csv can parse it;
# the empty fields are loaded as NaN.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [3]:
# Count the missing (NaN) entries in each column of the table
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [4]:
# Two ways to discard incomplete data with dropna

# 1. Drop every record (row) that contains at least one NaN
print(df.dropna(axis='index'))

# 2. Drop every feature (column) that contains at least one NaN
print(df.dropna(axis='columns'))

     A    B    C    D
0  1.0  2.0  3.0  4.0
      A     B
0   1.0   2.0
1   5.0   6.0
2  10.0  11.0


In [5]:
# Finer-grained control over which records get dropped.
# The three results are printed one after another, each on its own line block.
print(
    # a. Drop a row only when *all* of its columns are NaN.
    #    No row here is entirely NaN, so the whole frame comes back unchanged.
    df.dropna(how='all'),

    # b. Keep only the rows that have at least `thresh` (here 4) non-NaN values.
    df.dropna(thresh=4),

    # c. Drop a row only when NaN appears in the listed columns (here: 'C').
    df.dropna(subset=['C']),

    sep="\n",
)

      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN
     A    B    C    D
0  1.0  2.0  3.0  4.0
      A     B     C    D
0   1.0   2.0   3.0  4.0
2  10.0  11.0  12.0  NaN


# Imputing missing values

In [8]:
# Replace each NaN with the mean of its column
from sklearn.impute import SimpleImputer

# fit() hands back the fitted imputer, so it can be chained onto the constructor
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(df.values)

# transform() returns a bare NumPy array, so re-attach the column labels
imputed_data = pd.DataFrame(imp.transform(df.values), columns=df.columns)
imputed_data

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


# Categorical data encoding with pandas

In [9]:
# Toy table mixing a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


**1. Mapping ordinal features**

In [11]:
# Ordinal encoding: sizes have a natural order (M < L < XL), so each label is
# mapped to an integer that preserves that order.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}

# The 'size' column is overwritten in place, so guard against re-running the
# cell: once the column is numeric, mapping it again would match no key in
# size_mapping and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['size']):
    df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


**2. Encoding labels (using LabelEncoder)**

In [12]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct class labels first, then encode them as integer codes
class_le = LabelEncoder().fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
y

array([1, 0, 1])

In [13]:
# Inverse transform: map the integer codes in y back to the original class labels
class_le.inverse_transform(y)

array(['class2', 'class1', 'class2'], dtype=object)

**3. One-Hot Encoding on Nominal Features**

In [20]:
from sklearn.preprocessing import OneHotEncoder

# NOTE: OneHotEncoder encodes every column it is given (numerical as well as
#       categorical), so only the color column is passed to it here.
X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
color_transformed = (
    color_ohe
    .fit_transform(X[:, [0]])  # list index keeps the column 2-D: (n_samples, 1)
    .toarray()                 # convert the encoder's result to a dense array
)
color_transformed_features = color_ohe.get_feature_names_out()
pd.DataFrame(color_transformed, columns=color_transformed_features)


Unnamed: 0,x0_blue,x0_green,x0_red
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0


In [45]:
# One-hot encode the color column and carry the remaining columns along
# unchanged, in a single step, using ColumnTransformer
from sklearn.compose import ColumnTransformer

X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), [0]),       # column 0 (color) -> indicator columns
        ('nothing', 'passthrough', [1, 2]),     # columns 1, 2 (size, price) kept as-is
    ]
)

# Cast the combined result to float so the final table is purely numeric
X_transformed = c_transf.fit_transform(X).astype(float)

# Column labels: the encoder's generated names first, then the names of the
# two passthrough columns, matching the order of the transformers above
onehot_step = c_transf.named_transformers_['onehot']
color_transformed_features = onehot_step.get_feature_names_out()
feature_names = np.concatenate(
    [color_transformed_features, df.columns[[1, 2]].values]
)

pd.DataFrame(X_transformed, columns=feature_names)

Unnamed: 0,x0_blue,x0_green,x0_red,size,price
0,0.0,1.0,0.0,1.0,10.1
1,0.0,0.0,1.0,2.0,13.5
2,1.0,0.0,0.0,3.0,15.3


In [47]:
# One-hot encoding with pandas get_dummies
# NOTE: only the string column ('color') is expanded into indicator columns;
#       the numeric columns ('price', 'size') are left unchanged.
# dtype=int pins the indicators to 0/1. Since pandas 2.0 the default indicator
# dtype is bool, which would display True/False instead.
pd.get_dummies(df[['price', 'color', 'size']], dtype=int)

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0
