In [109]:
from io import StringIO

import pandas as pd

csv_data = """A, B, C, D
1.0, 2.0, 3.0, 4.0
5.0, 6.0,,8.0
10.0, 11.0, 12.0"""

df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [110]:
df.isnull().sum()

A     0
 B    0
 C    1
 D    1
dtype: int64

In [111]:
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [112]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [113]:
# Only drop rows where all columns are NaN
df.dropna(how="all")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [114]:
# Drop rows that have less than 4 real values.
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [115]:
df.dropna(subset=["C"])

KeyError: ['C']

In [65]:
import numpy as np
from sklearn.impute import SimpleImputer

imr = SimpleImputer(missing_values=np.nan, strategy="mean")
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'green'

In [125]:
df = pd.DataFrame(
    [
        ["green", "M", 10.1, "class1"],
        ["red", "L", 13.5, "class2"],
        ["blue", "XL", 15.3, "class1"],
    ]
)
df.columns = ["color", "size", "price", "classlabel"]
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [67]:
size_mapping = {"XL": 3, "L": 2, "M": 1}
df["size"] = df["size"].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [69]:
inv_size_mapping = {v: k for (k, v) in size_mapping.items()}
df["size"] = df["size"].map(inv_size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [70]:
import numpy as np

class_mapping = {label: idx for idx, label in enumerate(np.unique(df.classlabel))}
class_mapping

{'class1': 0, 'class2': 1}

In [71]:
df.classlabel = df.classlabel.map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,XL,15.3,0


In [108]:
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df.classlabel = df.classlabel.map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,
1,red,L,13.5,
2,blue,XL,15.3,


In [73]:
from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()
y = class_le.fit_transform(df.classlabel.values)
y

array([0, 1, 0])

In [75]:
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

In [131]:
X = df[["color", "size", "price"]].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([['green', 'M', 10.1],
       ['red', 'L', 13.5],
       ['blue', 'XL', 15.3]], dtype=object)

In [134]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(categories=[np.unique(X[:, 0]), [], []])  # Select column 0
ohe.fit_transform(X).toarray()

array(['blue', 'green', 'red'], dtype=object)

In [135]:
pd.get_dummies(df[["price", "color", "size"]])

Unnamed: 0,price,color_blue,color_green,color_red,size_L,size_M,size_XL
0,10.1,0,1,0,0,1,0
1,13.5,0,0,1,1,0,0
2,15.3,1,0,0,0,0,1


In [136]:
pd.get_dummies(df[["price", "color", "size"]], drop_first=True)

Unnamed: 0,price,color_green,color_red,size_M,size_XL
0,10.1,1,0,1,0
1,13.5,0,1,0,0
2,15.3,0,0,0,1
