결측치 처리

In [2]:
import numpy as np
import pandas as pd

In [4]:
# Small demo frame: four people, four attribute columns and a class label.
# Some entries are placeholders ("unknown", age 1000, month 21) that later
# cells convert to NaN.
df = pd.DataFrame(
    data=[
        [42, "male", 12, "reading", "class2"],
        [35, "unknown", 3, "cooking", "class1"],
        [1000, "female", 7, "cycling", "class3"],
        [1000, "unknown", 21, "unknown", "unknown"],
    ],
    columns=["age", "gender", "month_birth", "hobby", "target"],
)

In [5]:
# Display the raw frame before any cleaning.
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42,male,12,reading,class2
1,35,unknown,3,cooking,class1
2,1000,female,7,cycling,class3
3,1000,unknown,21,unknown,unknown


In [None]:
# Distinct ages; used to spot values that cannot be real ages.
df["age"].unique()

array([  42,   35, 1000])

In [None]:
# Distinct gender labels; used to spot placeholder strings.
df["gender"].unique()

array(['male', 'unknown', 'female'], dtype=object)

In [None]:
# Distinct birth months; anything above 12 is not a valid month.
df["month_birth"].unique()

array([12,  3,  7, 21])

In [None]:
# Distinct hobby labels; used to spot placeholder strings.
df["hobby"].unique()

array(['reading', 'cooking', 'cycling', 'unknown'], dtype=object)

In [None]:
# Distinct target labels; used to spot placeholder strings.
df["target"].unique()

array(['class2', 'class1', 'class3', 'unknown'], dtype=object)

In [11]:
# Turn placeholder / out-of-range entries into NaN so pandas counts them as
# missing. Rules: an age above 150 or a birth month above 12 is invalid, and
# the literal string "unknown" marks a missing category.
#
# Series.mask returns a new column with the flagged entries set to NaN and the
# dtype up-cast where needed. That avoids writing NaN into the integer columns
# in place via .loc, an incompatible-dtype assignment that recent pandas
# versions deprecate.
df = df.assign(
    age=df["age"].mask(df["age"] > 150),
    gender=df["gender"].mask(df["gender"] == "unknown"),
    month_birth=df["month_birth"].mask(df["month_birth"] > 12),
    hobby=df["hobby"].mask(df["hobby"] == "unknown"),
    target=df["target"].mask(df["target"] == "unknown"),
)

In [12]:
# Display the frame after placeholder values were replaced with NaN.
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3
3,,,,,


In [14]:
# Number of missing entries in each column.
df.isna().sum()

age            2
gender         2
month_birth    1
hobby          1
target         1
dtype: int64

In [15]:
# Drop every row that contains at least one missing value.
df2 = df.dropna(axis="index", how="any")
df2

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2


In [16]:
# Drop every column that contains at least one missing value. Each column of
# df has a NaN, so only the row index remains.
df3 = df.dropna(axis="columns")
df3

0
1
2
3


In [None]:
# Drop only the rows in which every column is missing.
df4 = df.dropna(axis="index", how="all")
df4

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3


In [18]:
# Keep only the rows that have at least two non-missing values.
df5 = df.dropna(axis="index", thresh=2)
df5

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3


In [None]:
# Drop rows whose gender is missing; the other columns are not checked.
df6 = df.dropna(axis="index", subset=["gender"])
df6

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
2,,female,7.0,cycling,class3


In [None]:
# Replacement value for missing entries, keyed by column name.
alter_values = dict(
    age=0,
    gender="U",
    month_birth=0,
    hobby="U",
    target="class4",
)

In [24]:
# Fill each column's missing entries with that column's value from alter_values.
df7 = df.fillna(alter_values)
df7

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,class3
3,0.0,U,0.0,U,class4


클래스 라벨 설정

In [26]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Work on an independent copy. A plain `df8 = df7` would bind both names to the
# same frame, so any later change to df8["target"] would also change df7.
df8 = df7.copy()
class_label = LabelEncoder()
data_value = df8["target"].to_numpy()
# fit_transform learns the distinct labels and returns one integer code per row.
y_new = class_label.fit_transform(data_value)
y_new

array([1, 0, 2, 3])

In [None]:
# Replace the string labels in df8's target column with their integer codes.
df8["target"] = y_new
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,1
1,35.0,U,3.0,cooking,0
2,0.0,female,7.0,cycling,2
3,0.0,U,0.0,U,3


In [29]:
# Map the integer codes back to the original string labels using the fitted encoder.
y_ori = class_label.inverse_transform(y_new)
y_ori

array(['class2', 'class1', 'class3', 'class4'], dtype=object)

In [None]:
# Put the decoded string labels back into df8's target column.
df8["target"] = y_ori
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,class3
3,0.0,U,0.0,U,class4


In [None]:
# Sorted copy of the target labels. np.sort returns a new array and leaves df8
# untouched. Calling .sort() on the array returned by `.values` sorts in place,
# and that array can share memory with the column, which would silently
# reorder the labels stored in df8.
y_arr = np.sort(df8["target"].to_numpy())
y_arr

array(['class1', 'class2', 'class3', 'class4'], dtype=object)

In [None]:
# NOTE(review): `num_y` is not assigned in any visible cell, so this would raise
# NameError on a fresh kernel unless it is defined outside the visible part of
# the notebook. TODO: define it or remove this cell.
num_y