# 2.2 数据预处理

In [1]:
import os

#### 1 创建人工数据集并存储于 CSV 文件

In [2]:
# Make sure the target folder exists before anything is written into it.
os.makedirs(os.path.join('.', 'NotesData'), exist_ok=True)
# Path of the small hand-made CSV file that the following cells read back.
data_file = os.path.join('.', 'NotesData', 'house_tiny.csv')
# Header row followed by four samples; 'NA' marks a missing value.
csv_lines = [
    'NumRooms,RoofType,Price\n',
    'NA,NA,127500\n',
    '2,NA,106000\n',
    '4,Slate,178100\n',
    'NA,NA,140000\n',
]
with open(data_file, 'w') as f:
    f.writelines(csv_lines)

#### 2 使用 pandas 读取数据

In [3]:
import pandas as pd
# Load the CSV created above into a DataFrame; the 'NA' entries in the file
# show up as NaN in the printed frame.
data = pd.read_csv(filepath_or_buffer=data_file)
print(data)

   NumRooms RoofType   Price
0       NaN      NaN  127500
1       2.0      NaN  106000
2       4.0    Slate  178100
3       NaN      NaN  140000


#### 3 处理缺失数据 NA —— 插值/删除

In [4]:
# Split the frame by position: the first two columns are the features,
# the last column (Price) is the target.
inputs = data.iloc[:, 0:2]
outputs = data.iloc[:, 2]
print(inputs)
print(outputs)

# Replace missing numeric values with the column mean.
# numeric_only=True restricts the mean to numeric columns; without it the
# string column RoofType makes DataFrame.mean raise TypeError on pandas >= 2.0.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
print(inputs)
# One-hot encode the non-numeric columns; dummy_na=True gives NaN its own
# indicator column. dtype=int keeps the indicators as 0/1 integers
# (pandas >= 2.0 would otherwise produce True/False booleans).
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
print(inputs)

   NumRooms RoofType
0       NaN      NaN
1       2.0      NaN
2       4.0    Slate
3       NaN      NaN
0    127500
1    106000
2    178100
3    140000
Name: Price, dtype: int64
   NumRooms RoofType
0       3.0      NaN
1       2.0      NaN
2       4.0    Slate
3       3.0      NaN
   NumRooms  RoofType_Slate  RoofType_nan
0       3.0               0             1
1       2.0               0             1
2       4.0               1             0
3       3.0               0             1


#### 4 将纯数值数据转换为张量 tensor

In [5]:
import torch
# Convert the preprocessed features and the target into tensors.
# Requesting a float array explicitly guards against boolean indicator columns
# (the pandas >= 2.0 get_dummies default): mixed float/bool columns would make
# `.values` an object array, which torch.tensor cannot convert.
X = torch.tensor(inputs.to_numpy(dtype=float))
# The target column keeps its own dtype (int64 in the output shown below).
y = torch.tensor(outputs.to_numpy())
print(X)
print(y)

tensor([[3., 0., 1.],
        [2., 0., 1.],
        [4., 1., 0.],
        [3., 0., 1.]], dtype=torch.float64)
tensor([127500, 106000, 178100, 140000])
