In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score

In [92]:
dataset = load_boston()
dataset.data.shape
#总共506*13=6578个数据
X_full, y_full = dataset.data, dataset.target
n_samples = X_full.shape[0]
n_features = X_full.shape[1]


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [93]:
n_samples

506

In [94]:
n_features

13

加入缺失值

In [95]:
#加入50%的na 
rng = np.random.RandomState(0)#设置一个随机数种子
missing_rate=0.5#50%的Na
n_missing_samples=int(np.floor(n_samples*n_features*missing_rate))


In [96]:
n_missing_samples   #设置的missing_samples 共有3289个

3289

In [97]:
#所有数据要随机遍布在数据集的各行各列当中，而一个缺失的数据会需要一个行索引和一个列索引
#如果能够创造一个数组，包含3289个分布在0~506中间的行索引，和3289个分布在0~13之间的列索引，那我们就可以利用索引来为数据中的任意3289个位置赋空值
missing_features = rng.randint(0,n_features,n_missing_samples) 
missing_samples = rng.randint(0,n_samples,n_missing_samples)
#randint(下限，上限，n) 在这个区域随机取n个整数

## .   [missing_samples,missing_features] 就是na的位置索引


#missing_samples = rng.choice(dataset.data.shape[0],n_missing_samples,replace=False) #行不重复false
#我们现在采样了3289个数据，远远超过我们的样本量506，所以我们使用随机抽取的函数randint。但如果我们需要的数据量小于我们的样本量506，那我们可以采用np.random.choice来抽样，choice会随机抽取不重复的随机数


In [98]:
#原数据不动
x_missing = X_full.copy()
y_missing = y_full.copy()

x_missing[missing_samples,missing_features]=np.nan #把这些位置设置为nan


x_missing =pd.DataFrame(x_missing)


#x_missing[missing_samples,missing_features]=np.nan

In [99]:
#x_misding 中的nan
x_missing.isnull().sum() 

0     200
1     201
2     200
3     203
4     202
5     201
6     185
7     197
8     196
9     197
10    204
11    214
12    189
dtype: int64

生成x_missing含有50%缺失值的矩阵

In [100]:
x_missing

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,18.0,,,0.538,,65.2,4.0900,1.0,296.0,,,4.98
1,0.02731,0.0,,0.0,0.469,,78.9,4.9671,2.0,,,396.90,9.14
2,0.02729,,7.07,0.0,,7.185,61.1,,2.0,242.0,,,
3,,,,0.0,0.458,,45.8,,,222.0,18.7,,
4,,0.0,2.18,0.0,,7.147,,,,,18.7,,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,0.0,0.573,,69.1,,1.0,,21.0,,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,,396.90,9.08
503,,,11.93,,0.573,6.976,91.0,,,,21.0,,5.64
504,0.10959,0.0,11.93,,0.573,,89.3,,1.0,,21.0,393.45,6.48


利用随机森林填补

In [101]:
x_missing_reg = x_missing.copy()
x_missing_reg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,18.0,,,0.538,,65.2,4.0900,1.0,296.0,,,4.98
1,0.02731,0.0,,0.0,0.469,,78.9,4.9671,2.0,,,396.90,9.14
2,0.02729,,7.07,0.0,,7.185,61.1,,2.0,242.0,,,
3,,,,0.0,0.458,,45.8,,,222.0,18.7,,
4,,0.0,2.18,0.0,,7.147,,,,,18.7,,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,0.0,0.573,,69.1,,1.0,,21.0,,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,,396.90,9.08
503,,,11.93,,0.573,6.976,91.0,,,,21.0,,5.64
504,0.10959,0.0,11.93,,0.573,,89.3,,1.0,,21.0,393.45,6.48


In [102]:
sort_index=np.argsort(x_missing.isnull().sum(axis=0)).values #列索引中nan值的数量从小到大排序  eg：六列含有nan最少
sort_index

array([ 6, 12,  8,  7,  9,  0,  2,  1,  5,  4,  3, 10, 11])

In [110]:
for i in sort_index:
    #构建新的特征矩阵和新的标签
    df = x_missing_reg
    fillc =df.iloc[:,i]
    df = pd.concat([df.iloc[:,df.columns!=i],pd.DataFrame(y_full)],axis=1)

    #在新特征矩阵中，对含有缺失值的列，进行0的填补
    df_0=SimpleImputer(missing_values=np.nan,strategy='constant',fill_value=0).fit_transform(df)
    """
    1.使用SimpleImputer,生成的df_0是array,后面能用df_0[ytain.index,:]取值
    2.如果使用fillna 输出的是dataframe形式的数据,用df_0[ytain.index,:]取值会报错

    """

    #找出训练集和测试集
    ytrain = fillc[fillc.notnull()]
    ytest = fillc[fillc.isnull()]
    xtrain = df_0[ytrain.index,:]
    xtest = df_0[ytest.index,:]

    #用随机森林训练
    rfc = RandomForestRegressor(n_estimators=100)
    rfc = rfc.fit(xtrain,ytrain)
    ypredict = rfc.predict(xtest)

    #将填补好的缺失值返回原始的特征矩阵中
    x_missing_reg.loc[x_missing_reg.iloc[:,i].isnull(),i]=ypredict


    

没有缺失值了


In [129]:
x_missing_reg 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.220717,18.000,6.8038,0.16,0.538000,6.64174,65.200,4.090000,1.00,296.00,18.359,390.7057,4.9800
1,0.027310,0.000,5.6376,0.00,0.469000,6.15387,78.900,4.967100,2.00,279.39,18.467,396.9000,9.1400
2,0.027290,15.320,7.0700,0.00,0.466850,7.18500,61.100,4.272720,2.00,242.00,17.472,386.9499,4.8371
3,0.098456,21.345,3.0983,0.00,0.458000,6.76061,45.800,4.673109,3.65,222.00,18.700,392.4417,6.0752
4,0.061019,0.000,2.1800,0.00,0.463257,7.14700,55.646,4.897042,3.99,264.70,18.700,392.8113,5.3300
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.434832,0.720,8.5116,0.00,0.573000,6.20662,69.100,3.121522,1.00,289.67,21.000,390.9428,9.6700
502,0.045270,0.000,11.9300,0.00,0.573000,6.12000,76.700,2.287500,1.00,273.00,19.056,396.9000,9.0800
503,0.676748,0.960,11.9300,0.27,0.573000,6.97600,91.000,2.599812,6.36,361.86,21.000,383.3128,5.6400
504,0.109590,0.000,11.9300,0.00,0.573000,6.30532,89.300,3.023568,1.00,261.03,21.000,393.4500,6.4800
