# 常用的缺失值填充方法

In [1]:
import findspark
# Import numpy and scipy directly rather than through pyspark.ml.linalg:
# that module only binds them as a side effect of its own imports and loads
# scipy.sparse only, so scipy.interpolate was not guaranteed to be available.
import numpy as np
import scipy.interpolate

# Machine-specific locations of the PySpark installation and the Python
# interpreter; adjust these for your own environment.
spark_home = "D:\\Anaconda\\Lib\\site-packages\\pyspark"
python_path = "D:\\Anaconda\\python"
# Run findspark.init before importing pyspark so that the installation
# configured above is the one that gets picked up.
findspark.init(spark_home, python_path)

from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer
import pyspark.pandas as ps

# Session shared by every cell below.
spark = SparkSession.builder.appName('MissingValues').getOrCreate()



In [6]:
# Sample data: two float columns in which NaN marks a missing value.
# "income" is missing in the first three rows, "age" in the fourth.
nan = float("nan")
rows = [
    (1.0, nan),
    (2.0, nan),
    (2.0, nan),
    (nan, 3.0),
    (4.0, 4.0),
    (5.0, 5.0),
]
df = spark.createDataFrame(rows, ["age", "income"])
df.show()

+---+------+
|age|income|
+---+------+
|1.0|   NaN|
|2.0|   NaN|
|2.0|   NaN|
|NaN|   3.0|
|4.0|   4.0|
|5.0|   5.0|
+---+------+



## 删除缺失值的行

In [8]:
# Keep only the rows that contain no missing value in any column
# (how="any" is the default, spelled out here for clarity).
df1 = df.dropna(how="any")
df1.show()

+---+------+
|age|income|
+---+------+
|4.0|   4.0|
|5.0|   5.0|
+---+------+



## 使用统计值填充缺失值

In [11]:
# Impute missing values with the column mean.
# Input and output column names are the same, so the imputed values
# overwrite the original columns instead of being added as new ones.
imputer = Imputer(
    strategy="mean",
    inputCols=["age", "income"],
    outputCols=["age", "income"],
)

# fit() computes the per-column statistic; transform() applies it.
model = imputer.fit(df)
df2 = model.transform(df)
df2.show()

+---+------+
|age|income|
+---+------+
|1.0|   4.0|
|2.0|   4.0|
|2.0|   4.0|
|2.8|   3.0|
|4.0|   4.0|
|5.0|   5.0|
+---+------+



In [12]:
# Impute missing values with the column median.
# Input and output column names are the same, so the imputed values
# overwrite the original columns instead of being added as new ones.
imputer = Imputer(
    strategy="median",
    inputCols=["age", "income"],
    outputCols=["age", "income"],
)

# fit() computes the per-column statistic; transform() applies it.
model = imputer.fit(df)
df4 = model.transform(df)
df4.show()

+---+------+
|age|income|
+---+------+
|1.0|   4.0|
|2.0|   4.0|
|2.0|   4.0|
|2.0|   3.0|
|4.0|   4.0|
|5.0|   5.0|
+---+------+



In [13]:
# Impute missing values with the column mode (most frequent value).
# Input and output column names are the same, so the imputed values
# overwrite the original columns instead of being added as new ones.
imputer = Imputer(
    strategy="mode",
    inputCols=["age", "income"],
    outputCols=["age", "income"],
)

# fit() computes the per-column statistic; transform() applies it.
model = imputer.fit(df)
df5 = model.transform(df)
df5.show()

+---+------+
|age|income|
+---+------+
|1.0|   3.0|
|2.0|   3.0|
|2.0|   3.0|
|2.0|   3.0|
|4.0|   4.0|
|5.0|   5.0|
+---+------+



## 使用任意值填充缺失值

In [10]:
# Replace every missing value with the constant 0.
df3 = df.fillna(value=0)
df3.show()

+---+------+
|age|income|
+---+------+
|1.0|   0.0|
|2.0|   0.0|
|2.0|   0.0|
|0.0|   3.0|
|4.0|   4.0|
|5.0|   5.0|
+---+------+



## 线性插值

In [7]:
# Linear interpolation of missing values, one column at a time.
# Imported here explicitly: scipy.interpolate is not guaranteed to be loaded
# when scipy is only reached through another package's namespace.
from scipy.interpolate import interp1d


def _fill_linear(column):
    """Fill the NaNs of a 1-D float array in place by linear interpolation.

    The row position is used as the x coordinate. Gaps between observed
    values are interpolated; leading and trailing gaps are extrapolated from
    the nearest linear segment.

    A column with no NaN, or with fewer than two observed values (a line
    cannot be fitted), is returned unchanged.

    Args:
        column: 1-D writable float array; modified in place.

    Returns:
        The same array object, for convenience.
    """
    positions = np.arange(column.size)
    known = ~np.isnan(column)
    if known.all() or known.sum() < 2:
        return column
    # Fit on the observed points only, then evaluate at the missing positions.
    fit = interp1d(positions[known], column[known],
                   kind="linear", fill_value="extrapolate")
    column[~known] = fit(positions[~known])
    return column


# toPandas() collects the whole DataFrame onto the driver, so this approach
# is only suitable for data that fits in local memory.
# copy=True guarantees a private, writable float array, so the in-place fill
# below never writes into a buffer owned by pandas.
dfs = df.toPandas().to_numpy(dtype=float, copy=True)
for i in range(dfs.shape[1]):
    # dfs[:, i] is a view, so filling it updates dfs itself.
    _fill_linear(dfs[:, i])

# NOTE: df1 is rebound here to a NumPy array; the name was previously used
# for the Spark DataFrame returned by df.dropna().
df1 = dfs
result = ps.DataFrame(data=df1, columns=df.columns).to_spark()



In [8]:
# Display the Spark DataFrame built from the linearly interpolated values.
result.show()

+---+------+
|age|income|
+---+------+
|1.0|   0.0|
|2.0|   1.0|
|2.0|   2.0|
|3.0|   3.0|
|4.0|   4.0|
|5.0|   5.0|
+---+------+

