In [1]:
# findspark has to run before the first `import pyspark...`: it locates the
# Spark installation and puts its Python bindings on sys.path, so calling it
# after pyspark has already been imported defeats its purpose.
import findspark

findspark.init()

import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession

# Get (or reuse) a session on the local[*] master; the SQL session time zone
# is set explicitly to 'Asia/Shanghai'.
spark = (
    SparkSession.builder
    .appName("pca")
    .config('spark.sql.session.timeZone', 'Asia/Shanghai')
    .master("local[*]")
    .getOrCreate()
)



In [2]:
import warnings

# Silence every warning from here on.
warnings.filterwarnings('ignore')

# Read the training CSV and convert the "时间" column to datetime in one
# expression; assigning to an existing column keeps its position.
df = pd.read_csv('D:/AllData/训练数据集.csv', encoding='utf-8').assign(
    **{"时间": lambda frame: pd.to_datetime(frame["时间"])}
)
df.dtypes

时间              datetime64[ns]
燃机转速                   float64
燃机负荷                   float64
压气机排气温度                float64
转子冷却空气温度均值             float64
TCA冷却器进口给水压力           float64
TCA冷却器进口给水温度           float64
TCA进口冷却水流量             float64
TCA出口给水压力              float64
TCA出口给水温度              float64
dtype: object

In [3]:
# Convert the pandas frame into a Spark DataFrame and preview its first rows.
df_spark = spark.createDataFrame(data=df)
df_spark.show(n=10)

+-------------------+-----------+-----------+--------------+--------------------+---------------------+---------------------+-----------------+---------------+---------------+
|               时间|   燃机转速|   燃机负荷|压气机排气温度|转子冷却空气温度均值|TCA冷却器进口给水压力|TCA冷却器进口给水温度|TCA进口冷却水流量|TCA出口给水压力|TCA出口给水温度|
+-------------------+-----------+-----------+--------------+--------------------+---------------------+---------------------+-----------------+---------------+---------------+
|2021-05-10 18:54:00|   2997.375|   183.4375|    402.677887|         185.4125214|          13.91208363|          140.5234222|       93.4726944|    13.79161358|    221.0665283|
|2021-05-10 18:55:00|   2997.125|182.8424835|   407.1754456|          185.591095|          13.91452885|          140.6161804|      92.52132416|    13.79442024|    224.1034393|
|2021-05-10 18:56:00|   2997.625|178.2356262|   408.6000061|         185.8658447|          13.91697407|          140.7089233|      91.48206329|    13.79722595|    226.7919006|
|2021-05

In [4]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.ml import Pipeline, PipelineModel

# Feature columns: positions 1-4 of the Spark frame (index 0 is skipped).
select_columns = df_spark.columns[1:5]

# Pack the selected columns into a single vector column named 'features',
# appended to a frame that contains only those columns.
assembler = VectorAssembler(outputCol='features', inputCols=select_columns)
df_spark_slice = assembler.transform(df_spark.select(select_columns))

In [5]:
# Preview the assembled frame without truncating the vector column.
df_spark_slice.show(n=10, truncate=False)

+-----------+-----------+--------------+--------------------+-------------------------------------------------+
|燃机转速   |燃机负荷   |压气机排气温度|转子冷却空气温度均值|features                                         |
+-----------+-----------+--------------+--------------------+-------------------------------------------------+
|2997.375   |183.4375   |402.677887    |185.4125214         |[2997.375,183.4375,402.677887,185.4125214]       |
|2997.125   |182.8424835|407.1754456   |185.591095          |[2997.125,182.8424835,407.1754456,185.591095]    |
|2997.625   |178.2356262|408.6000061   |185.8658447         |[2997.625,178.2356262,408.6000061,185.8658447]   |
|2997.75    |175.1687622|408.5301819   |186.3105621         |[2997.75,175.1687622,408.5301819,186.3105621]    |
|2998.269287|173.5149994|408.3426208   |186.6753693         |[2998.269287,173.5149994,408.3426208,186.6753693]|
|3000.28125 |174.1581116|408.1695251   |186.9903717         |[3000.28125,174.1581116,408.1695251,186.9903717] |
|2997.739502|178.

In [10]:
# PCA over the assembled 'features' vector, keeping 4 components (the same
# number as there are input columns); results land in 'pca_features'.
pca = PCA(inputCol='features', outputCol='pca_features', k=4)
model = pca.fit(df_spark_slice)

# Project the same rows the model was fitted on and preview them untruncated.
df_spark_pca = model.transform(df_spark_slice)
df_spark_pca.show(n=10, truncate=False)

+-----------+-----------+--------------+--------------------+-------------------------------------------------+-------------------------------------------------------------------------------+
|燃机转速   |燃机负荷   |压气机排气温度|转子冷却空气温度均值|features                                         |pca_features                                                                   |
+-----------+-----------+--------------+--------------------+-------------------------------------------------+-------------------------------------------------------------------------------+
|2997.375   |183.4375   |402.677887    |185.4125214         |[2997.375,183.4375,402.677887,185.4125214]       |[-356.0827024825528,246.1785255784866,-49.74064693965265,-3004.0906446678155]  |
|2997.125   |182.8424835|407.1754456   |185.591095          |[2997.125,182.8424835,407.1754456,185.591095]    |[-357.333945125733,248.92199108711876,-46.33710071640684,-3004.1019797189992]  |
|2997.625   |178.2356262|408.6000061   |185.8658447         |[299

In [8]:
# Explained-variance vector of the fitted PCA model, one entry per component.
# NOTE(review): `model` is rebound to the result of `pipe.fit(...)` in a later
# cell, so this expression must be evaluated before that cell runs.
model.explainedVariance

DenseVector([0.9883, 0.0083, 0.0024, 0.0009])

In [48]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.feature import PCA
from pyspark.ml import Pipeline, PipelineModel

from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType
 
def to_array(col):
    """Convert a column of ML vectors into a column of ``array<double>``.

    Uses Spark's built-in ``vector_to_array`` rather than a Python UDF, so the
    conversion is not executed row by row in a Python worker.

    Args:
        col: The vector column to convert (a Column or a column name). Note
            that this parameter shadows ``pyspark.sql.functions.col`` inside
            the function body.

    Returns:
        A Column whose values are arrays of doubles.
    """
    # Local import keeps the helper self-contained regardless of which import
    # cells have been executed.
    from pyspark.ml.functions import vector_to_array

    return vector_to_array(col)

In [49]:
# Seeded 80/20 random split of the full Spark frame.
splits = df_spark.randomSplit(weights=[0.8, 0.2], seed=123)
train, test = splits[0], splits[1]

# Row counts, one per line: full frame, training part, test part.
print(df_spark.count(), train.count(), test.count(), sep='\n')

62637
50031
12606


In [50]:
# Feature columns for the pipeline: positions 1-4 of the Spark frame.
select_variables = df_spark.columns[1:5]

# Select with the list defined in this cell. The previous version used
# `select_columns`, a name that only exists if an earlier cell has been run.
train_select = train.select(select_variables)
test_select = test.select(select_variables)

# Number of principal components to keep.
feature_nums = 3

# Pipeline stages: assemble the feature vector -> min-max scale it -> PCA.
assembler = VectorAssembler(inputCols = select_variables, outputCol = 'features')
normlizer = MinMaxScaler(inputCol='features', outputCol='scaler_fs')
pca = PCA(k = feature_nums, inputCol = 'scaler_fs', outputCol = 'pca_features')
pipe = Pipeline(stages=[assembler, normlizer, pca])

# Fit on the training split only; the fitted model is applied to the test
# split in a later cell.
model = pipe.fit(train_select)

train_select_pca = model.transform(train_select)
train_select_pca.show(10, False)

+-----------+-----------+--------------+--------------------+-------------------------------------------------+--------------------------------------------------------------------------------+----------------------------------------------------------------+
|燃机转速   |燃机负荷   |压气机排气温度|转子冷却空气温度均值|features                                         |scaler_fs                                                                       |pca_features                                                    |
+-----------+-----------+--------------+--------------------+-------------------------------------------------+--------------------------------------------------------------------------------+----------------------------------------------------------------+
|2997.375   |183.4375   |402.677887    |185.4125214         |[2997.375,183.4375,402.677887,185.4125214]       |[0.11108299385201856,0.40884103685047557,0.03985193396075705,0.6278691676653662]|[-0.43324556146422233,0.14022961846990242,0.029851196114367

In [52]:
# Turn the PCA vector into an array, expose each component as its own
# 'pca_features_<n>' column (1-based), then drop the intermediate vector and
# array columns.
train_select_pca_rename = (
    train_select_pca
    .withColumn("pca_features", to_array(col("pca_features")))
    .select(
        *train_select_pca.columns,
        *[col("pca_features")[k].alias(f'pca_features_{k + 1}') for k in range(feature_nums)]
    )
    .drop('features', 'scaler_fs', 'pca_features')
)

train_select_pca_rename.show(n=10, truncate=False)

+-----------+-----------+--------------+--------------------+--------------------+-------------------+---------------------+
|燃机转速   |燃机负荷   |压气机排气温度|转子冷却空气温度均值|pca_features_1      |pca_features_2     |pca_features_3       |
+-----------+-----------+--------------+--------------------+--------------------+-------------------+---------------------+
|2997.375   |183.4375   |402.677887    |185.4125214         |-0.43324556146422233|0.14022961846990242|0.02985119611436793  |
|2997.125   |182.8424835|407.1754456   |185.591095          |-0.48401143392363655|0.10562389985895515|-0.007033815374733743|
|2997.75    |175.1687622|408.5301819   |186.3105621         |-0.46873704092877877|0.19713352166777867|-0.05298678570142337 |
|2998.269287|173.5149994|408.3426208   |186.6753693         |-0.4572549001263152 |0.2735055685490851 |-0.060371890340298384|
|3000.28125 |174.1581116|408.1695251   |186.9903717         |-0.44111564153057864|0.5711932268976587 |-0.05932912223105241 |
|2997.5     |173.7774963|

In [53]:
### apply to test data
# Reuse the already-fitted pipeline model on the test split.
test_select_pca = model.transform(test_select)

# Turn the PCA vector into an array, expose each component as its own
# 'pca_features_<n>' column (1-based), then drop the intermediate vector and
# array columns.
test_select_pca_rename = (
    test_select_pca
    .withColumn("pca_features", to_array(col("pca_features")))
    .select(
        *test_select_pca.columns,
        *[col("pca_features")[k].alias(f'pca_features_{k + 1}') for k in range(feature_nums)]
    )
    .drop('features', 'scaler_fs', 'pca_features')
)

test_select_pca_rename.show(n=10, truncate=False)

+-----------+-----------+--------------+--------------------+--------------------+-------------------+---------------------+
|燃机转速   |燃机负荷   |压气机排气温度|转子冷却空气温度均值|pca_features_1      |pca_features_2     |pca_features_3       |
+-----------+-----------+--------------+--------------------+--------------------+-------------------+---------------------+
|2997.625   |178.2356262|408.6000061   |185.8658447         |-0.4802873520062039 |0.17933237067000457|-0.03879633389091153 |
|2997.739502|178.1350098|408.1264954   |187.2728424         |-0.4766505246192313 |0.19619052830197886|-0.040740048741766094|
|2997.807617|172.9899902|407.8682556   |188.4824677         |-0.4578815518708461 |0.20510241444277222|-0.06499080010983763 |
|2998.093018|171.7299957|407.8252258   |188.5950012         |-0.4508372243452148 |0.24701504601609447|-0.07068256121476071 |
|2997.515625|172.9768829|407.7377014   |188.8294525         |-0.45959338526876725|0.16189018167709984|-0.0650172226056169  |
|2997.65625 |170.8899994|