Step1 資料觀察清洗

讀取資料並轉換為DataFrame 並肉眼觀察資料分布

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Load the raw feature table and take a first look at it.
df = pd.read_csv('fea_raw.csv', low_memory=False)
print(df.shape)

# Show every row/column when printing so no part of the summaries is elided.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

print(df.describe())
print(df.isnull().sum())
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() only adds a stray "None" line to the output.
df.info()

# Parse the timestamp strings and use them as the index.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y/%m/%d %H:%M')
df = df.set_index('Timestamp')

讀取Power目標值資料並觀察

In [None]:
# Load the power target file and plot its first 100 samples.
df1 = pd.read_csv('power_raw.csv')
first_power_samples = df1['Power(kW)'].iloc[:100]
plt.plot(first_power_samples)
plt.show()

正規化目標值取前100筆為例(貼結果才要跑)

In [None]:
# Scale the power target into [0, 1] and plot its first 100 samples.
df2 = pd.read_csv('power_raw.csv')
columns_to_normalize = ['Power(kW)']
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(df2[columns_to_normalize])
df2[columns_to_normalize] = scaled_values
plt.plot(df2['Power(kW)'].iloc[:100])
plt.show()

畫盒鬚圖並將誇張的值砍掉(以某欄為例貼結果才要跑)

In [None]:
from cmath import nan
import seaborn as sns

# 99999 is replaced by NaN before plotting (presumably a missing-value
# sentinel in the raw export -- confirm against the data source).
df.replace(to_replace=99999, value=nan, inplace=True)

# Horizontal box plot of one example column.
sns.set(style="whitegrid")
boxplot_column = df['Gearbox_T1_High_Speed_Shaft_Temperature']
ax = sns.boxplot(x=boxplot_column, orient="h", color="skyblue")

第一次補值結果(以某欄為例貼結果才要跑)

In [None]:
# IQR-based outlier removal and linear interpolation for ONE example column.
column = 'Gearbox_T1_High_Speed_Shaft_Temperature'

Q1 = df[column].quantile(0.25)
Q3 = df[column].quantile(0.75)
IQR = Q3 - Q1

# Values outside the 1.5*IQR whiskers are treated as outliers.
is_outlier = (df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR))

# Mask only this column. Calling DataFrame.mask with a Series condition
# would blank the entire row (every column) wherever this one column is an
# outlier. The result is assigned back explicitly because an in-place
# interpolate() on the temporary `df[column]` is not guaranteed to update
# `df` (it is a no-op under pandas Copy-on-Write).
df[column] = df[column].mask(is_outlier).interpolate(method='linear')

# Box plot of the cleaned column.
sns.set(style="whitegrid")
ax = sns.boxplot(x=df[column], orient="h", color="skyblue")

第一次異常值檢測與補值(盒鬚圖方法)

In [None]:
from cmath import nan

# First outlier pass over every column (box-plot / IQR rule).
# Optional visual check: plt.boxplot(df.values, labels=df.columns)

# Per-column quartiles and whisker limits.
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Blank out everything outside the whiskers ...
outlier_cells = (df < lower_fence) | (df > upper_fence)
df[outlier_cells] = np.nan

# ... and fill the gaps by linear interpolation.
df.interpolate(method='linear', inplace=True)

# Report which columns still contain NaN, then the new summary statistics.
print(df.isnull().any())
print(df.describe())

第二次異常值檢測與補值(盒鬚圖方法)

In [None]:
# Second outlier pass: the quartiles are recomputed on the already-cleaned
# data, so values that now fall outside the new whiskers are removed as well.
# Optional visual check: plt.boxplot(df.values, labels=df.columns)

Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1
whisker = 1.5 * IQR

# Replace outliers with NaN.
too_low = df < (Q1 - whisker)
too_high = df > (Q3 + whisker)
df[too_low | too_high] = np.nan

# Linear interpolation over the removed values.
df.interpolate(method='linear', inplace=True)

print(df.isnull().any())
print(df.describe())

In [None]:
# Save the cleaned feature table to a separate CSV file.
preprocessed_csv_path = 'fea_preprocessed.csv'
df.to_csv(preprocessed_csv_path)

盒鬚圖異常檢測與插值(別種寫法)

In [None]:
from cmath import nan

# Alternative implementation of the two-pass IQR cleaning:
# replace the 99999 placeholder, then twice (a) blank out values outside the
# 1.5*IQR whiskers and (b) fill the gaps by linear interpolation.
df.replace(99999, np.nan, inplace=True)

for _cleaning_pass in range(2):
    # Quartiles are recomputed on every pass, so the second pass works on
    # the distribution left over after the first one.
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    outliers = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))
    df.mask(outliers, other=np.nan, inplace=True)
    print(df.describe())

    # Interpolate the whole frame in one call. The previous per-column
    # `df[col].interpolate(inplace=True)` acted on a temporary Series, which
    # is not guaranteed to write back into `df` (no-op under Copy-on-Write),
    # and its loop variable shadowed the builtin `list`.
    df.interpolate(method='linear', inplace=True)

    print(df.isnull().any())
    print(df.describe())

df.to_csv('fea_preprocessed.csv')

將power欄位匯入到fea_preprocessed並另存為all_pre_new.csv

In [None]:

# Append the power target to the cleaned features and save the combined table.
df = pd.read_csv('fea_preprocessed.csv')
df1 = pd.read_csv('power_raw.csv')

# Column-wise concat; rows are aligned on the default integer index.
power_column = df1['Power(kW)']
df0 = pd.concat([df, power_column], axis=1)

# Keep only the first 136730 rows.
# NOTE(review): presumably the length of the shorter file -- confirm.
df0 = df0.drop(df0.index[136730:])

df0.to_csv('all_pre_new.csv', header=True, index=False)

觀察資料分布(可以不用跑)

In [None]:
import plotly.express as px
from scipy import stats

# Interactive line plot of a single column.
column_to_plot = ''  # fill in the name of the column to inspect
fig = px.line(df[column_to_plot])

fig.update_layout(xaxis_title="X", yaxis_title="count")

fig.show()

Step2 平穩性檢測轉換

匯入不同年份資料集和觀察資料型態

In [None]:
import matplotlib.pyplot as plt
import sys
import numpy as np
import pandas as pd
from matplotlib.pyplot import axis
from pandas import Series
from statsmodels.tsa.stattools import adfuller, kpss

# Print frames in full (no row/column truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Merged feature + power table (2019-2021 data); the timestamp column is not
# needed for the steps below.
all_df = pd.read_csv("all_pre_new.csv")
all_df = all_df.drop(columns=['Timestamp'])

all_df.info()
print(all_df.isnull().sum())

# Remove three columns before further analysis, then show the reduced schema.
columns_to_remove = ['Tower Accelaration Normal Raw', 'Tower Deflection', 'Turbine State']
all_df = all_df.drop(columns=columns_to_remove)
all_df.info()

ADF檢驗/KPSS檢驗

In [None]:
# KPSS stationarity test helper
def kpss_test(timeseries):
    """Run the KPSS test on ``timeseries`` and print a labelled summary.

    The test is run with ``regression='c'``. The printed Series holds the
    test statistic, the p-value, the number of lags used and one
    ``Critical Value (<level>)`` entry per critical value returned by
    ``kpss``. Nothing is returned.
    """
    print('Results of KPSS Test:')
    statistic, p_value, lags_used, critical_values = kpss(timeseries, regression='c')[:4]
    kpss_output = Series(
        [statistic, p_value, lags_used],
        index=['Test Statistic', 'p-value', 'Lags Used'],
    )
    for level, threshold in critical_values.items():
        kpss_output[f'Critical Value ({level})'] = threshold
    print(kpss_output)

# Augmented Dickey-Fuller stationarity test helper
def adf_test(timeseries):
    """Run the ADF test on ``timeseries`` and print a labelled summary.

    ``adfuller`` is called with ``autolag='AIC'``. The printed Series holds
    the test statistic, the p-value, the number of lags used, the number of
    observations used and one ``Critical Value (<level>)`` entry per critical
    value returned by ``adfuller``. Nothing is returned.
    """
    print('Results of Dickey-Fuller Test:')
    statistic, p_value, lags_used, n_obs, critical_values, *_ = adfuller(timeseries, autolag='AIC')
    summary_labels = ['Test Statistic', 'p-value',
                      '#Lags Used', 'Number of Observations Used']
    dfoutput = Series([statistic, p_value, lags_used, n_obs], index=summary_labels)
    for level, threshold in critical_values.items():
        dfoutput[f'Critical Value ({level})'] = threshold
    print(dfoutput)

# Check the power target for stationarity with both tests.
power_series = all_df['Power(kW)']
kpss_test(power_series)  # KPSS stationarity test
adf_test(power_series)   # ADF (Dickey-Fuller) stationarity test

#一階差分 (檢驗結果都為平穩就不用做)
# diff_power = series['Power(kW)']-series['Power(kW)'].shift(1)
# print(diff_power)
# series['Power(kW)'] = diff_power
# print("-------------")
# print(series['Power(kW)'])
# print("-------------")
# series.fillna(0, inplace = True)
# print(series['Power(kW)'])

Step3 特徵選擇

相互資訊(MI)

In [None]:
from sklearn.feature_selection import mutual_info_regression as MIR

# Mutual information between each candidate feature (columns 0-72) and the
# target in column 73.
# NOTE(review): column 73 is presumably 'Power(kW)', the last column of the
# merged table -- confirm.
feature_matrix = all_df.iloc[:, 0:73]
target_vector = all_df.iloc[:, 73]
mi_score = MIR(feature_matrix, target_vector)

# Positions of the features whose MI score exceeds each threshold.
(
    mi_score_selected_index1,
    mi_score_selected_index2,
    mi_score_selected_index3,
    mi_score_selected_index4,
    mi_score_selected_index5,
    mi_score_selected_index6,
) = (np.where(mi_score > threshold)[0] for threshold in (0.5, 0.6, 0.7, 0.8, 0.9, 1.0))

for selected_index in (
    mi_score_selected_index1,
    mi_score_selected_index2,
    mi_score_selected_index3,
    mi_score_selected_index4,
    mi_score_selected_index5,
    mi_score_selected_index6,
):
    print(selected_index)


特徵選擇的圖(跑結果才要放)

In [None]:
# Turn the score array into a one-column frame so rows can be picked by position.
mi_score = pd.DataFrame(mi_score)

# Hard-coded feature positions for each threshold.
# NOTE(review): presumably copied from the printed selection of an earlier
# run -- confirm they still match.
rows_over_05 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19,
                22, 25, 27, 30, 32, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72]
rows_over_06 = [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 18, 19, 22, 25, 27,
                61, 62, 63, 64, 65, 66, 67, 68, 71, 72]
rows_over_07 = [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 22, 25,
                61, 62, 63, 64, 65, 66, 67, 68, 72]
rows_over_08 = [0, 1, 2, 3, 4, 5, 6, 9, 12, 22, 61, 62, 63, 64, 65, 66, 67, 68]
rows_over_09 = [0, 1, 5, 12, 22, 61, 62, 63, 64, 65, 66, 67, 68]
rows_over_10 = [12, 61, 62, 63, 64, 65, 66, 67, 68]

values = mi_score.iloc[rows_over_05]
values1 = mi_score.iloc[rows_over_06]
values2 = mi_score.iloc[rows_over_07]
values3 = mi_score.iloc[rows_over_08]
values4 = mi_score.iloc[rows_over_09]
values5 = mi_score.iloc[rows_over_10]

# Print the six tables, separated by a dashed line.
print(values)
for score_table in (values1, values2, values3, values4, values5):
    print('------')
    print(score_table)

# Hard-coded (rounded) importance per feature position, used only for plotting.
# NOTE(review): presumably transcribed from an earlier MI run -- confirm.
importance_by_feature = {
    0: 0.971, 1: 0.996, 2: 0.867, 3: 0.843, 4: 0.855, 5: 0.967, 6: 0.859,
    7: 0.589, 8: 0.538, 9: 0.84, 10: 0.791, 11: 0.789, 12: 1.309, 13: 0.586,
    15: 0.579, 17: 0.571, 18: 0.608, 19: 0.608, 22: 0.963, 25: 0.7,
    27: 0.617, 30: 0.557, 32: 0.576, 61: 1.075, 62: 1.075, 63: 1.073,
    64: 1.067, 65: 1.068, 66: 1.155, 67: 1.073, 68: 1.064, 71: 0.6, 72: 0.735,
}

# One bar chart per threshold, from the largest feature set to the smallest.
feature_sets = [
    # 33 features, > 0.5
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19,
     22, 25, 27, 30, 32, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72],
    # 26 features, > 0.6
    [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 18, 19, 22, 25, 27,
     61, 62, 63, 64, 65, 66, 67, 68, 71, 72],
    # 22 features, > 0.7
    [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 22, 25,
     61, 62, 63, 64, 65, 66, 67, 68, 72],
    # 18 features, > 0.8
    [0, 1, 2, 3, 4, 5, 6, 9, 12, 22, 61, 62, 63, 64, 65, 66, 67, 68],
    # 13 features, > 0.9
    [0, 1, 5, 12, 22, 61, 62, 63, 64, 65, 66, 67, 68],
    # 9 features, > 1.0
    [12, 61, 62, 63, 64, 65, 66, 67, 68],
]

for feature_set in feature_sets:
    x = list(range(1, len(feature_set) + 1))                 # bar positions 1..n
    h = [importance_by_feature[idx] for idx in feature_set]  # bar heights
    label = [str(idx) for idx in feature_set]                # tick text = feature position
    plt.figure(figsize=(10, 5))
    plt.bar(x, h, tick_label=label, width=0.6)
    plt.title('Feature Importance')
    plt.xlabel('feature')
    plt.ylabel('importance')
    plt.show()

篩選出重要特徵並另存CSV檔 手動將時間欄位輸入

In [None]:
# Build one frame per MI threshold: the selected feature columns plus the
# target column (position 73), then save each frame to its own CSV file.
target_position = 73

fea_33 = all_df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19,
                         22, 25, 27, 30, 32, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72,
                         target_position]]
fea_26 = all_df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 18, 19, 22, 25, 27,
                         61, 62, 63, 64, 65, 66, 67, 68, 71, 72, target_position]]
fea_22 = all_df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 22, 25,
                         61, 62, 63, 64, 65, 66, 67, 68, 72, target_position]]
fea_18 = all_df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 9, 12, 22,
                         61, 62, 63, 64, 65, 66, 67, 68, target_position]]
fea_13 = all_df.iloc[:, [0, 1, 5, 12, 22, 61, 62, 63, 64, 65, 66, 67, 68, target_position]]
fea_9 = all_df.iloc[:, [12, 61, 62, 63, 64, 65, 66, 67, 68, target_position]]

feature_subsets = {
    'fea_33': fea_33,
    'fea_26': fea_26,
    'fea_22': fea_22,
    'fea_18': fea_18,
    'fea_13': fea_13,
    'fea_9': fea_9,
}

for subset in feature_subsets.values():
    subset.info()

# Previously the first five frames were all written to one and the same path
# (a location without a file name), so they overwrote each other or failed.
# Every subset now gets its own file, following the existing
# 'fea_9_notime.csv' naming, in the working directory.
# NOTE(review): adjust the location if the files must live elsewhere.
for subset_name, subset in feature_subsets.items():
    subset.to_csv(f'{subset_name}_notime.csv', header=True, index=False)