In [1]:
import gc
from datetime import datetime as dt
import seaborn as sns
import numpy as np
import pandas as pd
import re
import sys
import os
from matplotlib import pyplot as plt
plt.style.use('seaborn')
pd.set_option('max_columns',150)
pd.set_option('max_rows',500)
pd.options.display.float_format = '{:,.4f}'.format
import warnings
warnings.filterwarnings('ignore')

from sklearn import metrics
from sklearn import feature_selection
from sklearn import preprocessing
%matplotlib inline

In [2]:
time_start = dt.now()

### Functions

In [3]:
def ts_metrics(y_true, y_pred):
    return {
        'mae': metrics.mean_absolute_error(y_true, y_pred),
        'mse': metrics.mean_squared_error(y_true, y_pred),
        'mape': np.mean(np.abs((y_true - y_pred) / y_true)),
        'smape': np.mean(np.abs( 2*(y_true - y_pred) / (y_true+np.abs(y_pred)))),
    }

In [4]:
def prepare_sets():
    # Sets creation
    feat_target = 'Target'

    filt_fold  = df.fold == 0
    filt_null  = df[feat_target].isnull()
    filt_turb  = df.turbine_id == turb

    filt_train = ~filt_fold & ~filt_null & filt_turb
    filt_valid = filt_fold & ~filt_null & filt_turb
    filt_test  = filt_null & filt_turb

    x, y   = df[filt_train][feats_used], df[filt_train][feat_target]
    xv, yv = df[filt_valid][feats_used], df[filt_valid][feat_target]
    xt, yt = df[filt_test][feats_used],  df[filt_test][feat_target]
    
    feats_info = ['row_id','turbine_id','turbine_id','Target']
    df_preds_val1  = df[filt_valid][feats_info]
    df_preds_test1 = df[filt_test][feats_info]
    
    return x,y,xv,yv,xt,yt, df_preds_val1,df_preds_test1

### 1. Prepare dataset

In [5]:
# read train dataset
df1 = pd.read_csv('data/train.csv')
df1['row_id'] = range(len(df1))

# read test dataset
df2 = pd.read_csv('data/new/test.csv')
df2['row_id'] = range(len(df2))
df2['row_id'] = df2['row_id']  + 1000000

# merge
df = pd.concat([df1,df2])
del df1,df2
gc.collect()

# add fold for splitting
np.random.seed(1234)
df['fold'] = np.random.randint(0,3,len(df))
# drop some features
feats_drop = ['timestamp','active_power_calculated_by_converter','reactice_power_calculated_by_converter']
for f in feats_drop:
    if f in df.columns:
        del df[f]
# label encoder of categorical feats
feats_cat = ['turbine_id']
list_lbl  = []
for f in feats_cat:
    lbl = preprocessing.LabelEncoder()
    df[f] = lbl.fit_transform(df[f])
    list_lbl.append(lbl)

In [6]:
for i,l in enumerate(lbl.classes_):
    print(i, l)

0 Turbine_01
1 Turbine_10
2 Turbine_103
3 Turbine_105
4 Turbine_108
5 Turbine_120
6 Turbine_123
7 Turbine_13
8 Turbine_139
9 Turbine_14
10 Turbine_15
11 Turbine_158
12 Turbine_18
13 Turbine_19
14 Turbine_20
15 Turbine_97


In [7]:
# tables with results
df_preds_val = pd.DataFrame()
df_preds_test = pd.DataFrame()

### 2. MLP training (Turbine_20)

In [8]:
feats_used = [
    "active_power_raw",
    "ambient_temperature",
    "generator_speed",
    "generator_winding_temp_max",
    "grid_power10min_average",
    "nc1_inside_temp",
    "nacelle_temp",
    "reactive_power",
    "wind_direction_raw",
    "wind_speed_raw",
    "wind_speed_turbulence",   
]

In [9]:
# Sets creation
from keras import layers
from keras import models
from keras import Input
from tensorflow.keras import applications

from keras import optimizers
from keras import losses
from keras import metrics
from keras import callbacks

turb = 14
x,y,xv,yv,xt,yt,df_preds_val1,df_preds_test1 = prepare_sets()

scaler_name = 'z_score'
scaler = preprocessing.StandardScaler()
scaler.fit(x)
x1 = scaler.transform(x)
xv1 = scaler.transform(xv)
xt1 = scaler.transform(xt)

print(x.shape, xv.shape, xt.shape)
x[:3]

(37532, 11) (18991, 11) (18933, 11)


Unnamed: 0,active_power_raw,ambient_temperature,generator_speed,generator_winding_temp_max,grid_power10min_average,nc1_inside_temp,nacelle_temp,reactive_power,wind_direction_raw,wind_speed_raw,wind_speed_turbulence
21,840.6279,31.7113,1164.109,72.2803,712.743,31.5078,34.1534,168.5228,29.5899,7.9141,1.0158
53,1295.1928,34.1922,1200.9268,73.8948,1358.1369,33.9271,31.6354,259.5941,228.0924,8.2985,0.7002
82,829.6309,33.8101,1158.2494,64.4816,701.0086,33.5683,31.6274,165.1537,59.0599,7.0434,0.5886


In [10]:
# 2) define model
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_dim=11))
model.add(layers.Dense(64, activation='relu'))
# model.add(layers.Dropout(0.5))
model.add(layers.Dense(32, activation='relu'))
# model.add(layers.Dropout(0.5))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(4, activation='relu'))
model.add(layers.Dense(1))

# 3) define callbacks
callbacks_list = [ 
    callbacks.EarlyStopping(monitor='val_mape', min_delta=1e-15, patience=500, restore_best_weights=True),
    callbacks.ModelCheckpoint(filepath=f'models/nn{turb}_v1.h5', monitor='val_mape', mode='max', save_best_only=True),
    # callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10),
    # callbacks.TensorBoard(log_dir=f'logs/{turb}')#, histogram_freq=1, embeddings_freq=1)
]

# 4) compile model
model.compile(
    optimizer = optimizers.rmsprop_v2.RMSprop(lr=0.001),
    loss      = losses.mse,
    metrics   = ['mae','mape']
)

# 5) train model
history = model.fit(
    x1,
    y,
    epochs = 5000,
    validation_data=(xv1, yv),
    batch_size = 1024,
    verbose=0,
    # validation_split = 0.2,
    callbacks = callbacks_list    
)

In [11]:
df_preds_val1['pred']  = model.predict(xv1)
df_preds_val = pd.concat([df_preds_val, df_preds_val1])

df_preds_test1['pred'] = model.predict(xt1)
df_preds_test = pd.concat([df_preds_test, df_preds_test1])

In [12]:
ts_metrics(df_preds_val1.Target, df_preds_val1.pred)

{'mae': <tf.Tensor: shape=(), dtype=float32, numpy=1.1893232>,
 'mse': <tf.Tensor: shape=(), dtype=float32, numpy=3.4482362>,
 'mape': 0.023732303898796943,
 'smape': 0.023750569437843535}

### 3.KNN Training

In [13]:
# what is the best features for every turbine (explored separately)
dict_meta1 = {
 0: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nacelle_temp',
  'reactive_power',
  'wind_direction_raw',
  'wind_speed_turbulence'],
 1: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nacelle_temp',
  'wind_direction_raw'],
 2: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nacelle_temp',
  'wind_direction_raw'],
 3: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw'],
 4: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw',
  'wind_speed_turbulence'],
 5: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw',
  'wind_speed_turbulence'],
 6: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nacelle_temp',
  'wind_direction_raw',
  'wind_speed_turbulence'],
 7: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw'],
 8: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw',
  'wind_speed_turbulence'],
 9: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw'],
 10: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nacelle_temp',
  'wind_direction_raw'],
 11: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'reactive_power',
  'wind_direction_raw',
  'wind_speed_turbulence'],
 12: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nacelle_temp',
  'wind_direction_raw'],
 13: ['ambient_temperature',
  'generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw'],
 14: ['generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw',
  'wind_speed_turbulence'],
 15: ['generator_winding_temp_max',
  'grid_power10min_average',
  'nc1_inside_temp',
  'nacelle_temp',
  'wind_direction_raw']
}

dict_meta2 = {
 0: ['min_max', 'square', 2, 1],
 1: ['z_score', 'linear', 1, 1],
 2: ['z_score', 'linear', 1, 1],
 3: ['robust', 'square', 1, 1],
 4: ['min_max', 'square', 3, 1],
 5: ['min_max', 'square', 3, 1],
 6: ['min_max', 'square', 3, 1],
 7: ['robust', 'square', 1, 1],
 8: ['min_max', 'square', 2, 1],
 9: ['z_score', 'square', 1, 1],
 10: ['z_score', 'square', 2, 1],
 11: ['min_max', 'square', 3, 1],
 12: ['z_score', 'linear', 1, 1],
 13: ['z_score', 'square', 1, 1],
 14: ['min_max', 'square', 2, 1],
 15: ['z_score', 'square', 2, 1]
}

In [14]:
from sklearn import neighbors 

def knn_train():    
    # preprocess
    dict_scalers = {
        'z_score': preprocessing.StandardScaler(),
        'min_max': preprocessing.MinMaxScaler(),
        'quantile': preprocessing.QuantileTransformer(),
        'robust': preprocessing.RobustScaler(),
    }
    scaler = dict_scalers[N]
    scaler.fit(x)
    x1 = scaler.transform(x)
    xv1 = scaler.transform(xv)
    xt1 = scaler.transform(xt)
    
    # fitting
    knn = neighbors.NearestNeighbors(n_neighbors=K, n_jobs=-1, p=P)
    knn.fit(x1, y)
    
    # 1) for valid
    # distances and neighbors
    dist,res = knn.kneighbors(xv1, return_distance=True)
    df_res = pd.DataFrame([y.iloc[r].values for r in res])
    df_dist = pd.DataFrame(dist)
    # distance matrix
    if M == 'linear':
        df_mult = 1/df_dist
    elif M == 'square':
        df_mult = (1/df_dist)**2
    else:
        df_mult = 1
    # predictions
    if M == 'no':
        pred = df_res.mean(axis = 1)
    else:
        df_res1 = df_res * df_mult
        pred = df_res1.mean(axis = 1) / df_mult.mean(axis = 1)
    pred = pred.fillna(0).values
    pred_valid = pred
    
    # 2) for test
    # distances and neighbors
    dist,res = knn.kneighbors(xt1, return_distance=True)
    df_res = pd.DataFrame([y.iloc[r].values for r in res])
    df_dist = pd.DataFrame(dist)
    # distance matrix
    if M == 'linear':
        df_mult = 1/df_dist
    elif M == 'square':
        df_mult = (1/df_dist)**2
    else:
        df_mult = 1
    # predictions
    if M == 'no':
        pred = df_res.mean(axis = 1)
    else:
        df_res1 = df_res * df_mult
        pred = df_res1.mean(axis = 1) / df_mult.mean(axis = 1)
    pred = pred.fillna(0).values
    pred_test = pred
    
    return pred_valid, pred_test


In [15]:
for turb in range(16):
    if turb == 14:
        continue
    feats_used = dict_meta1[turb]
    N,M,K,P = dict_meta2[turb]
    
    x,y,xv,yv,xt,yt,df_preds_val1,df_preds_test1 = prepare_sets()
    # final version - merge valid set to train set 
    # (we don't need validation set to increase accuracy) 
    x = pd.concat([x,xv])
    y = pd.concat([y,yv])

    pred_valid, pred_test = knn_train()
    
    df_preds_val1['pred']  = pred_valid
    df_preds_val = pd.concat([df_preds_val, df_preds_val1])

    df_preds_test1['pred'] = pred_test
    df_preds_test = pd.concat([df_preds_test, df_preds_test1])

### 4. Submission

In [17]:
# submission
df_preds_test.sort_values('row_id', inplace = True)
df_preds_test['Target'] = df_preds_test['pred']
df_preds_test[['Target']].to_csv('output/main31.csv', index = False, sep = ',')

In [18]:
print('Working time: ', dt.now() - time_start) 

Working time:  0:04:37.976551
