# Библиотеки

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
!pip install catboost

In [None]:
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso, Ridge, HuberRegressor, ElasticNet, LinearRegression, ARDRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

# Функции для обработки

In [None]:
# Outlier inspection is done with box plots and pair plots.
def get_boxplot(X, columns=None):
  """Draw one seaborn box plot per column, showing each plot separately.

  Args:
    X: DataFrame holding the data to plot.
    columns: Iterable of column names to plot; every column of ``X`` when None.
  """
  # The file-level ``import matplotlib as plt`` binds the top-level package,
  # which has no ``show``; bind the pyplot interface locally instead.
  import matplotlib.pyplot as plt

  if columns is None:
    columns = X.columns

  for name in columns:
    sns.boxplot(x=X[name])
    plt.show()

def get_pairplot(X, columns=None):
  """Draw a seaborn pair plot for the selected columns of ``X``.

  Args:
    X: DataFrame holding the data to plot.
    columns: Column names to include; every column of ``X`` when None.
  """
  selected = X.columns if columns is None else columns
  sns.pairplot(X[selected])

def get_hist(X, columns=None, bins='auto'):
  """Draw one seaborn histogram per column, with a grid, shown separately.

  Args:
    X: DataFrame holding the data to plot.
    columns: Iterable of column names to plot; every column of ``X`` when None.
    bins: Forwarded unchanged to ``sns.histplot`` as its ``bins`` argument.
  """
  # The file-level ``import matplotlib as plt`` binds the top-level package,
  # which has neither ``grid`` nor ``show``; bind pyplot locally instead.
  import matplotlib.pyplot as plt

  if columns is None:
    columns = X.columns

  for name in columns:
    sns.histplot(x=X[name], bins=bins)
    plt.grid()
    plt.show()

def get_heatmap(X, columns=None):
  """Draw an annotated heat map of the correlation matrix of ``X``.

  Args:
    X: DataFrame holding the data.
    columns: Column names to restrict the correlation to; all columns when None.
  """
  frame = X if columns is None else X[columns]
  correlations = frame.corr()
  sns.heatmap(correlations, cmap='coolwarm', annot=True)

Функция выводит сводную статистику по каждому столбцу: количество пропусков, количество уникальных значений и тип данных.

Затем для каждого столбца, в котором число уникальных значений не превышает лимита (по умолчанию 50), выводится список уникальных значений с их частотами.

In [None]:
# Per feature: number of missing values, number of unique values and dtype.
def get_stats(df, unic='all', limit=50):
  """Print a per-column summary table followed by value counts.

  The table lists, for every column, the number of null entries, the number
  of distinct values reported by ``value_counts()`` and the dtype. After the
  table, the value counts are printed for each column whose number of
  distinct values is below ``limit + 1``.

  Args:
    df: DataFrame to describe.
    unic: When equal to 'object', value counts are printed only for columns
      whose dtype compares equal to 'object'; any other value prints them for
      every qualifying column.
    limit: Largest number of distinct values for which counts are printed.
  """
  rule = '-' * 57
  header = '{0:<20} {1:>10} | {2:>10} | {3}\n'.format(
      'Название колонки', 'Пустых', 'Уникальных', 'Тип данных')
  print(header + rule)

  # Summary table: one row per column.
  for col in df.columns:
    series = df[col]
    missing = series.isnull().sum()
    n_unique = len(series.value_counts())
    print(f'{col:<20} {missing:>10} | {n_unique:>10} | {series.dtype}')

  # Detailed value counts for low-cardinality columns.
  objects_only = unic == 'object'
  for col in df.columns:
    counts = df[col].value_counts()
    if len(counts) >= limit + 1:
      continue
    if objects_only and not (df[col].dtype == 'object'):
      continue
    print(f'\n{col}\n{rule}')
    print(f'{counts}\n{rule}')

# Добыча данных

In [None]:
# Folder that holds the CSV files read below.
# presumably a Google Drive mount inside Colab — TODO confirm
path = '/content/drive/MyDrive/Авиахакатон/'
# Features and targets of the train and test splits.
X_train_orig = pd.read_csv(path + "X_train.csv")
y_train_orig = pd.read_csv(path + "y_train.csv")
X_test_orig = pd.read_csv(path + "X_test.csv")
y_test_orig = pd.read_csv(path + "y_test.csv")

# Validation split: only a feature file is loaded for it.
X_valid_orig = pd.read_csv(path + "X_valid.csv")

# Обработка данных

## Описательные статистики

In [None]:
get_stats(y_train_orig, limit=0)

Название колонки         Пустых | Уникальных | Тип данных
---------------------------------------------------------
flight_datetime               0 |      35772 | object
flight_phase                  0 |          2 | object
engine_id                     0 |        116 | object
BRAT                         73 |       1294 | float64
DEGT                      19515 |      27663 | float64
DELFN                     37914 |       9194 | float64
DELN1                     37914 |       9153 | float64
DELVSV                    35910 |         93 | float64
DPOIL                     28544 |       2198 | float64
EGTC                      19515 |      24780 | float64
EGTHDM                    10833 |      36793 | float64
EGTHDM_D                  28801 |      18883 | float64
GEGTMC                    28591 |      18897 | float64
GN2MC                     28591 |      18376 | float64
GPCN25                    19515 |      27220 | float64
GWFM                      19515 |      28086 | float64
PCN12  

In [None]:
get_stats(X_train_orig, limit=0)

Название колонки         Пустых | Уникальных | Тип данных
---------------------------------------------------------
engine_id                     0 |        116 | object
aircraft_id                   0 |         58 | object
flight_datetime               0 |      35772 | object
flight_phase                  0 |          2 | object
engine_position               0 |          2 | int64
n1_modifier                   0 |          8 | float64
number_blades                 0 |          3 | float64
engine_family                 0 |          3 | object
engine_type                   0 |          5 | object
manufacturer                  0 |          2 | object
ZHPTAC                    35837 |         49 | float64
ZLPTAC                    35837 |         63 | float64
ZPCN12                        0 |       2263 | float64
ZPCN25                        0 |       1622 | float64
ZPHSF                     34458 |        361 | float64
ZPHSR                     34458 |        361 | float64
ZPN12R       

## Baseline - обработка

In [None]:
X_train_orig.shape

(47794, 53)

Список столбцов с пропусками X_train 

In [None]:
# Feature columns that prepare_X_data fills via get_value (column mean by
# default). Per the notebook text these are the X_train columns with gaps.
null_columns = ['ZHPTAC', 'ZLPTAC', 'ZPHSF', 'ZPHSR', 'ZPN12R', 'ZPOIL', 'ZPS3', 'ZT1AB', 'ZT3', 
       'ZTAMB', 'ZTLA', 'ZTNAC', 'ZTOIL', 'ZVB1F', 'ZVB1R', 'ZVB2F', 'ZVB2R',
       'ZVSV', 'ZWF36', 'IHPSOV', 'AGW', 'CAS', 'IAI', 'IVS12', 'SAT',
       'ZVIAS', 'ZWBP1', 'ZWBP1_8E', 'ZWBP2', 'ZWBP2_8E',
       'IBP', 'IAIE']

In [None]:
def get_value(X, column, mode='mean', value=0):
  """Fill the missing entries of one column in place.

  Args:
    X: DataFrame to modify; it is mutated in place.
    column: Name of the column whose NaN entries are filled.
    mode: 'value' fills with ``value``; any other mode fills with the mean of
      the column.
    value: Constant used when ``mode == 'value'``.

  Returns:
    The same DataFrame object ``X``.
  """
  # A boolean mask addresses exactly the missing rows. Selecting them by index
  # label would also overwrite valid rows that share a duplicated label.
  missing = X[column].isna()
  if missing.any():
    fill = value if mode == 'value' else X[column].mean()
    X.loc[missing, column] = fill
  return X

def prepare_X_data(X: pd.DataFrame) -> pd.DataFrame:
  """Build the model feature matrix from a raw feature table.

  Steps, in order: drop the identifier columns, fill every column listed in
  the module-level ``null_columns`` through ``get_value`` (column mean),
  append the derived features EGTK, FFK, N2K and N1K, replace the remaining
  NaN with 0 and one-hot encode the categorical columns.

  Args:
    X: Raw feature table; a copy is processed, ``X`` itself is not modified.

  Returns:
    A new DataFrame with the engineered and dummy-encoded features.

  NOTE(review): fill values and dummy columns are derived from the frame that
  is passed in, so separately prepared splits can differ in both — confirm
  this is intended.
  """
  id_columns = ['engine_id', 'aircraft_id', 'flight_datetime']
  dummy_columns = ['n1_modifier', 'flight_phase', 'engine_family',
                   'engine_type', 'manufacturer', 'aircraft_family',
                   'aircraft_type', 'aircraft_grp', 'ac_manufacturer']

  # Identifier fields carry no signal for the regressors.
  features = X.copy().drop(columns=id_columns)

  # Mean-impute the columns known to contain gaps.
  for name in null_columns:
    get_value(features, name)

  # Feature engineering: temperature and pressure ratios.
  # presumably 288.15 K / 1013.25 hPa are ISA sea-level references and 273.15
  # is the Celsius-to-Kelvin offset — TODO confirm.
  # NOTE(review): ZT1A is divided by 288.15 without the 273.15 offset that is
  # applied to ZT49 — confirm the units of ZT1A.
  theta = features['ZT1A'] / 288.15
  pressure = features['ZALT'] / 1013.25
  # Exact zeros are replaced by fixed stand-ins before they are used as divisors.
  theta = theta.mask(theta == 0, 0.001)
  pressure = pressure.mask(pressure == 0, 10)
  root_theta = np.sqrt(theta)

  features['EGTK'] = (features['ZT49'] + 273.15) / theta
  features['FFK'] = features['ZWF36'] / (root_theta * pressure)
  features['N2K'] = features['ZPCN25'] / root_theta
  features['N1K'] = features['ZPCN12'] / root_theta
  features = features.fillna(0)

  # The source columns of the derived features (ZT1A, ZALT, ZT49, ZWF36,
  # ZPCN25, ZPCN12) are kept, and non-numeric columns are encoded rather than
  # dropped; both alternatives were left disabled in the original cell.

  # Categorical features become dummy (one-hot) columns.
  return pd.get_dummies(features, columns=dummy_columns)

def prepare_y_data(y: pd.DataFrame) -> pd.DataFrame:
  """Return the target table without its object-dtype columns.

  Args:
    y: Raw target table; it is not modified.

  Returns:
    A new DataFrame holding the remaining columns. Missing values are left
    as they are (zero-filling is not applied here).
  """
  object_columns = y.columns[y.dtypes == object].tolist()
  return y.drop(columns=object_columns)

In [None]:
# Prepare every split with the same pipeline. Each call works only on the
# frame it receives, so fill values and dummy columns are derived per split.
X_train = prepare_X_data(X_train_orig)
y_train = prepare_y_data(y_train_orig)

X_test = prepare_X_data(X_test_orig)
y_test = prepare_y_data(y_test_orig)

# The validation split has features only.
X_valid = prepare_X_data(X_valid_orig)

  result = getattr(ufunc, method)(*inputs, **kwargs)


# Baseline-**моделирование**

Класс

In [None]:
class ModelsTraining:
  """Fit a family of regressors per target and keep the best one per target.

  Typical call order: ``fit`` -> ``predict_models`` -> ``get_metric`` ->
  ``predict``. Features are standardised with a ``StandardScaler`` fitted in
  ``fit`` and reused for every later prediction.

  NOTE(review): the estimator objects in ``models`` are fitted in place and
  shared by all targets, so with more than one target each estimator ends up
  fitted on the last target only — confirm before using several targets.
  """

  def __init__(self, targets: list, models: dict):
    """Store the configuration.

    Args:
      targets: Names of the target columns to model.
      models: Mapping of model name to estimator. Names containing 'neuro'
        are fitted with Keras-style training arguments.
    """
    self.sc = StandardScaler()
    self.targets = targets
    self.models = models
    self.predict_data = {}
    self.errors, self.metric, self.best_model = {}, {}, {}
    # True when the active metric is a score to maximise (r2), False for
    # error metrics to minimise (mse, mae).
    self._higher_is_better = False

  def fit(self, X, y):
    """Fit the scaler on ``X`` and every model on every target.

    Args:
      X: Feature table.
      y: Target table indexable by the names in ``targets``.

    The value returned by each ``fit`` call is stored in
    ``self.fit_data[target][model_name]``.
    """
    # Scale the features.
    X_sc = self.sc.fit_transform(X)
    fit_data = {}
    for target in self.targets:
      fitted = {}
      for name_model, estimator in self.models.items():
        if 'neuro' in name_model:
          # Neural models use the module-level ``my_callbacks`` and hold out
          # 20% of the rows for validation.
          fitted[name_model] = estimator.fit(
              X_sc, y[target], validation_batch_size=50,
              validation_split=0.2,
              epochs=50, verbose=0, callbacks=my_callbacks)
        else:
          fitted[name_model] = estimator.fit(X_sc, y[target])
      fit_data[target] = fitted
    self.fit_data = fit_data

  def __get_best_metric__(self, target):
    """Return the name of the best model for ``target``.

    The best model has the lowest value for error metrics and the highest
    value for r2.
    """
    scores = self.errors[target]
    pick = max if self._higher_is_better else min
    return pick(scores, key=scores.get)

  def get_metric(self, y_test, error: str = 'mse') -> pd.DataFrame:
    """Score the predictions stored by ``predict_models`` and pick the best.

    Args:
      y_test: True target values, indexable by the names in ``targets``.
      error: 'mse' or 'mae'; any other value selects r2.

    Returns:
      A DataFrame with one column per target and the rows ``best_model`` and
      ``best_metric``.
    """
    if error == 'mse':
      err = mean_squared_error
    elif error == 'mae':
      err = mean_absolute_error
    else:
      err = r2_score
    # r2 is a score: larger is better, unlike the two error metrics.
    self._higher_is_better = err is r2_score

    self.errors, self.metric, self.best_model = {}, {}, {}
    for target in self.targets:
      self.errors[target] = {
          name_model: err(y_test[target], value)
          for name_model, value in self.predict_data[target].items()
      }
      best_model_name = self.__get_best_metric__(target)
      self.metric[target] = {'best_model': best_model_name,
                             'best_metric': self.errors[target][best_model_name]}
      self.best_model[target] = self.models[best_model_name]
    return pd.DataFrame(self.metric)

  def predict_models(self, X_test):
    """Predict with every model for every target on scaled ``X_test``.

    Returns:
      ``{target: {model_name: predictions}}``; also stored in
      ``self.predict_data``.
    """
    X_sc = self.sc.transform(X_test)
    self.predict_data = {
        target: {
            name_model: estimator.predict(X_sc)
            for name_model, estimator in self.models.items()
        }
        for target in self.targets
    }
    return self.predict_data

  def predict(self, X_test) -> pd.DataFrame:
    """Predict every target with its best model.

    Args:
      X_test: Feature table with the columns the scaler was fitted on.

    Returns:
      A DataFrame with one column of flattened predictions per target.

    Raises:
      RuntimeError: If ``get_metric`` has not selected a best model for
        every target yet.
    """
    missing = [target for target in self.targets
               if target not in self.best_model]
    if missing:
      raise RuntimeError(
          f'call get_metric() before predict(); no best model for: {missing}')

    X_sc = self.sc.transform(X_test)
    predict = pd.DataFrame([])
    for target in self.targets:
      predict[target] = self.best_model[target].predict(X_sc).flatten()
    return predict

In [None]:
def lr_exp_decay(epoch, lr):
  """Return the learning rate to use for ``epoch``.

  The first two epochs (0 and 1) use a fixed rate of 0.001. From then on the
  incoming rate is halved on every epoch divisible by 10 and passed through
  unchanged otherwise.

  Args:
    epoch: Zero-based epoch index.
    lr: Learning rate currently in effect.
  """
  if epoch < 2:
    return 0.001
  return lr / 2 if epoch % 10 == 0 else lr

# Adam optimizer created with 0.001 as its first argument and decay=1e-6.
optimizer = Adam(0.001, decay=1e-6)

# Callbacks that ModelsTraining.fit passes to models whose name contains 'neuro'.
my_callbacks = [
    # Stop once val_loss has not improved for 5 epochs.
    EarlyStopping(monitor='val_loss', patience=5),
    #ModelCheckpoint(monitor='val_loss', save_best_only=True, save_weights_only=True),
    # Drive the learning rate with lr_exp_decay.
    LearningRateScheduler(lr_exp_decay, verbose=0)
]

Модели нейросетей

In [None]:
def _build_mlp(hidden_units):
  """Return a compiled fully connected regressor.

  Every hidden layer is a ReLU ``Dense`` layer with the 'normal' kernel
  initializer; the first one receives ``input_dim=X_train.shape[1]``. The
  output is a single linear unit. The network is compiled with mean squared
  error and the module-level ``optimizer``.

  Args:
    hidden_units: Widths of the hidden layers, in order.
  """
  net = Sequential()
  for position, units in enumerate(hidden_units):
    # Only the first layer declares the input width.
    first_layer_args = {'input_dim': X_train.shape[1]} if position == 0 else {}
    net.add(Dense(units, kernel_initializer='normal', activation='relu',
                  **first_layer_args))
  net.add(Dense(1, kernel_initializer='normal'))
  net.compile(loss='mean_squared_error', optimizer=optimizer)
  return net


# Four architectures that differ only in the hidden layer widths.
model = _build_mlp((50, 20))
model2 = _build_mlp((20, 10))
model3 = _build_mlp((100, 50))
model4 = _build_mlp((60, 30, 10))

In [None]:
# Targets to predict: every column left in y_train.
targets = y_train.columns

# Model registry: name -> estimator. Commented-out entries are disabled;
# names containing 'neuro' are treated as Keras models by ModelsTraining.fit.
models = {
    #'knn': KNeighborsRegressor(),
    #'lasso': Lasso(random_state=66),
    #'ridge': Ridge(random_state=66),
    #'huber': HuberRegressor(),
    #'elastic': ElasticNet(random_state=66),
    #'linear': LinearRegression(),
    #'ARD': ARDRegression(),
    #'tree': DecisionTreeRegressor(random_state=66),
    #'random_tree': RandomForestRegressor(verbose=0, random_state=66),
    #'catboost': CatBoostRegressor(iterations=50, verbose=0, random_state=66),
    #'XGB': XGBRegressor(verbose=0, random_state=66),
    #'neuro1': model,
    #'neuro2': model2,
    #'neuro3': model3,
    'neuro4': model4  
}

Цикл обучения семейства моделей на каждом из таргетов и предикт на лучшей модели

In [None]:
# Accumulators: predictions for the validation split and the best metric,
# one column per target.
predict, er = pd.DataFrame([]), pd.DataFrame([])

# NOTE(review): `models` holds the same estimator objects for every target, so
# each iteration calls fit on objects already fitted for the previous target —
# confirm this is intended for the Keras models.
for target in tqdm(targets):
  # Train and evaluate only on rows where this target is present.
  y_train_ = y_train[y_train[target].notna()]
  X_train_ = X_train[y_train[target].notna()]

  y_test_ = y_test[y_test[target].notna()]
  X_test_ = X_test[y_test[target].notna()]

  # Fit the family of models for this single target.
  mt = ModelsTraining(targets=[target], models=models)
  mt.fit(X_train_, y_train_)

  # Predict with every model; get_metric reads these predictions from mt.
  predicts_models = mt.predict_models(X_test_)
  # Best model and its metric value.
  er[target] = mt.get_metric(y_test_, error='mse')

  # Predict the validation split with the best model for this target.
  predict[target] = mt.predict(X_valid)

  0%|          | 0/30 [00:00<?, ?it/s]



  3%|▎         | 1/30 [00:22<11:06, 22.99s/it]



  7%|▋         | 2/30 [00:59<14:30, 31.09s/it]



 10%|█         | 3/30 [01:25<12:55, 28.73s/it]



 13%|█▎        | 4/30 [01:51<11:57, 27.59s/it]



 17%|█▋        | 5/30 [02:30<13:10, 31.61s/it]



 20%|██        | 6/30 [03:14<14:20, 35.84s/it]



 23%|██▎       | 7/30 [04:39<19:55, 51.97s/it]



 27%|██▋       | 8/30 [07:05<30:01, 81.89s/it]



 30%|███       | 9/30 [08:30<29:00, 82.87s/it]



 33%|███▎      | 10/30 [09:54<27:43, 83.18s/it]



 37%|███▋      | 11/30 [11:19<26:32, 83.82s/it]



 40%|████      | 12/30 [13:44<30:45, 102.52s/it]



 43%|████▎     | 13/30 [16:10<32:42, 115.46s/it]



 47%|████▋     | 14/30 [18:26<32:28, 121.76s/it]



 50%|█████     | 15/30 [19:51<27:42, 110.80s/it]

In [None]:
er

Unnamed: 0,BRAT,DEGT
best_metric,0.155212,2.707408
best_model,neuro4,neuro3


In [None]:
predict

Unnamed: 0,BRAT,DEGT
0,1.166223,739.899170
1,1.112320,357.348724
2,1.026718,-27.389238
3,1.225899,10.943503
4,0.955453,-34.328930
...,...,...
28671,0.972078,-36.805759
28672,1.142450,148.364182
28673,1.288878,297.488159
28674,0.942643,231.179260
