# Библиотеки

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Функции для обработки

In [None]:
# Outlier inspection helpers: box plots and pair plots.
def get_boxplot(X, columns=None):
  """Draw one seaborn box plot per column, each shown as its own figure.

  Args:
    X: DataFrame holding the data to plot.
    columns: Iterable of column names to plot; defaults to every column of X.
  """
  # The notebook binds the name `plt` to the top-level `matplotlib` package,
  # which has no `show()`; import pyplot locally so the call below works.
  import matplotlib.pyplot as plt

  if columns is None:
    columns = X.columns

  for column in columns:
    sns.boxplot(x=X[column])
    plt.show()

def get_pairplot(X, columns=None):
  """Render a seaborn pair plot for the chosen columns of X.

  Args:
    X: DataFrame holding the data to plot.
    columns: Column names to include; when None, all columns of X are used.
  """
  selected = X.columns if columns is None else columns
  sns.pairplot(X[selected])

def get_hist(X, columns=None, bins='auto'):
  """Draw one seaborn histogram per column, each shown as its own figure with a grid.

  Args:
    X: DataFrame holding the data to plot.
    columns: Iterable of column names to plot; defaults to every column of X.
    bins: Passed through to `sns.histplot` as its `bins` argument.
  """
  # The notebook binds the name `plt` to the top-level `matplotlib` package,
  # which has neither `grid()` nor `show()`; import pyplot locally instead.
  import matplotlib.pyplot as plt

  if columns is None:
    columns = X.columns

  for column in columns:
    sns.histplot(x=X[column], bins=bins)
    plt.grid()
    plt.show()

def get_heatmap(X, columns=None):
  """Plot an annotated heat map of the pairwise correlations between columns.

  Args:
    X: DataFrame holding the data.
    columns: Optional list of column names to restrict the plot to;
      when None, all columns of X are considered.
  """
  if columns is not None:
    X = X[columns]

  # Correlate numeric/boolean columns only: pandas >= 2.0 raises on
  # non-numeric columns instead of silently skipping them.
  numeric = X.select_dtypes(include=['number', 'bool'])
  sns.heatmap(numeric.corr(), cmap='coolwarm', annot=True)

Функция выводит краткую статистику по каждому столбцу: количество пропусков, количество уникальных значений и тип данных.

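Затем для каждого столбца, в котором число уникальных значений не превышает лимита (по умолчанию 50), выводится список уникальных значений с их частотами. При `unic='object'` такие списки выводятся только для столбцов типа `object`.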

In [None]:
# Per-column summary: missing count, number of distinct values and dtype.
def get_stats(df, unic='all', limit=50):
  """Print a summary table of df, then value frequencies of low-cardinality columns.

  Args:
    df: DataFrame to describe.
    unic: 'object' limits the frequency listings to object-dtype columns;
      any other value lists every qualifying column.
    limit: A column qualifies for a frequency listing when its number of
      distinct (non-null) values is below `limit + 1`.
  """
  rule = '-' * 57

  # Summary table: one row per column.
  print('{0:<20} {1:>10} | {2:>10} | {3}\n'.format('Название колонки', 'Пустых', 'Уникальных', 'Тип данных') + rule)
  for name in df.columns:
    series = df[name]
    missing = series.isnull().sum()
    distinct = len(series.value_counts())
    print(f'{name:<20} {missing:>10} | {distinct:>10} | {series.dtype}')

  # Frequency listings for the columns that pass the cardinality filter.
  objects_only = unic == 'object'
  for name in df.columns:
    series = df[name]
    counts = series.value_counts()
    if len(counts) >= limit + 1:
      continue
    is_object = series.dtype == 'object'
    if objects_only and not is_object:
      continue
    print(f'\n{name}\n{rule}')
    print(f'{counts}\n{rule}')

# Добыча данных

In [None]:
# Folder with the competition CSV files (the path looks like a Colab Google Drive mount).
path = '/content/drive/MyDrive/Авиахакатон/'

# Raw, unprocessed feature and target tables for the train and test splits.
X_train_orig = pd.read_csv(f"{path}X_train.csv")
y_train_orig = pd.read_csv(f"{path}y_train.csv")
X_test_orig = pd.read_csv(f"{path}X_test.csv")
y_test_orig = pd.read_csv(f"{path}y_test.csv")

# Обработка данных

## Описательные статистики

In [None]:
# Summary table for the raw targets. With limit=0 a value listing is printed
# only for columns that have no distinct non-null values at all.
get_stats(y_train_orig, limit=0)

Название колонки         Пустых | Уникальных | Тип данных
---------------------------------------------------------
flight_datetime               0 |      35772 | object
flight_phase                  0 |          2 | object
engine_id                     0 |        116 | object
BRAT                         73 |       1294 | float64
DEGT                      19515 |      27663 | float64
DELFN                     37914 |       9194 | float64
DELN1                     37914 |       9153 | float64
DELVSV                    35910 |         93 | float64
DPOIL                     28544 |       2198 | float64
EGTC                      19515 |      24780 | float64
EGTHDM                    10833 |      36793 | float64
EGTHDM_D                  28801 |      18883 | float64
GEGTMC                    28591 |      18897 | float64
GN2MC                     28591 |      18376 | float64
GPCN25                    19515 |      27220 | float64
GWFM                      19515 |      28086 | float64
PCN12  

In [None]:
# Summary table for the raw features. With limit=0 a value listing is printed
# only for columns that have no distinct non-null values at all.
get_stats(X_train_orig, limit=0)

Название колонки         Пустых | Уникальных | Тип данных
---------------------------------------------------------
engine_id                     0 |        116 | object
aircraft_id                   0 |         58 | object
flight_datetime               0 |      35772 | object
flight_phase                  0 |          2 | object
engine_position               0 |          2 | int64
n1_modifier                   0 |          8 | float64
number_blades                 0 |          3 | float64
engine_family                 0 |          3 | object
engine_type                   0 |          5 | object
manufacturer                  0 |          2 | object
ZHPTAC                    35837 |         49 | float64
ZLPTAC                    35837 |         63 | float64
ZPCN12                        0 |       2263 | float64
ZPCN25                        0 |       1622 | float64
ZPHSF                     34458 |        361 | float64
ZPHSR                     34458 |        361 | float64
ZPN12R       

## Baseline - обработка

In [None]:
# Size of the raw training feature table as (rows, columns).
X_train_orig.shape

(47794, 53)

In [None]:
def prepare_X_data(X: pd.DataFrame) -> pd.DataFrame:
  """Build the baseline feature matrix from a raw feature frame.

  Drops every object-dtype column, one-hot encodes `n1_modifier` and
  replaces the remaining missing values with 0. The input frame is not modified.

  Args:
    X: Raw feature frame; must contain an `n1_modifier` column.

  Returns:
    A new DataFrame with the processed features.
  """
  # Object-dtype (non-numeric) columns are discarded for the baseline.
  object_columns = [name for name, dtype in X.dtypes.items() if dtype == object]
  numeric_part = X.drop(columns=object_columns)

  # `n1_modifier` is treated as categorical and expanded into dummy columns.
  encoded = pd.get_dummies(numeric_part, columns=['n1_modifier'])

  # Missing values become 0.
  return encoded.fillna(0)

def prepare_y_data(y: pd.DataFrame) -> pd.DataFrame:
  """Build the baseline target matrix from a raw target frame.

  Drops every object-dtype column and replaces missing values with 0.
  The input frame is not modified.

  NOTE(review): missing targets are turned into literal zeros, so models are
  trained and scored against 0 for those rows - confirm this is intended.

  Args:
    y: Raw target frame.

  Returns:
    A new DataFrame with the remaining target columns.
  """
  # Object-dtype (non-numeric) columns are discarded.
  object_columns = [name for name, dtype in y.dtypes.items() if dtype == object]

  # Missing values become 0.
  return y.drop(columns=object_columns).fillna(0)

In [None]:
X_train = prepare_X_data(X_train_orig)
y_train = prepare_y_data(y_train_orig)

# One-hot encoding runs separately on each split, so the test frame can end up
# with a different set (or order) of dummy columns than train. Align it to the
# training layout: columns missing in test are filled with 0, columns unseen
# in train are dropped.
X_test = prepare_X_data(X_test_orig).reindex(columns=X_train.columns, fill_value=0)
y_test = prepare_y_data(y_test_orig)

## Baseline-моделирование

In [None]:
!pip install catboost

In [None]:
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso, Ridge, HuberRegressor, ElasticNet, LinearRegression, ARDRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [None]:
# Targets to predict: the first two columns of the prepared target frame.
targets = y_train.columns[:2]

# Candidate regressors keyed by display label, all with default hyper-parameters.
# NOTE(review): the numeric prefixes repeat (7 and 8 appear more than once);
# the full labels are still unique, so no entry overwrites another.
models = dict([
    ('1. svr', SVR()),
    ('2. knn', KNeighborsRegressor()),
    ('3. lasso', Lasso()),
    ('4. ridge', Ridge()),
    ('5. huber', HuberRegressor()),
    ('6. elastic', ElasticNet()),
    ('7. linear', LinearRegression()),
    ('8. ARD', ARDRegression()),
    ('7. tree', DecisionTreeRegressor()),
    ('8. random_tree', RandomForestRegressor()),
    ('9. catboost', CatBoostRegressor()),
    ('8. XGB', XGBRegressor()),
])

In [None]:
class ModelsTraining:
  """Fit several regressors per target and select the best one for each target.

  Intended call order: fit() -> predict_models() -> get_error() -> predict().
  Features are standardized with a scaler fitted in fit() and reused afterwards.
  """

  def __init__(self, targets:list, models: dict):
    """Store the configuration.

    Args:
      targets: Names of the target columns to model.
      models: Mapping of display name -> unfitted estimator exposing
        fit(X, y) and predict(X).
    """
    # Scaler fitted on the training features and reused for every prediction.
    self.sc = StandardScaler()
    self.targets = targets
    # Estimator templates; fit() trains copies and leaves these untouched.
    self.models = models
    # {target: {model_name: fitted estimator}}, filled by fit().
    self.fit_data = {}
    # {target: {model_name: predictions}}, filled by predict_models().
    self.predict_data = {}
    # {target: {model_name: score}}, filled by get_error(); None until then.
    self.errors = None
    # Whether the metric used in the last get_error() call is "higher is better".
    self._higher_is_better = False

  def fit(self, X, y):
    """Fit the scaler on X and train a separate copy of every model per target.

    Args:
      X: Training features.
      y: Training targets, indexable by target name (y[target]).
    """
    from copy import deepcopy

    # Standardize the features.
    X_sc = self.sc.fit_transform(X)

    self.fit_data = {}
    for target in tqdm(self.targets):
      fitted = {}
      for name_model, template in self.models.items():
        # Train an independent copy: refitting one shared instance for each
        # target would leave every target with the model of the last target.
        estimator = deepcopy(template)
        estimator.fit(X_sc, y[target])
        fitted[name_model] = estimator
      self.fit_data[target] = fitted

  def predict_models(self, X_test):
    """Predict with every fitted model for every target.

    Args:
      X_test: Features to predict on (scaled with the fitted scaler).

    Returns:
      {target: {model_name: predictions}}; also cached in self.predict_data.
    """
    X_sc = self.sc.transform(X_test)
    self.predict_data = {
        target: {
            name_model: estimator.predict(X_sc)
            for name_model, estimator in self.fit_data[target].items()
        }
        for target in self.targets
    }
    return self.predict_data

  def predict(self, X_test) -> pd.DataFrame:
    """Predict each target on X_test with its best-scoring model.

    The best model per target is chosen from the scores of the last
    get_error() call: lowest for error metrics, highest for R^2.

    Args:
      X_test: Features to predict on.

    Returns:
      DataFrame with one column per target.

    Raises:
      RuntimeError: If get_error() has not been called yet.
    """
    if self.errors is None:
      raise RuntimeError('call get_error() before predict(): no model scores are available')

    X_sc = self.sc.transform(X_test)
    pick_best = max if self._higher_is_better else min
    best_predictions = {}
    for target in self.targets:
      scores = self.errors[target]
      best_model_name = pick_best(scores, key=scores.get)
      best_predictions[target] = self.fit_data[target][best_model_name].predict(X_sc)
    return pd.DataFrame(best_predictions)

  def get_error(self, y_test, error: str='mse') -> pd.DataFrame:
    """Score the predictions cached by predict_models() against y_test.

    Args:
      y_test: True targets, indexable by target name.
      error: 'mse' or 'mae'; any other value selects R^2 (kept as the
        fallback for backward compatibility).

    Returns:
      DataFrame of scores with one row per model and one column per target.
    """
    metrics = {'mse': mean_squared_error, 'mae': mean_absolute_error}
    err = metrics.get(error, r2_score)
    # R^2 is the only supported metric where a larger value is better.
    self._higher_is_better = err is r2_score

    self.errors = {}
    for target in self.targets:
      self.errors[target] = {
          name_model: err(y_test[target], value)
          for name_model, value in self.predict_data[target].items()
      }

    return pd.DataFrame(self.errors)


In [None]:
# Build the trainer and fit every candidate model for every target
# (the recorded progress bar shows about 5.5 minutes for the two targets).
mt = ModelsTraining(targets=targets, models=models)
mt.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Learning rate set to 0.075426
0:	learn: 0.1906094	total: 68.6ms	remaining: 1m 8s
1:	learn: 0.1770430	total: 85.1ms	remaining: 42.5s
2:	learn: 0.1644855	total: 101ms	remaining: 33.5s
3:	learn: 0.1529279	total: 118ms	remaining: 29.3s
4:	learn: 0.1421769	total: 134ms	remaining: 26.6s
5:	learn: 0.1323482	total: 150ms	remaining: 24.8s
6:	learn: 0.1233180	total: 167ms	remaining: 23.6s
7:	learn: 0.1149677	total: 198ms	remaining: 24.5s
8:	learn: 0.1073293	total: 215ms	remaining: 23.7s
9:	learn: 0.1002929	total: 231ms	remaining: 22.9s
10:	learn: 0.0936760	total: 247ms	remaining: 22.2s
11:	learn: 0.0877296	total: 263ms	remaining: 21.6s
12:	learn: 0.0820868	total: 279ms	remaining: 21.1s
13:	learn: 0.0769661	total: 295ms	remaining: 20.8s
14:	learn: 0.0722779	total: 311ms	remaining: 20.4s
15:	learn: 0.0679338	total: 326ms	remaining: 20.1s
16:	learn: 0.0639996	total: 342ms	remaining: 19.8s
17:	learn: 0.0604352	total: 359ms	remaining: 19.6s
18:	learn: 0.0573050	total: 375ms	remaining: 19.4s
19:	learn


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Learning rate set to 0.075426
0:	learn: 16.0595554	total: 26.8ms	remaining: 26.8s
1:	learn: 15.2349018	total: 48.3ms	remaining: 24.1s
2:	learn: 14.4742095	total: 68.1ms	remaining: 22.6s
3:	learn: 13.7659580	total: 86.9ms	remaining: 21.6s
4:	learn: 13.1532382	total: 105ms	remaining: 21s
5:	learn: 12.5970975	total: 125ms	remaining: 20.7s
6:	learn: 12.1101675	total: 144ms	remaining: 20.4s
7:	learn: 11.6620029	total: 163ms	remaining: 20.2s
8:	learn: 11.2615282	total: 186ms	remaining: 20.4s
9:	learn: 10.9067178	total: 208ms	remaining: 20.6s
10:	learn: 10.5693849	total: 228ms	remaining: 20.5s
11:	learn: 10.2764009	total: 248ms	remaining: 20.4s
12:	learn: 10.0166275	total: 268ms	remaining: 20.4s
13:	learn: 9.7564633	total: 288ms	remaining: 20.3s
14:	learn: 9.5418565	total: 307ms	remaining: 20.2s
15:	learn: 9.3398980	total: 329ms	remaining: 20.2s
16:	learn: 9.1743456	total: 347ms	remaining: 20.1s
17:	learn: 9.0221195	total: 366ms	remaining: 20s
18:	learn: 8.8836992	total: 386ms	remaining: 19.9


100%|██████████| 2/2 [05:37<00:00, 168.93s/it]


In [None]:
# Order matters: get_error() scores the predictions cached by predict_models(),
# and predict() relies on the scores stored by get_error() to choose a model
# per target.
predicts_models = mt.predict_models(X_test)
er = mt.get_error(y_test, error='mse')
predict = mt.predict(X_test)

In [None]:
# MSE table: one row per model, one column per target.
er

Unnamed: 0,BRAT,DEGT
3. lasso,155.344055,88.070507
7. tree,286.20582,26.868806


In [None]:
# Predictions of the selected model for each target, one column per target.
predict

Unnamed: 0,BRAT,DEGT
0,15.106370,15.264038
1,-14.583950,-0.127075
2,-18.583692,-34.849426
3,0.530797,0.000000
4,-19.387267,-32.834595
...,...,...
19113,-18.647628,-16.142456
19114,2.721510,0.000000
19115,14.460366,-20.236084
19116,1.551323,0.000000


### Нейросети

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import Adam

In [None]:
def _build_mlp(first_units, second_units, n_features):
  """Create and compile a small fully connected regressor.

  Two ReLU hidden layers followed by a single-unit output layer, all with the
  'normal' kernel initializer; compiled with MSE loss and Adam(0.001).

  Args:
    first_units: Width of the first hidden layer.
    second_units: Width of the second hidden layer.
    n_features: Number of input features.

  Returns:
    The compiled Sequential model.
  """
  net = Sequential()
  net.add(Dense(first_units, input_dim=n_features, kernel_initializer='normal', activation='relu'))
  net.add(Dense(second_units, kernel_initializer='normal', activation='relu'))
  net.add(Dense(1, kernel_initializer='normal'))
  # Compile model
  net.compile(loss='mean_squared_error', optimizer=Adam(0.001))
  return net

# Larger network: 50 -> 20 -> 1.
model = _build_mlp(50, 20, X_train.shape[1])

# Smaller network: 20 -> 10 -> 1.
model2 = _build_mlp(20, 10, X_train.shape[1])

In [None]:
# Train the first network on the BRAT target for 10 epochs with
# validation_split=0.1 (10% of the data is used for validation).
# NOTE(review): X_train is passed unscaled here, whereas ModelsTraining
# standardizes the features first - confirm this is intended.
model.fit(X_train, 
                    y_train['BRAT'], 
                    epochs=10, 
                    validation_split=0.1, 
                    verbose=2)

Epoch 1/10
1345/1345 - 3s - loss: 13.6510 - val_loss: 0.0431 - 3s/epoch - 2ms/step
Epoch 2/10
1345/1345 - 3s - loss: 1.5625 - val_loss: 0.0403 - 3s/epoch - 2ms/step
Epoch 3/10
1345/1345 - 2s - loss: 0.0438 - val_loss: 0.0330 - 2s/epoch - 2ms/step
Epoch 4/10
1345/1345 - 4s - loss: 0.2836 - val_loss: 0.0704 - 4s/epoch - 3ms/step
Epoch 5/10
1345/1345 - 2s - loss: 0.2627 - val_loss: 0.0338 - 2s/epoch - 2ms/step
Epoch 6/10
1345/1345 - 2s - loss: 0.0281 - val_loss: 0.0298 - 2s/epoch - 2ms/step
Epoch 7/10
1345/1345 - 3s - loss: 0.0265 - val_loss: 0.0254 - 3s/epoch - 2ms/step
Epoch 8/10
1345/1345 - 2s - loss: 0.0267 - val_loss: 0.0253 - 2s/epoch - 2ms/step
Epoch 9/10
1345/1345 - 2s - loss: 0.0259 - val_loss: 0.0403 - 2s/epoch - 2ms/step
Epoch 10/10
1345/1345 - 2s - loss: 0.0267 - val_loss: 0.0260 - 2s/epoch - 2ms/step


<keras.callbacks.History at 0x7f4e5aca7a50>