In [2]:
import pandas as pd                  
import torch
import numpy as np
import random

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error as MAPE

from torch import nn
from torch import optim

In [3]:
train_df = pd.read_csv('contest_train_df.csv')
test_df = pd.read_csv('contest_test_df.csv')

In [4]:
def preprocessing(df):
    df = df.sort_values(by=['id', 'Date'])     # сортировка данных  

    df['Date'] = pd.to_datetime(df['Date']).apply(lambda x: x.value / 10**9)              # преобразование даты        
    df['Start_date'] = pd.to_datetime(df['Start_date']).apply(lambda x: x.value / 10**9)

    index = pd.MultiIndex.from_frame(df[['id', 'Well_ID']])    # создание и  применение мультиндекса
    df = df.set_index(index) 

    ids = df['id'].unique()          # массив из уникальных id

    df = df.drop(columns=['Formation_lbl', 'Well_ID'])          # удаление ненужных столбцов

    main_df = pd.DataFrame()                    # заполнение неизвестных значений (NaN)
    for id in ids:
        iter_df = df.loc[id]
        iter_df = iter_df.fillna(iter_df.median())
        iter_df = iter_df.fillna(0)

        new_df = pd.DataFrame()
        for column in iter_df.columns:
            if column == 'Q_OIS':
                median = iter_df[:-1][column].median()
                new_df['Q_OIS_pred'] = iter_df['Q_OIS'].tail(1).values[0]
            else:
                median = iter_df[column].median()
                if column == 'id':
                    median = int(median)

            new_df[column] = pd.Series(median)

        main_df = main_df.append(new_df)

    main_df = main_df.set_index('id')

    min_max_scaler = MinMaxScaler()                  # нормализация датафрейма
    column_names_to_normalize = ['Dnytr', 'H_din', 'NomMoschn', 'Hperf',
        'NomNapor', 'Water_cut', 'NomPodacha', 'Dnkt', 'H_sp', 'P_plst', 'Hvd',
        'Extend_Hvd', 'PlNeft', 'VyazkNeft', 'SumTolshin', 'P_zatr', 'PlVody',
        'Pnas', 'Inject_0', 'Inject_1', 'Inject_2', 'Inject_3', 'BHP', 'state_age', 'Date', 'Start_date']
    x = main_df[column_names_to_normalize].values
    x_scaled = min_max_scaler.fit_transform(x)
    df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = main_df.index)
    main_df_normalize = main_df
    main_df_normalize[column_names_to_normalize] = df_temp

    # main_df_normalize.to_csv('main_df_normalize.csv')   # сохранение датафрейма если необходимо

    return main_df_normalize

In [5]:
train_df = preprocessing(train_df)
test_df = preprocessing(test_df)

In [2]:
# train_df = pd.read_csv('main_df_normalize.csv', index_col='id')       
# test_df = pd.read_csv('main_df_normalize_test.csv', index_col='id')

In [6]:
X = train_df.drop(columns=['Q_OIS_pred'])      # выделение обучающих и предсказываемых данных
y = train_df['Q_OIS_pred']

# разделение данных
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42, shuffle=True)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42, shuffle=True)

In [7]:
# класс для нейросети
class Regression(nn.Module):         
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(31, 15)
        self.fc2 = nn.Linear(15, 7)
        self.fc3 = nn.Linear(7, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        
        return x
 

In [8]:
# основные параметры нейросети
main_mape = 20
current_epochs = 0
model = Regression().double()
criterion = nn.HuberLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

In [9]:
# функция проверки модели
def check(X_1, y_1, main_mape, test_flag=False):
    model.eval()
    lst = []
    with torch.no_grad():
        for i in range(len(X_1)):
            j = torch.tensor(X_1.iloc[i])
                    
            Q = model.forward(j)
            lst.append(pd.Series(Q)[0])

    y_pred_valid = pd.Series(lst)

    predict_frame = pd.DataFrame(y_1)
    predict_frame['Q_OIS_predict'] = round(y_pred_valid).values
    predict_frame = predict_frame.rename(columns={'Q_OIS_pred' : 'Q_OIS_true'})

    y_valid_series = predict_frame['Q_OIS_true']
    y_pred_valid = predict_frame['Q_OIS_predict']
    
    mape = MAPE(y_valid_series, y_pred_valid) * 100
    print(f'MAPE = {mape}, Epoch: {current_epochs}')

    if test_flag:
        if mape < main_mape:
            main_mape = mape
            print('Saving model')
            torch.save(model.state_dict(), 'checkpoint_date.pth')
            final_frame = predict_frame
            return final_frame, main_mape
    model.train()

In [10]:
# обучение модели
epochs = 15
for epoch in range(epochs):
    for i in range(len(X_train)):
        j = torch.tensor(X_train.iloc[i])
        
        optimizer.zero_grad()
                
        Q = model.forward(j)
        loss = criterion(Q,  torch.tensor(y_train.iloc[i]).view(1))
        loss.backward()
        optimizer.step()
    
    current_epochs += 1
    check(X_valid, y_valid, main_mape)

final_frame, main_mape = check(X_test, y_test, main_mape, True)
print(f'\nFinal MAPE = {main_mape}')
print('Final Frame')
print(final_frame) 

MAPE = 28.511889820778908, Epoch: 1
MAPE = 26.213318138870566, Epoch: 2
MAPE = 23.52020961112681, Epoch: 3
MAPE = 22.329663151251943, Epoch: 4
MAPE = 22.09109093528813, Epoch: 5
MAPE = 21.35484215982992, Epoch: 6
MAPE = 21.346517549868537, Epoch: 7
MAPE = 21.204866097444057, Epoch: 8
MAPE = 21.00508078004866, Epoch: 9
MAPE = 21.048153700028237, Epoch: 10
MAPE = 20.817697017124438, Epoch: 11
MAPE = 21.117350488006846, Epoch: 12
MAPE = 21.089242766915493, Epoch: 13
MAPE = 20.864051308535192, Epoch: 14
MAPE = 20.539290611899695, Epoch: 15
MAPE = 19.37055386141652, Epoch: 15
Saving model

Final MAPE = 19.37055386141652
Final Frame
       Q_OIS_true  Q_OIS_predict
id                              
46473        36.0           30.0
39186       641.0          523.0
38311       162.0          174.0
49577        23.0           39.0
349         154.0          152.0
...           ...            ...
32426       100.0           83.0
16108       209.0          210.0
13508       275.0          226.0
15

In [21]:
model.eval()
final_test_df = test_df.drop(columns=['Q_OIS_pred'])  
lst = []
with torch.no_grad():
    for i in range(len(final_test_df)):
        j = torch.tensor(final_test_df.iloc[i])
                
        Q = model.forward(j)
        lst.append(pd.Series(Q)[0])

y_pred_valid = pd.Series(lst)
predict_frame = pd.DataFrame(final_test_df.index)
predict_frame['Q_OIS'] = round(y_pred_valid).values

print(predict_frame)
model.train()

         id  Q_OIS
0         6  276.0
1         8  644.0
2        22   71.0
3        23   64.0
4        24   94.0
...     ...    ...
3890  59941   25.0
3891  59947   37.0
3892  59952   17.0
3893  60052   50.0
3894  60111   15.0

[3895 rows x 2 columns]


Regression(
  (fc1): Linear(in_features=31, out_features=15, bias=True)
  (fc2): Linear(in_features=15, out_features=7, bias=True)
  (fc3): Linear(in_features=7, out_features=1, bias=True)
)

In [22]:
predict_frame.to_csv('final_submission.csv', index=False)