In [1]:
%%capture
import pathlib
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import mean_absolute_percentage_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.decomposition import PCA
import joblib
!pip install fastparquet
import fastparquet

In [2]:
def _add_pca_components(group_df, feature_names, n_components=3):
    """Append the leading principal components of ``feature_names`` to ``group_df``.

    A fresh ``PCA`` is fitted on ``group_df[feature_names]`` and the first
    ``n_components`` projected columns are appended as ``PCA_VIF_1`` ..
    ``PCA_VIF_<n_components>``. An empty frame is returned unchanged.

    Parameters:
        group_df (pd.DataFrame): rows of a single gas group.
        feature_names (list[str]): columns fed into the PCA.
        n_components (int): number of leading components to keep.

    Returns:
        pd.DataFrame: ``group_df`` with the PCA columns appended.
    """
    if group_df.empty:
        return group_df

    projected = PCA().fit_transform(group_df[feature_names])
    leading = projected[:, :n_components]

    # PCA cannot return more columns than min(n_rows, n_features), so a very
    # small group may yield fewer than requested. The data has no variance
    # along the missing directions, so they are filled with zeros instead of
    # failing on a column-count mismatch.
    missing = n_components - leading.shape[1]
    if missing > 0:
        leading = np.hstack([leading, np.zeros((leading.shape[0], missing))])

    column_names = [f'PCA_VIF_{i}' for i in range(1, n_components + 1)]
    pca_df = pd.DataFrame(leading, columns=column_names, index=group_df.index)
    return pd.concat([group_df, pca_df], axis=1)


def apply_pca(df):
    """Add per-gas PCA columns (``PCA_VIF_1`` .. ``PCA_VIF_3``) to ``df``.

    Rows are split by ``feature4`` into 'gas1' and 'gas2'; each group gets its
    own PCA over its own feature list. The PCA is fitted on the rows passed in,
    so the resulting components depend on the content of ``df``.

    Parameters:
        df (pd.DataFrame): input frame with a ``feature4`` column and the
            feature columns listed below.

    Returns:
        pd.DataFrame: the 'gas1' and 'gas2' rows of ``df`` with the PCA columns
        appended, sorted by index. Rows with any other ``feature4`` value are
        not included. The input frame is not modified.
    """
    gas1_features = ['feature0', 'feature2', 'feature12', 'feature15', 'feature17', 'feature18', 'feature22', 'feature23']
    gas2_features = ['feature0', 'feature8', 'feature10', 'feature12', 'feature19', 'feature23', 'feature24']

    gas1_df = _add_pca_components(df[df['feature4'] == 'gas1'].copy(), gas1_features)
    gas2_df = _add_pca_components(df[df['feature4'] == 'gas2'].copy(), gas2_features)

    return pd.concat([gas1_df, gas2_df]).sort_index()


def load_and_rename_models(file_path, model_names):
    """Load the saved ensemble models and key them by name.

    Parameters:
        file_path (str or path-like): directory containing
            ``ensemble_models.joblib`` (a trailing separator is optional).
        model_names (iterable of str): names to assign to the loaded models,
            in the same order in which the models were saved.

    Returns:
        dict: mapping ``name -> model``.

    Raises:
        ValueError: if the number of loaded models differs from the number of
            names, which would otherwise silently mislabel or drop models.
    """
    model_file = pathlib.Path(file_path) / 'ensemble_models.joblib'
    # NOTE(review): joblib.load unpickles the file and can execute arbitrary
    # code; only load model files from a trusted source.
    loaded_models = list(joblib.load(model_file))
    model_names = list(model_names)

    if len(loaded_models) != len(model_names):
        raise ValueError(
            f"Expected {len(model_names)} models in {model_file}, found {len(loaded_models)}"
        )

    return dict(zip(model_names, loaded_models))

def get_model(feature4, target):
    """
    Look up the fitted model for a (feature4, target) pair.

    Parameters:
        feature4 (str): gas label, 'gas1' or 'gas2'.
        target (str): target column name, 'target0' or 'target1'.

    Returns:
        The entry of the module-level ``named_models`` mapping registered for
        that pair.

    Raises:
        ValueError: if the pair is not one of the four supported combinations.
    """
    # (gas label, target name, key in named_models), checked in this order.
    supported = (
        ('gas1', 'target0', 'target0_gas1'),
        ('gas1', 'target1', 'target1_gas1'),
        ('gas2', 'target0', 'target0_gas2'),
        ('gas2', 'target1', 'target1_gas2'),
    )

    for gas_label, target_name, model_key in supported:
        if feature4 == gas_label and target == target_name:
            return named_models[model_key]

    raise ValueError(f"Некорректная комбинация feature4 и target: {feature4}, {target}")
    
    
def predict(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute predictions for both targets.

    The frame is first extended with the per-gas PCA columns (``apply_pca``).
    For every gas group the gas-specific feature columns are dropped and the
    model returned by ``get_model`` for each target is applied to what remains.

    Parameters:
        df: frame with rows to score. The ``target0`` / ``target1`` columns are
            optional: they are excluded from the model input when present and
            are not required (unlabelled test data is accepted).

    Returns:
        The PCA-extended input frame with two extra float columns,
        ``feature_t0`` and ``feature_t1``, holding the predictions for
        ``target0`` and ``target1``. Rows that no model scored hold NaN.
    """
    features_to_remove = {
        'gas1': ['feature1', 'feature4', 'feature5', 'feature7', 'feature8', 'feature10', 'feature19', 'feature24', 'feature0', 'feature2', 'feature12', 'feature15', 'feature17', 'feature18', 'feature22', 'feature23'],
        'gas2': ['feature2', 'feature3', 'feature4', 'feature15', 'feature17', 'feature21', 'feature22', 'feature0', 'feature8', 'feature10', 'feature12', 'feature19', 'feature23', 'feature24'],
    }
    targets = ['target0', 'target1']

    df_with_pca = apply_pca(df)
    # Targets exist only in labelled data; ignore them when they are absent.
    features = df_with_pca.drop(columns=targets, errors='ignore')

    predictions = pd.DataFrame(index=features.index, columns=targets)

    for feature4, to_remove in features_to_remove.items():
        subset = features[features['feature4'] == feature4]
        if subset.empty:
            # Nothing to score for this gas; estimators typically reject empty input.
            continue
        # The model input is the same for both targets, so build it once.
        subset = subset.drop(columns=to_remove)

        for target in targets:
            model = get_model(feature4, target)
            if model is not None:
                predictions.loc[subset.index, target] = model.predict(subset)

    # The frame was created without a dtype (object columns); make it numeric.
    predictions = predictions.astype(float)

    # Rename so the predictions do not collide with the real target columns.
    predictions = predictions.rename(columns={'target0': 'feature_t0', 'target1': 'feature_t1'})

    # Attach the predictions to the PCA-extended input frame.
    return pd.concat([df_with_pca, predictions], axis=1)


In [3]:
# Directory from which load_and_rename_models() reads 'ensemble_models.joblib'.
file_path = '/kaggle/input/sibur23/'
# Names are paired with the loaded models positionally, so this order must
# match the order in which the models were saved.
model_names = ['target0_gas1', 'target1_gas1', 'target0_gas2', 'target1_gas2']
# Module-level registry; get_model() reads it as a global.
named_models = load_and_rename_models(file_path, model_names)

In [4]:
import warnings
# Suppress UserWarning messages from here on.
warnings.filterwarnings("ignore", category=UserWarning)

# Score the training file. predict() returns the input columns plus the PCA
# components and the feature_t0 / feature_t1 prediction columns.
df = pd.read_parquet("/kaggle/input/sibur23/train.parquet")
predictions = predict(df)

In [5]:
# Display the frame returned by predict(): input columns, PCA components and
# the feature_t0 / feature_t1 predictions.
predictions

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature22,feature23,feature24,target0,target1,PCA_VIF_1,PCA_VIF_2,PCA_VIF_3,feature_t0,feature_t1
0,32.910908,17.376350,77.557840,2.929855,gas1,20.487150,57.633085,49.245392,-44.124381,23.400064,...,110.529868,56.817260,12.887802,27.050891,6.502743,19.674097,-2.299538,1.296653,27.019813,6.431210
1,41.263782,22.419445,47.945514,-25.847472,gas2,21.461239,3.474080,49.659980,-13.553188,-13.047593,...,37.286110,61.224272,39.822424,84.127890,76.578716,77.269214,-20.846219,27.285959,83.999288,76.222840
2,25.580283,17.376350,77.654180,2.799411,gas1,20.487150,28.938295,49.245392,-44.124381,20.610679,...,111.471534,52.664304,12.887802,22.080133,3.036043,11.337947,-1.110233,0.895696,22.086950,3.034993
3,33.756900,17.376350,73.049625,2.953982,gas1,20.487150,28.932311,49.245392,-44.124381,18.107963,...,115.589451,56.840719,12.887802,30.234082,8.910795,18.982585,-9.075466,1.054279,30.334209,8.915871
4,4.223732,38.772534,48.015553,-25.843943,gas2,24.635721,12.011581,51.030938,84.244199,-17.735680,...,37.398779,40.488468,128.295838,71.128092,50.475082,-65.481950,3.101652,-15.470756,71.134920,50.776548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153412,25.851721,17.376350,69.777513,3.086227,gas1,20.487150,9.187947,49.245392,-44.124381,8.395057,...,119.189777,52.544704,12.887802,26.319008,4.318349,9.183471,-11.913267,0.564432,26.598321,4.353239
153413,33.211644,17.376350,84.113483,2.963000,gas1,20.487150,37.677592,49.245392,-44.124381,6.252886,...,103.624989,56.818116,12.887802,19.951185,7.128586,21.938253,6.940890,1.769714,19.953670,7.124447
153414,0.339018,17.376350,77.627839,2.889655,gas1,20.487150,49.094915,49.245392,-44.124381,23.596678,...,110.992570,39.673505,12.887802,28.492906,5.589999,-16.269987,5.245449,-0.069477,28.546020,5.591571
153415,21.018093,38.902218,48.015239,-25.843954,gas2,24.642608,0.626815,51.022655,84.812493,-30.045713,...,37.399312,50.917348,128.621094,72.065991,51.306816,-66.257941,3.634379,4.267385,72.052726,51.094964


In [6]:
# Target column names; the training cell drops these from the feature matrix.
TARGETS = ["target0", "target1"]

In [None]:
# Features selected for each (target, gas) model:
# target0 - gas1: ['feature11', 'feature13', 'feature20', 'PCA_VIF_2']
# target1 - gas1: ['feature11', 'feature13', 'feature16', 'PCA_VIF_1']
# target0 - gas2: ['feature11', 'feature13', 'feature20', 'PCA_VIF_1', 'PCA_VIF_2', 'PCA_VIF_3']
# target1 - gas2: ['feature13', 'feature16', 'feature20', 'PCA_VIF_2']


In [None]:
# Train one voting ensemble per (gas group, target) pair and collect them in
# ensemble_models, in iteration order (groups outer, targets inner).
#
# NOTE(review): gas_dfs, importance_features, reg1, reg7, reg10, labels,
# StandardScaler, KFold, GridSearchCV and VotingRegressor are not defined or
# imported in the visible part of this notebook - presumably they come from a
# separate training notebook; this cell cannot run on a fresh kernel as is.
ensemble_models = []

# Every tuned estimator from every fold, tagged with its group and target.
best_models_all = []

for group_name, gas_df in gas_dfs.items():
    # All columns whose name contains 'target'.
    target_vars = [col for col in gas_df.filter(like='target').columns]
    
    for var in target_vars:
        y_preds = []
        best_models = []
        y = gas_df[var].values
        X = gas_df.drop(columns=TARGETS)
        # Keep only the features selected for this group/target.
        X = X[importance_features[group_name][var]]
        
        # NOTE(review): the scaler is fitted on all rows before the K-fold
        # split, so held-out rows influence the scaling. It is also not stored
        # with the ensemble in this cell - confirm how inputs are scaled at
        # prediction time.
        scaler = StandardScaler()
        X_normalized = scaler.fit_transform(X)
        
        kf = KFold(n_splits=5, random_state=42, shuffle=True)
        
        for train_index, test_index in kf.split(X_normalized):
            X_train, X_test = X_normalized[train_index], X_normalized[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            models = [(reg1, labels[0]), (reg7, labels[1]), (reg10, labels[2])]
            
            for model, label in models:
                # Pick the hyper-parameter grid by the estimator's type.
                if isinstance(model, type(reg1)):
                    param_grid = {
                        'fit_intercept': [True, False]
                    }
                elif isinstance(model, type(reg7)):
                    param_grid = {
                        'n_estimators': [100, 200, 300],
                        'learning_rate': [0.01, 0.1, 0.5],
                        'max_depth': [5, 10, 15]
                    }
                elif isinstance(model, type(reg10)):
                    param_grid = {
                        'n_estimators': [100, 200, 300],
                        'max_depth': [5, 10, 15],
                        'min_samples_split': [2, 5, 10]
                    }
                else:
                    param_grid = {}
                
                # Inner 5-fold grid search on this outer fold's training part.
                grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_absolute_percentage_error', cv=5)
                grid_search.fit(X_train, y_train)
                best_model = grid_search.best_estimator_
                # best_models grows by one entry per model per outer fold, so
                # the same label is appended once per fold.
                best_models.append((best_model, label))
                best_models_all.append((best_model, label, group_name, var))  # Also record in the list of all best models
        
        # From here on X_train / X_test / y_train / y_test are the split of the
        # LAST outer fold only (the fold loop has finished).
        for model, label in best_models:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # NOTE(review): mape_score and y_preds are computed but not used
            # anywhere in this cell.
            mape_score = mean_absolute_percentage_error(y_test, y_pred)
            y_preds.append(y_pred)
        
        # NOTE(review): labels repeat across folds in best_models - verify that
        # VotingRegressor accepts duplicate estimator names.
        ensemble_model = VotingRegressor(estimators=[(label, model) for model, label in best_models])
        ensemble_model.fit(X_train, y_train)
        ensemble_preds = ensemble_model.predict(X_test)
        # NOTE(review): ensemble_mape_score is computed but never reported or stored.
        ensemble_mape_score = mean_absolute_percentage_error(y_test, ensemble_preds)
        
        ensemble_models.append(ensemble_model)


In [None]:
    # NOTE(review): this cell is an indented fragment (it looks like a draft of
    # the loop inside predict()); it has no enclosing definition here, and
    # `model` is never assigned in it. Confirm whether it is still needed.
    gas1_features_to_remove = ['feature1', 'feature4', 'feature5', 'feature7', 'feature8', 'feature10', 'feature19', 'feature24', 'feature0', 'feature2', 'feature12', 'feature15', 'feature17', 'feature18', 'feature22', 'feature23']
    gas2_features_to_remove = ['feature2', 'feature3', 'feature4', 'feature15', 'feature17', 'feature21', 'feature22', 'feature0', 'feature8', 'feature10', 'feature12', 'feature19', 'feature23', 'feature24']
    # Feature lists per (target, gas) pair.
    target0_gas1_to_use = ['feature11', 'feature13', 'feature20', 'PCA_VIF_2']
    target1_gas1_to_use = ['feature11', 'feature13', 'feature16', 'PCA_VIF_1']
    target0_gas2_to_use = ['feature11', 'feature13', 'feature20', 'PCA_VIF_1', 'PCA_VIF_2', 'PCA_VIF_3']
    target1_gas2_to_use = ['feature13', 'feature16', 'feature20', 'PCA_VIF_2']

    for feature4 in ['gas1', 'gas2']:
        for target in ['target0', 'target1']:
            subset = df[df['feature4'] == feature4]
            if model is not None:
                if feature4 == "gas1":
                    subset = subset.drop(gas1_features_to_remove, axis=1)
                    # NOTE(review): the f-string evaluates to a label such as
                    # 'target0_gas1_to_use', i.e. this selects a column with
                    # that name rather than the list bound to that variable.
                    subset = subset[f'{target}_{feature4}_to_use']
                elif feature4 == "gas2":
                    subset = subset.drop(gas2_features_to_remove, axis=1)
                    # NOTE(review): same label-vs-variable issue as for gas1.
                    subset = subset[f'{target}_{feature4}_to_use']
                predictions.loc[subset.index, target] = model.predict(subset)

In [12]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

def build_lstm_model(input_shape):
    """Build (without compiling) a stacked-LSTM regressor with a scalar output.

    Parameters:
        input_shape (tuple): ``(timesteps, features_per_step)`` of one sample,
            passed to the first LSTM layer.

    Returns:
        An uncompiled ``Sequential`` model: two LSTM layers, two regularised
        dense layers with dropout, batch normalisation and a single linear
        output unit. The model emits one value per input sample.
    """
    model = Sequential()

    # Recurrent layers. The first passes the whole sequence on to the next
    # LSTM; the second returns only its final output, so everything after it
    # works on one vector per sample. With return_sequences=True here the
    # network would emit one value per time step, which does not match a
    # one-value-per-sample regression target.
    model.add(LSTM(64, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(64, kernel_regularizer=l2(0.001), return_sequences=False))

    # Dense layers with L2 regularisation and dropout
    model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
    model.add(Dropout(0.2))

    # Batch normalisation of the activations fed to the output layer
    model.add(BatchNormalization())

    # Output layer: a single linear unit
    model.add(Dense(1, activation='linear'))

    return model

def train_lstm_model(df, gas1_features_to_remove, gas2_features_to_remove, target0_gas1_to_use,
                     target1_gas1_to_use, target0_gas2_to_use, target1_gas2_to_use,
                     epochs=100, batch_size=32):
    """Train one LSTM per (gas, target) pair and return hold-out predictions.

    For each gas group the rows are split in their existing order: the first
    80% are used for training and the remaining 20% for prediction. Each
    trained model is also dumped to ``<gas>_<target>_model.pkl`` in the
    working directory.

    Parameters:
        df (pd.DataFrame): frame with a ``feature4`` column, the ``target0`` /
            ``target1`` columns and all feature columns named in the lists.
        gas1_features_to_remove, gas2_features_to_remove (list[str]): columns
            dropped from the respective gas group before feature selection.
        target0_gas1_to_use, target1_gas1_to_use, target0_gas2_to_use,
        target1_gas2_to_use (list[str]): model input columns per pair.
        epochs (int): training epochs per model.
        batch_size (int): training batch size.

    Returns:
        pd.DataFrame: float frame indexed like ``df`` with columns ``target0``
        and ``target1``. Only the hold-out rows of each gas group carry
        predictions; every other row is left at 0.
    """
    targets = ['target0', 'target1']
    per_gas_config = {
        'gas1': (gas1_features_to_remove,
                 {'target0': target0_gas1_to_use, 'target1': target1_gas1_to_use}),
        'gas2': (gas2_features_to_remove,
                 {'target0': target0_gas2_to_use, 'target1': target1_gas2_to_use}),
    }

    # Result buffer, addressed by row POSITION in df (not by index label).
    predictions_array = np.zeros((len(df), len(targets)), dtype=float)

    for feature4, (features_to_remove, targets_to_use) in per_gas_config.items():
        gas_mask = (df['feature4'] == feature4).values
        gas_rows = df[gas_mask]
        train_size = int(len(gas_rows) * 0.8)
        if train_size == 0:
            # Empty group, or too few rows to leave anything to train on.
            continue

        gas_features = gas_rows.drop(features_to_remove, axis=1)
        # Positions of this group's hold-out rows within df; this works for any
        # index (non-integer, unsorted, duplicated).
        test_positions = np.flatnonzero(gas_mask)[train_size:]

        for target, feature_names in targets_to_use.items():
            X = gas_features[feature_names].values
            # Take the target from the same rows as X so X and y stay aligned.
            y = gas_rows[target].values

            X_train, X_test = X[:train_size], X[train_size:]
            y_train = y[:train_size]

            # Fit the scaler on the training part only so that hold-out rows do
            # not leak into the scaling.
            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # Each feature becomes one time step with a single value.
            X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
            X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

            model = build_lstm_model((X_train.shape[1], X_train.shape[2]))
            model.compile(loss='mean_absolute_percentage_error', optimizer=Adam())
            model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

            # NOTE(review): the model is pickled via joblib as before; confirm
            # this round-trips for Keras models or switch to the framework's
            # own save format.
            model_name = f"{feature4}_{target}_model.pkl"
            joblib.dump(model, model_name)

            y_pred = model.predict(X_test)
            predictions_array[test_positions, targets.index(target)] = y_pred.flatten()

    return pd.DataFrame(predictions_array, index=df.index, columns=targets)



In [13]:
# Columns dropped per gas group before the model inputs are selected.
gas1_features_to_remove = ['feature1', 'feature4', 'feature5', 'feature7', 'feature8', 'feature10', 'feature19', 'feature24', 'feature0', 'feature2', 'feature12', 'feature15', 'feature17', 'feature18', 'feature22', 'feature23']
gas2_features_to_remove = ['feature2', 'feature3', 'feature4', 'feature15', 'feature17', 'feature21', 'feature22', 'feature0', 'feature8', 'feature10', 'feature12', 'feature19', 'feature23', 'feature24']
# LSTM inputs per (target, gas) pair; each list ends with the matching
# prediction column (feature_t0 / feature_t1) produced by predict().
target0_gas1_to_use = ['feature11', 'feature13', 'feature20', 'PCA_VIF_2', 'feature_t0']
target1_gas1_to_use = ['feature11', 'feature13', 'feature16', 'PCA_VIF_1', 'feature_t1']
target0_gas2_to_use = ['feature11', 'feature13', 'feature20', 'PCA_VIF_1', 'PCA_VIF_2', 'PCA_VIF_3', 'feature_t0']
target1_gas2_to_use = ['feature13', 'feature16', 'feature20', 'PCA_VIF_2', 'feature_t1']

# NOTE(review): `predictions` is both the input and the result here, so the
# frame returned by predict() is overwritten and re-running this cell would
# pass the LSTM output back in as input.
predictions = train_lstm_model(predictions, gas1_features_to_remove, gas2_features_to_remove, target0_gas1_to_use,
                               target1_gas1_to_use, target0_gas2_to_use, target1_gas2_to_use)



KeyboardInterrupt

