In [1]:
%%capture
import pathlib
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import mean_absolute_percentage_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.decomposition import PCA
import joblib
!pip install fastparquet
import fastparquet

ModuleNotFoundError: No module named 'fastparquet'

# Создаём сабмит

In [2]:
# %%writefile /kaggle/input/sibur23/predict.py

def apply_pca(df):
    gas1_df = df[df['feature4'] == 'gas1'].copy()
    gas2_df = df[df['feature4'] == 'gas2'].copy()

    # PCA for gas1
    if not gas1_df.empty:
        gas1_features = ['feature0', 'feature2', 'feature12', 'feature15', 'feature17', 'feature18', 'feature22', 'feature23']
        pca_gas1 = PCA()
        gas1_pca = pca_gas1.fit_transform(gas1_df[gas1_features])
        gas1_pca_components = gas1_pca[:, :3]
        gas1_pca_df = pd.DataFrame(gas1_pca_components, columns=['PCA_VIF_1', 'PCA_VIF_2', 'PCA_VIF_3'], index=gas1_df.index)
        gas1_df = pd.concat([gas1_df, gas1_pca_df], axis=1)

    # PCA for gas2
    if not gas2_df.empty:
        gas2_features = ['feature0', 'feature8', 'feature10', 'feature12', 'feature19', 'feature23', 'feature24']
        pca_gas2 = PCA()
        gas2_pca = pca_gas2.fit_transform(gas2_df[gas2_features])
        gas2_pca_components = gas2_pca[:, :3]
        gas2_pca_df = pd.DataFrame(gas2_pca_components, columns=['PCA_VIF_1', 'PCA_VIF_2', 'PCA_VIF_3'], index=gas2_df.index)
        gas2_df = pd.concat([gas2_df, gas2_pca_df], axis=1)

    combined_df = pd.concat([gas1_df, gas2_df])
    combined_df.sort_index(inplace=True)

    return combined_df


def load_and_rename_models(file_path, model_names):
    loaded_models = joblib.load(file_path + 'ensemble_models.joblib')
    named_models = dict(zip(model_names, loaded_models))
    return named_models

file_path = '/kaggle/input/sibur23/'
model_names = ['target0_gas1', 'target1_gas1', 'target0_gas2', 'target1_gas2']
named_models = load_and_rename_models(file_path, model_names)

def get_model(feature4, target):
    """
    Выбор модели на основе комбинации feature4 и target.

    Параметры:
        feature4 (str): Значение feature4 ('gas1' или 'gas2').
        target (str): Имя целевого столбца ('target0' или 'target1').

    Возвращает:
        Модель, соответствующую комбинации feature4 и target.
    """

    if feature4 == 'gas1' and target == 'target0':
        return named_models['target0_gas1']
    elif feature4 == 'gas1' and target == 'target1':
        return named_models['target1_gas1']
    elif feature4 == 'gas2' and target == 'target0':
        return named_models['target0_gas2']
    elif feature4 == 'gas2' and target == 'target1':
        return named_models['target1_gas2']
    else:
        raise ValueError(f"Некорректная комбинация feature4 и target: {feature4}, {target}")


def predict(df: pd.DataFrame) -> pd.DataFrame:
    """
    Вычисление предсказаний.
    Параметры:
        df: датафрейм, содержащий строки из тестового множества.
    Результат:
        Датафрейм предсказаний.
    """
    df = apply_pca(df)
    gas1_features_to_remove = ['feature1', 'feature4', 'feature5', 'feature7', 'feature8', 'feature10', 'feature19', 'feature24', 'feature0', 'feature2', 'feature12', 'feature15', 'feature17', 'feature18', 'feature22', 'feature23']
    gas2_features_to_remove = ['feature2', 'feature3', 'feature4', 'feature15', 'feature17', 'feature21', 'feature22', 'feature0', 'feature8', 'feature10', 'feature12', 'feature19', 'feature23', 'feature24']

    predictions = pd.DataFrame(index=df.index, columns=["target0", "target1"])
    df = df.drop(columns=["target0", "target1"])

    for feature4 in ['gas1', 'gas2']:
        for target in ['target0', 'target1']:
            subset = df[df['feature4'] == feature4]
            model = get_model(feature4, target)
            if model is not None:
                if feature4 == "gas1":
                    subset = subset.drop(gas1_features_to_remove, axis=1)
                elif feature4 == "gas2":
                    subset = subset.drop(gas2_features_to_remove, axis=1)
                predictions.loc[subset.index, target] = model.predict(subset)

    predictions = predictions.reindex(df.index)  # Восстановление исходного порядка индексов
    if not np.issubdtype(predictions.dtypes, np.number):
        # Приведение неправильных типов данных к числовому типу данных
        predictions = predictions.astype(float)

    return predictions


In [20]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

df_test = pd.read_parquet("/kaggle/input/sibur23/train.parquet")

predictions = predict(df_test)

# Расчет MAPE для target0
actual_target0 = df_test["target0"]
predicted_target0 = predictions["target0"]
mape_target0 = np.mean(np.abs((actual_target0 - predicted_target0) / actual_target0)) * 100

# Расчет MAPE для target1
actual_target1 = df_test["target1"]
predicted_target1 = predictions["target1"]
mape_target1 = np.mean(np.abs((actual_target1 - predicted_target1) / actual_target1)) * 100

# Расчет общего MAPE
mape_total = (mape_target0 + mape_target1) / 2

# Вывод общего MAPE с округлением до 2 знаков после запятой
print("Total MAPE: {:.3f}%".format(mape_total))

Total MAPE: 0.405%


In [19]:
predictions

Unnamed: 0,target0,target1
0,27.019813,6.431210
1,83.999288,76.222840
2,22.086950,3.034993
3,30.334209,8.915871
4,71.134920,50.776548
...,...,...
153412,26.598321,4.353239
153413,19.953670,7.124447
153414,28.546020,5.591571
153415,72.052726,51.094964


In [4]:

# Проверка наличия отсутствующих значений в DataFrame
if predictions.isnull().values.any():
    print("DataFrame содержит отсутствующие значения (NaN)")


In [15]:
predictions.dtypes

target0    float64
target1    float64
dtype: object

In [18]:
if not np.issubdtype(predictions.dtypes, np.number):
    print("DataFrame содержит неправильный тип данных")


DataFrame содержит неправильный тип данных


In [11]:
for column in predictions.columns:
    print(f"Column: {column}")
    print(f"Data type: {predictions[column].dtype}")
    print(f"Unique values: {predictions[column].unique()}")
    print("-----------------------")


Column: target0
Data type: float64
Unique values: [27.0198126  83.99928771 22.08694995 ... 28.5460197  72.05272567
 25.15045131]
-----------------------
Column: target1
Data type: float64
Unique values: [ 6.43121031 76.22284039  3.03499285 ...  5.59157121 51.09496444
  3.55900498]
-----------------------


In [8]:

if not np.issubdtype(predictions.dtypes, np.number):
    # Приведение неправильных типов данных к числовому типу данных
    predictions = predictions.astype(float)


In [21]:
# Проверка совместимости с np.isfinite()
if not np.isfinite(predictions.values).all():
    print("DataFrame содержит неправильные значения")

# Если никаких сообщений не было выведено, DataFrame совместим с np.isfinite()
