In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings

import funcs
from scipy.optimize import minimize
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# from ydata_profiling import ProfileReport
from catboost import CatBoostRegressor
# from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import cross_validate
# from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import (
#     ExtraTreesRegressor,
#     RandomForestRegressor,
#     VotingRegressor,
# )
# from sklearn.impute import KNNImputer
# from sklearn.linear_model import ElasticNet
# from sklearn.preprocessing import MaxAbsScaler
# from xgboost import XGBRegressor
from scipy.optimize import curve_fit

%matplotlib inline
warnings.filterwarnings("ignore", category=FutureWarning)

- Id - идентификатор состава.
- Cement - количество цемента в смеси.
- BlastFurnaceSlag - количество доменного шлака в смеси.
- FlyAsh - количество зольной пыли в смеси.
- Water - количество воды в смеси.
- Superplasticizer - количество суперпластификатора в смеси.
- CoarseAggregate - количество заполнителя с грубой фракцией в смеси.
- FineAggregate - количество заполнителя с мелкой фракцией в смеси.
- Age - время высыхания в днях.
- Strength - прочность получившегося бетона (Целевая переменная)


In [63]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
df_train = df_train.drop(["Id"], axis=1)
df_test = df_test.drop(["Id"], axis=1)
df_train.rename(
    columns={
        "Strength": "target",
        "Superplasticizer": "SP",
        "Fly Ash": "FA",
        "Water": "W",
        "Coarse Aggregate": "Ag",
        "Fine Aggregate": "Af",
        "Blast Furnace Slag": "BFS",
        "Cement": "C",
        "Age": "t",
    },
    inplace=True,
)
df_train.columns = df_train.columns.str.replace(" ", "")
df_test.columns = df_test.columns.str.replace(" ", "")
# df_train.rename(columns={'Strength': 'target'}, inplace=True)
df_train.info()
# df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   C       800 non-null    float64
 1   BFS     800 non-null    float64
 2   FA      800 non-null    float64
 3   W       800 non-null    float64
 4   SP      800 non-null    float64
 5   Ag      800 non-null    float64
 6   Af      800 non-null    float64
 7   t       800 non-null    int64  
 8   target  800 non-null    float64
dtypes: float64(8), int64(1)
memory usage: 56.4 KB


In [76]:
# Признаки и целевая переменная
X = df_1.drop(columns=["target"])
y = df_1["target"]


# Модифицированная эмпирическая модель
def strength_model(params, X):
    k1, k2, k3, k4, k5, k6, alpha = params
    C = X["C"]
    FA = X["FA"]
    BFS = X["BFS"]
    W = X["W"]
    SP = X["SP"]
    A_g = X["Ag"]
    A_f = X["Af"]
    t = X["t"]

    # Модифицированная модель
    strength = (
        k1 * ((C + k5 * BFS + k6 * FA) / (W + k2 * SP)) / (1 + k3 * A_g + k4 * A_f)
    )
    return strength * (t / t + alpha) * np.log1p(t)


# Функция для минимизации ошибки (MSE) с L2 регуляризацией
def objective_function(params, X, y, lambda_reg=0.01):
    predicted_strength = strength_model(params, X)
    mse = np.mean((predicted_strength - y) ** 2)  # MSE
    # Регуляризация L2 для предотвращения переобучения
    reg_term = lambda_reg * np.sum(np.array(params) ** 2)
    return mse + reg_term


# Начальные значения коэффициентов
initial_params = [
    0.8,
    0.2,
    0.1,
    0.05,
    0.6,
    0.4,
    50,
]  # Пример начальных значений для k1, k2, k3, k4, alpha

# Оптимизация коэффициентов с использованием градиентного спуска
result = minimize(objective_function, initial_params, args=(X, y), method="BFGS")

# Полученные коэффициенты
k1, k2, k3, k4, k5, k6, alpha = result.x
print(
    f"""Оптимизированные коэффициенты:
    k1 = {k1:.2f}, k2 = {k2:.2f}, k3 = {k3:.2f}, 
    k4 = {k4:.2f}, k5 = {k5:.2f}, k6 = {k6:.2f}, 
    alpha = {alpha:.2f}"""
)

# Прогнозирование прочности на основе оптимизированных коэффициентов
df_train["Strength_pred"] = strength_model(result.x, df_train)

# Оценка качества модели
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(df_train["target"], df_train["Strength_pred"]))
print(f"RMSE: {rmse}")

Оптимизированные коэффициенты:
    k1 = 3.45, k2 = -0.98, k3 = -0.00, 
    k4 = 0.00, k5 = 0.31, k6 = 0.20, 
    alpha = 2.99
RMSE: 13.901897946983865


In [78]:
df_1.head(15)

Unnamed: 0,C,BFS,FA,W,SP,Ag,Af,t,target,Strength_pred
0,376.0,0.0,0.0,214.6,0.0,1003.5,762.4,3,16.28,11.16629
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,3,13.82,12.223631
6,192.0,288.0,0.0,192.0,0.0,929.8,716.1,3,12.79,17.069958
8,236.0,0.0,0.0,194.0,0.0,968.0,885.0,3,6.47,7.989612
17,183.9,122.6,0.0,203.5,0.0,959.2,800.0,3,4.9,10.261542
19,198.6,132.4,0.0,192.0,0.0,978.4,825.5,3,9.13,11.863842
22,173.0,116.0,0.0,192.0,0.0,946.8,856.8,3,6.94,10.406818
32,393.0,0.0,0.0,192.0,0.0,940.6,785.6,3,19.2,13.014405
35,500.0,0.0,0.0,200.0,0.0,1125.0,613.0,1,12.64,7.771804
64,139.6,209.4,0.0,192.0,0.0,1047.0,806.9,3,8.06,12.945838
