In [91]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")

In [92]:
df = pd.read_csv("../../data/frames_errors.csv", header=None)
df.columns = [
    "block_id",
    "frame_idx",
    "E_mu_Z",
    "E_mu_phys_est",
    "E_mu_X",
    "E_nu1_X",
    "E_nu2_X",
    "E_nu1_Z",
    "E_nu2_Z",
    "N_mu_X",
    "M_mu_XX",
    "M_mu_XZ",
    "M_mu_X",
    "N_mu_Z",
    "M_mu_ZZ",
    "M_mu_Z",
    "N_nu1_X",
    "M_nu1_XX",
    "M_nu1_XZ",
    "M_nu1_X",
    "N_nu1_Z",
    "M_nu1_ZZ",
    "M_nu1_Z",
    "N_nu2_X",
    "M_nu2_XX",
    "M_nu2_XZ",
    "M_nu2_X",
    "N_nu2_Z",
    "M_nu2_ZZ",
    "M_nu2_Z",
    "nTot",
    "bayesImVoltage",
    "opticalPower",
    "polarizerVoltages[0]",
    "polarizerVoltages[1]",
    "polarizerVoltages[2]",
    "polarizerVoltages[3]",
    "temp_1",
    "biasVoltage_1",
    "temp_2",
    "biasVoltage_2",
    "synErr",
    "N_EC_rounds",
    "maintenance_flag",
    "estimator_name",
    "f_EC",
    "E_mu_Z_est",
    "R",
    "s",
    "p",
]

In [93]:
id_features = [
    "block_id",
    "frame_idx",
]

phys_gt_features = [
    "E_mu_Z",
    "E_mu_X",
    "E_nu1_X",
    "E_nu2_X",
    "E_nu1_Z",
    "E_nu2_Z",
    "N_mu_X",
    "M_mu_XX",
    "M_mu_XZ",
    "M_mu_X",
    "N_mu_Z",
    "M_mu_ZZ",
    "M_mu_Z",
    "N_nu1_X",
    "M_nu1_XX",
    "M_nu1_XZ",
    "M_nu1_X",
    "N_nu1_Z",
    "M_nu1_ZZ",
    "M_nu1_Z",
    "N_nu2_X",
    "M_nu2_XX",
    "M_nu2_XZ",
    "M_nu2_X",
    "N_nu2_Z",
    "M_nu2_ZZ",
    "M_nu2_Z",
]

phys_features = [
    "bayesImVoltage",
    "opticalPower",
    "polarizerVoltages[0]",
    "polarizerVoltages[1]",
    "polarizerVoltages[2]",
    "polarizerVoltages[3]",
    "temp_1",
    "biasVoltage_1",
    "temp_2",
    "biasVoltage_2",
]

est_features = [
    # "E_mu_phys_est",
    "E_mu_Z_est",
    "R",
    "s",
    "p",
]

proxy_features = [
    "N_EC_rounds",
    # "f_EC",
]

df = df[id_features + phys_gt_features + phys_features + est_features + proxy_features]

In [94]:
# df = df.dropna(subset=["f_EC"])

In [95]:
missing_counts = df.isna().sum()
columns_with_missing = missing_counts[missing_counts > 0]
print("Столбцы с пропусками и их количество:")
print(columns_with_missing)
timestamp_counts = df.groupby("block_id")["frame_idx"].nunique()
print("Количество фреймов / Количество рядов")
print(timestamp_counts.value_counts())
print("Shape:", df.shape)

Столбцы с пропусками и их количество:
Series([], dtype: int64)
Количество фреймов / Количество рядов
frame_idx
399    569
400    251
398      2
390      1
Name: count, dtype: int64
Shape: (328617, 44)


In [96]:
def create_features(df, features, lag, window):
    df = df.copy()
    created_features = []

    for feature in features:
        for l in lag:
            df[f"{feature}_lag_{l}"] = df.groupby("block_id")[feature].shift(l)
            created_features.append(f"{feature}_lag_{l}")
        for w in window:
            df[f"{feature}_roll_mean_{w}"] = (
                df.groupby("block_id")[feature].shift(1).rolling(window=w).mean()
            )
            created_features.append(f"{feature}_roll_mean_{w}")
            df[f"{feature}_roll_std_{w}"] = (
                df.groupby("block_id")[feature].shift(1).rolling(window=w).std()
            )
            created_features.append(f"{feature}_roll_std_{w}")

    df = df.bfill().ffill()
    return df, created_features

In [73]:
features_to_create = phys_gt_features + phys_features + est_features
df, created_features = create_features(df, features_to_create, lag=[2, 5, 10, 20], window=[5, 10])
print("Созданные признаки:", created_features)

Созданные признаки: ['E_mu_Z_lag_2', 'E_mu_Z_lag_5', 'E_mu_Z_lag_10', 'E_mu_Z_lag_20', 'E_mu_Z_roll_mean_5', 'E_mu_Z_roll_std_5', 'E_mu_Z_roll_mean_10', 'E_mu_Z_roll_std_10', 'E_mu_X_lag_2', 'E_mu_X_lag_5', 'E_mu_X_lag_10', 'E_mu_X_lag_20', 'E_mu_X_roll_mean_5', 'E_mu_X_roll_std_5', 'E_mu_X_roll_mean_10', 'E_mu_X_roll_std_10', 'E_nu1_X_lag_2', 'E_nu1_X_lag_5', 'E_nu1_X_lag_10', 'E_nu1_X_lag_20', 'E_nu1_X_roll_mean_5', 'E_nu1_X_roll_std_5', 'E_nu1_X_roll_mean_10', 'E_nu1_X_roll_std_10', 'E_nu2_X_lag_2', 'E_nu2_X_lag_5', 'E_nu2_X_lag_10', 'E_nu2_X_lag_20', 'E_nu2_X_roll_mean_5', 'E_nu2_X_roll_std_5', 'E_nu2_X_roll_mean_10', 'E_nu2_X_roll_std_10', 'E_nu1_Z_lag_2', 'E_nu1_Z_lag_5', 'E_nu1_Z_lag_10', 'E_nu1_Z_lag_20', 'E_nu1_Z_roll_mean_5', 'E_nu1_Z_roll_std_5', 'E_nu1_Z_roll_mean_10', 'E_nu1_Z_roll_std_10', 'E_nu2_Z_lag_2', 'E_nu2_Z_lag_5', 'E_nu2_Z_lag_10', 'E_nu2_Z_lag_20', 'E_nu2_Z_roll_mean_5', 'E_nu2_Z_roll_std_5', 'E_nu2_Z_roll_mean_10', 'E_nu2_Z_roll_std_10', 'N_mu_X_lag_2', 'N_mu_X

In [97]:
# Test (Val)
start_idx = df[(df["block_id"] == 1489460492) & (df["frame_idx"] == 99)].index[0]
end_idx = df[(df["block_id"] == 1840064900) & (df["frame_idx"] == 101)].index[0]

test_df = df.loc[start_idx:end_idx].copy()
assert len(test_df) == 2000, "Test (Val) set size is not 2000 rows"

# Train
all_block_ids = df["block_id"].unique()
train_blocks = [bid for bid in all_block_ids if bid not in test_df["block_id"].values]
train_df = df[df["block_id"].isin(train_blocks)]
train_df = train_df[(train_df["N_EC_rounds"] == 1)]

print(f"Train: {len(train_df)}")
print(f"Test: {len(test_df)}")


Train: 106372
Test: 2000


# Catboost E_mu_Z

In [75]:
features_in = created_features + phys_features

X_train = train_df[features_in]
y_train = train_df["E_mu_Z_est"]
X_test = test_df[features_in]
y_test = test_df["E_mu_Z_est"]

In [76]:
model_E_mu_Z = CatBoostRegressor(
    iterations=5000,
    learning_rate=0.05,
    loss_function="RMSE",
    verbose=100,
    random_state=42,
)

model_E_mu_Z.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
)

0:	learn: 0.0125286	test: 0.0033744	best: 0.0033744 (0)	total: 21.8ms	remaining: 1m 48s
100:	learn: 0.0007895	test: 0.0004905	best: 0.0004905 (100)	total: 1.97s	remaining: 1m 35s
200:	learn: 0.0006264	test: 0.0004116	best: 0.0004115 (199)	total: 3.78s	remaining: 1m 30s
300:	learn: 0.0005483	test: 0.0003865	best: 0.0003865 (300)	total: 5.68s	remaining: 1m 28s
400:	learn: 0.0004999	test: 0.0003683	best: 0.0003683 (400)	total: 7.66s	remaining: 1m 27s
500:	learn: 0.0004639	test: 0.0003555	best: 0.0003555 (500)	total: 9.77s	remaining: 1m 27s
600:	learn: 0.0004344	test: 0.0003427	best: 0.0003425 (596)	total: 11.7s	remaining: 1m 25s
700:	learn: 0.0004097	test: 0.0003304	best: 0.0003304 (700)	total: 13.8s	remaining: 1m 24s
800:	learn: 0.0003886	test: 0.0003183	best: 0.0003183 (800)	total: 15.8s	remaining: 1m 22s
900:	learn: 0.0003714	test: 0.0003075	best: 0.0003075 (899)	total: 17.9s	remaining: 1m 21s
1000:	learn: 0.0003568	test: 0.0002986	best: 0.0002986 (1000)	total: 20s	remaining: 1m 20s
11

<catboost.core.CatBoostRegressor at 0x7d209a6b8bc0>

In [77]:
E_mu_Z_pred = model_E_mu_Z.predict(X_test)

# CatBoost R

In [78]:
features_in = created_features + phys_features + ["E_mu_Z_est"]

X_train = train_df[features_in]
y_train = train_df["R"]
X_test = test_df[features_in]
y_test = test_df["R"]

In [79]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
print(dict(zip(le.classes_, le.transform(le.classes_))))

{0.5: 0, 0.55: 1, 0.6: 2, 0.65: 3, 0.7: 4, 0.75: 5, 0.8: 6, 0.85: 7}


In [80]:
model_R = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    verbose=100,
    random_state=42,
)

model_R.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
)

0:	learn: 0.9922254	test: 1.0000000	best: 1.0000000 (0)	total: 201ms	remaining: 1m 40s
100:	learn: 0.9980070	test: 1.0000000	best: 1.0000000 (0)	total: 13.9s	remaining: 54.7s
200:	learn: 0.9983830	test: 1.0000000	best: 1.0000000 (0)	total: 25.9s	remaining: 38.6s
300:	learn: 0.9985993	test: 1.0000000	best: 1.0000000 (0)	total: 37.6s	remaining: 24.8s
400:	learn: 0.9987967	test: 1.0000000	best: 1.0000000 (0)	total: 48.6s	remaining: 12s
499:	learn: 0.9989189	test: 1.0000000	best: 1.0000000 (0)	total: 59.3s	remaining: 0us

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.


<catboost.core.CatBoostClassifier at 0x7d209a6bc590>

In [81]:
R_pred_labels = model_R.predict(X_test).flatten()

R_pred = le.inverse_transform(R_pred_labels)

# CatBoost s p

In [82]:
features_in = phys_features + created_features + ["E_mu_Z_est", "R"]

X_train = train_df[features_in]
y_train = train_df["s"]
X_test = test_df[features_in]
y_test = test_df["s"]

In [87]:
model_s_p = CatBoostRegressor(
    iterations=10000,
    learning_rate=0.04,
    loss_function="RMSE",
    verbose=100,
    random_state=42,
)

model_s_p.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
)

0:	learn: 567.3076935	test: 597.1882999	best: 597.1882999 (0)	total: 32.8ms	remaining: 5m 28s
100:	learn: 66.5062908	test: 24.9618496	best: 24.9618496 (100)	total: 2.45s	remaining: 4m
200:	learn: 39.3805438	test: 12.3436447	best: 12.3436447 (200)	total: 4.46s	remaining: 3m 37s
300:	learn: 33.8141622	test: 10.4049177	best: 10.4046594 (299)	total: 6.38s	remaining: 3m 25s
400:	learn: 30.4133167	test: 8.8856792	best: 8.8851536 (397)	total: 8.3s	remaining: 3m 18s
500:	learn: 27.7442443	test: 8.0539311	best: 8.0539311 (500)	total: 10.3s	remaining: 3m 14s
600:	learn: 25.4336661	test: 7.4554134	best: 7.4554134 (600)	total: 12.7s	remaining: 3m 18s
700:	learn: 23.8988138	test: 6.7984721	best: 6.7983600 (694)	total: 15.2s	remaining: 3m 21s
800:	learn: 22.7076845	test: 6.4307647	best: 6.4307153 (798)	total: 17.2s	remaining: 3m 17s
900:	learn: 21.5307321	test: 6.1641493	best: 6.1633803 (897)	total: 18.9s	remaining: 3m 11s
1000:	learn: 20.5560124	test: 5.9263910	best: 5.9256727 (990)	total: 20.8s	re

<catboost.core.CatBoostRegressor at 0x7d20b1d10ec0>

In [88]:
s_pred_float = model_s_p.predict(X_test)
s_pred_int = np.round(s_pred_float).astype(int)

In [89]:
p_pred_int = 4800 - s_pred_int

In [90]:
pd.DataFrame(data={"E_mu_Z": E_mu_Z_pred, "R": R_pred, "s": s_pred_int, "p": p_pred_int}).to_csv(
    "submission_3.csv", header=False, index=False
)