# Импорт библиотек

In [2]:
import warnings
import pandas as pd
import matplotlib.pyplot as plt

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and performance
# warnings raised by the libraries used below are hidden as well.
warnings.filterwarnings("ignore")

# Подготовка данных

In [3]:
# Names for the 50 columns of the header-less CSV, listed in file order.
_csv_columns = (
    ["block_id", "frame_idx", "E_mu_Z", "E_mu_phys_est"]
    + ["E_mu_X", "E_nu1_X", "E_nu2_X", "E_nu1_Z", "E_nu2_Z"]
    # Seven N_*/M_* columns for each of the states mu, nu1, nu2.
    + [
        template.format(state)
        for state in ("mu", "nu1", "nu2")
        for template in ("N_{}_X", "M_{}_XX", "M_{}_XZ", "M_{}_X", "N_{}_Z", "M_{}_ZZ", "M_{}_Z")
    ]
    + ["nTot", "bayesImVoltage", "opticalPower"]
    + [f"polarizerVoltages[{channel}]" for channel in range(4)]
    + ["temp_1", "biasVoltage_1", "temp_2", "biasVoltage_2"]
    + ["synErr", "N_EC_rounds", "maintenance_flag", "estimator_name"]
    + ["f_EC", "E_mu_Z_est", "R", "s", "p"]
)

# The file carries no header row, so the names are attached after loading;
# the assignment fails if the number of names differs from the column count.
df = pd.read_csv("../../data/frames_errors.csv", header=None)
df.columns = _csv_columns

In [4]:
# Per-column NaN tally; only the columns that actually contain gaps are shown.
missing_counts = df.isnull().sum()
columns_with_missing = missing_counts.loc[missing_counts.gt(0)]
print("Столбцы с пропусками и их количество:", columns_with_missing, sep="\n")

# Distinct frame indices per block, then how many blocks share each count.
timestamp_counts = df.groupby("block_id")["frame_idx"].nunique()
print("Количество фреймов / Количество рядов", timestamp_counts.value_counts(), sep="\n")
print(f"Shape: {df.shape}")

Столбцы с пропусками и их количество:
E_mu_phys_est    448
f_EC             131
dtype: int64
Количество фреймов / Количество рядов
frame_idx
399    569
400    251
398      2
390      1
Name: count, dtype: int64
Shape: (328617, 50)


In [5]:
# Keep only the rows that have an f_EC value.
df = df[df["f_EC"].notna()]

In [6]:
# Test (Val): the contiguous run of rows from frame 99 of block 1489460492
# through frame 101 of block 1840064900. `.loc` slices by index label and
# includes both end points. `.index[0]` raises IndexError if a boundary row
# is missing from the data.
start_idx = df[(df["block_id"] == 1489460492) & (df["frame_idx"] == 99)].index[0]
end_idx = df[(df["block_id"] == 1840064900) & (df["frame_idx"] == 101)].index[0]

test_df = df.loc[start_idx:end_idx].copy()
assert len(test_df) == 2000, "Test (Val) set size is not 2000 rows"

# Train: every block that contributes no row to the test slice, so a block is
# never split between the two sets.
# A set gives constant-time membership checks instead of scanning the whole
# test `block_id` array once per block id.
test_block_ids = set(test_df["block_id"])
all_block_ids = df["block_id"].unique()
train_blocks = [bid for bid in all_block_ids if bid not in test_block_ids]
train_df = df[df["block_id"].isin(train_blocks)]

print(f"Train: {len(train_df)}")
print(f"Test: {len(test_df)}")


Train: 326090
Test: 2000


In [7]:
# Hardware / environment readings; these are also used as current-frame inputs.
phys_features = (
    ["bayesImVoltage", "opticalPower"]
    + [f"polarizerVoltages[{channel}]" for channel in range(4)]
    + ["temp_1", "biasVoltage_1", "temp_2", "biasVoltage_2"]
)

# Columns for which lagged copies are generated: the readings above, the E_*
# columns, and the seven N_*/M_* columns of each state (mu, nu1, nu2).
features_to_lag = (
    list(phys_features)
    + ["E_mu_Z_est", "E_mu_Z", "E_mu_X", "E_nu1_X", "E_nu2_X", "E_nu1_Z", "E_nu2_Z"]
    + [
        template.format(state)
        for state in ("mu", "nu1", "nu2")
        for template in ("N_{}_X", "M_{}_XX", "M_{}_XZ", "M_{}_X", "N_{}_Z", "M_{}_ZZ", "M_{}_Z")
    ]
)

In [8]:
def create_lag_features(df, features_to_lag, lags):
    """Append per-block lagged copies of the selected columns.

    For every lag ``k`` in ``1..lags`` and every name in ``features_to_lag`` a
    column ``"{feature}_lag{k}"`` is added that holds the value of ``feature``
    ``k`` rows earlier within the same ``block_id`` group. Row order inside a
    block is taken as-is; the frame is not sorted.

    All lag columns are built with one grouped shift per lag and attached with
    a single ``pd.concat``. Inserting them one at a time fragments the frame
    and triggers pandas' ``PerformanceWarning`` once many columns are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``block_id`` and every column named in
        ``features_to_lag``. The input frame is not modified.
    features_to_lag : iterable of str
        Columns to lag. Repeated names are used once.
    lags : int
        Largest lag to generate. Values below 1 add no columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lag columns appended, ordered by lag and then by
        feature. Every NaN in the frame is replaced by 0: the leading rows of
        each block in the lag columns, and also any NaN already present in the
        original columns.

    Raises
    ------
    KeyError
        If ``block_id`` or a requested feature is missing while at least one
        lag column has to be built.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    features = list(dict.fromkeys(features_to_lag))
    lag_range = range(1, lags + 1)
    lag_names = [f"{feature}_lag{lag}" for lag in lag_range for feature in features]

    # Drop lag columns left over from an earlier call so that running the
    # function again replaces them instead of producing duplicate labels.
    stale = [name for name in lag_names if name in df.columns]
    result = df.drop(columns=stale)

    if lag_names:
        grouped = df.groupby("block_id")[features]
        shifted = [grouped.shift(lag).add_suffix(f"_lag{lag}") for lag in lag_range]
        result = pd.concat([result, *shifted], axis=1)

    return result.fillna(0)

In [None]:
# Lags are computed on the full training blocks first; the row filter is
# applied afterwards, so the lag values of kept rows can come from rows that
# are filtered out here.
# Note: this cell rebinds train_df, so running it a second time works on the
# already lagged and filtered frame.
train_df = create_lag_features(train_df, features_to_lag, lags=30)
_train_keep_mask = train_df["N_EC_rounds"].le(2) & train_df["f_EC"].le(2)
train_df = train_df[_train_keep_mask]

# The Test (Val) frame gets the same lag columns but is not filtered.
test_df_lag = create_lag_features(test_df, features_to_lag, lags=30)

# CatBoost E_mu_Z

In [12]:
import numpy as np
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [13]:
# Columns removed from the feature matrix of the E_mu_Z_est regressor.
# Only the un-lagged names are listed, so the generated *_lag columns and the
# phys_features readings remain as model inputs.
features_to_drop = (
    # Current-frame E_* columns, including the target E_mu_Z_est.
    ["E_mu_Z_est", "E_mu_Z", "E_mu_X", "E_nu1_X", "E_nu2_X", "E_nu1_Z", "E_nu2_Z"]
    # Current-frame N_*/M_* columns of each state.
    + [
        template.format(state)
        for state in ("mu", "nu1", "nu2")
        for template in ("N_{}_X", "M_{}_XX", "M_{}_XZ", "M_{}_X", "N_{}_Z", "M_{}_ZZ", "M_{}_Z")
    ]
    # Identifiers and the remaining non-feature columns.
    + ["block_id", "frame_idx", "estimator_name", "maintenance_flag"]
    + ["synErr", "N_EC_rounds", "f_EC", "R", "s", "p", "nTot", "E_mu_phys_est"]
)

In [14]:
# Feature matrix / target pairs for the E_mu_Z_est regressor.
_target_name = "E_mu_Z_est"
X_train, y_train = train_df.drop(features_to_drop, axis="columns"), train_df[_target_name]
X_test, y_test = test_df_lag.drop(features_to_drop, axis="columns"), test_df_lag[_target_name]

In [15]:
# Settings of the E_mu_Z_est regressor: RMSE objective, fixed seed, and a
# progress line every 100 iterations.
_e_mu_z_params = {
    "iterations": 1000,
    "learning_rate": 0.05,
    "loss_function": "RMSE",
    "verbose": 100,
    "random_state": 42,
}
model_E_mu_Z = CatBoostRegressor(**_e_mu_z_params)

In [16]:
# Train on the training frame. The Test (Val) frame is supplied as eval_set,
# so the log reports the metric on it next to the training metric.
_e_mu_z_eval = (X_test, y_test)
model_E_mu_Z.fit(X_train, y_train, eval_set=_e_mu_z_eval)

0:	learn: 0.0136429	test: 0.0030531	best: 0.0030531 (0)	total: 146ms	remaining: 2m 25s
100:	learn: 0.0009949	test: 0.0001966	best: 0.0001966 (100)	total: 7.16s	remaining: 1m 3s
200:	learn: 0.0008538	test: 0.0001563	best: 0.0001563 (200)	total: 14s	remaining: 55.7s
300:	learn: 0.0007320	test: 0.0001465	best: 0.0001445 (275)	total: 21.1s	remaining: 48.9s
400:	learn: 0.0006389	test: 0.0001420	best: 0.0001420 (400)	total: 27.8s	remaining: 41.5s
500:	learn: 0.0005691	test: 0.0001398	best: 0.0001398 (500)	total: 34.7s	remaining: 34.6s
600:	learn: 0.0005147	test: 0.0001420	best: 0.0001398 (500)	total: 42s	remaining: 27.9s
700:	learn: 0.0004707	test: 0.0001411	best: 0.0001398 (500)	total: 49s	remaining: 20.9s
800:	learn: 0.0004323	test: 0.0001401	best: 0.0001398 (500)	total: 56.2s	remaining: 14s
900:	learn: 0.0004024	test: 0.0001387	best: 0.0001387 (897)	total: 1m 3s	remaining: 7.02s
999:	learn: 0.0003741	test: 0.0001394	best: 0.0001379 (961)	total: 1m 11s	remaining: 0us

bestTest = 0.00013790

<catboost.core.CatBoostRegressor at 0x718fb0b713a0>

In [17]:
# Predicted E_mu_Z_est for the Test (Val) rows; this array becomes the
# "E_mu_Z" column of the submission file.
# X_test is rebound by the later model sections, so this line has to run
# while X_test is still the feature matrix built for this regressor.
E_mu_Z_pred = model_E_mu_Z.predict(X_test)

# CatBoost R

In [19]:
# Columns removed from the feature matrix of the R classifier.
# Unlike the list used for the regressors, E_mu_Z_est is NOT dropped here, so
# its current-frame value is an input of this model; the target R is dropped.
features_to_drop = (
    ["E_mu_Z", "E_mu_X", "E_nu1_X", "E_nu2_X", "E_nu1_Z", "E_nu2_Z"]
    # Current-frame N_*/M_* columns of each state.
    + [
        template.format(state)
        for state in ("mu", "nu1", "nu2")
        for template in ("N_{}_X", "M_{}_XX", "M_{}_XZ", "M_{}_X", "N_{}_Z", "M_{}_ZZ", "M_{}_Z")
    ]
    # Identifiers and the remaining non-feature columns.
    + ["block_id", "frame_idx", "estimator_name", "maintenance_flag"]
    + ["synErr", "N_EC_rounds", "f_EC", "R", "s", "p", "nTot", "E_mu_phys_est"]
)

In [20]:
# Feature matrix / target pairs for the R classifier.
_target_name = "R"
X_train, y_train = train_df.drop(features_to_drop, axis="columns"), train_df[_target_name]
X_test, y_test = test_df_lag.drop(features_to_drop, axis="columns"), test_df_lag[_target_name]

In [21]:
# Map the R target values to consecutive integer class ids for the classifier.
# The encoder is fitted on the training targets only and then applied to the
# Test (Val) targets.
# NOTE(review): LabelEncoder.transform is expected to raise on a value that was
# not seen during fitting — confirm every test R value occurs in train.
# This cell rebinds y_train / y_test, so running it twice encodes the already
# encoded ids again.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
# Show the original value -> class id mapping.
print(dict(zip(le.classes_, le.transform(le.classes_))))

{0.5: 0, 0.55: 1, 0.6: 2, 0.65: 3, 0.7: 4, 0.75: 5, 0.8: 6, 0.85: 7}


In [22]:
# Settings of the R classifier: multi-class objective, accuracy reported as
# the evaluation metric, fixed seed, and a progress line every 100 iterations.
_r_params = {
    "iterations": 200,
    "learning_rate": 0.05,
    "loss_function": "MultiClass",
    "eval_metric": "Accuracy",
    "verbose": 100,
    "random_state": 42,
}
model_R = CatBoostClassifier(**_r_params)

# The Test (Val) frame is supplied as eval_set, so the log reports accuracy on
# it next to the training accuracy.
_r_eval = (X_test, y_test)
model_R.fit(X_train, y_train, eval_set=_r_eval)

0:	learn: 0.9929610	test: 0.9965000	best: 0.9965000 (0)	total: 1.03s	remaining: 3m 24s
100:	learn: 0.9956234	test: 0.9955000	best: 0.9980000 (80)	total: 1m 3s	remaining: 1m 2s
199:	learn: 0.9958719	test: 0.9955000	best: 0.9980000 (80)	total: 1m 54s	remaining: 0us

bestTest = 0.998
bestIteration = 80

Shrink model to first 81 iterations.


<catboost.core.CatBoostClassifier at 0x718faedb9ca0>

In [24]:
# Predicted class ids as a 1-D array, then mapped back to the original R
# values through the fitted encoder.
_raw_r_labels = model_R.predict(X_test)
R_pred_labels = _raw_r_labels.reshape(-1)
R_pred = le.inverse_transform(R_pred_labels)

# CatBoost s p

In [25]:
# Columns removed from the feature matrix of the s regressor.
# R is NOT dropped here, so its current-frame value is an input of this model;
# the target s and its companion p are dropped.
features_to_drop = (
    ["E_mu_Z_est", "E_mu_Z", "E_mu_X", "E_nu1_X", "E_nu2_X", "E_nu1_Z", "E_nu2_Z"]
    # Current-frame N_*/M_* columns of each state.
    + [
        template.format(state)
        for state in ("mu", "nu1", "nu2")
        for template in ("N_{}_X", "M_{}_XX", "M_{}_XZ", "M_{}_X", "N_{}_Z", "M_{}_ZZ", "M_{}_Z")
    ]
    # Identifiers and the remaining non-feature columns.
    + ["block_id", "frame_idx", "estimator_name", "maintenance_flag"]
    + ["synErr", "N_EC_rounds", "f_EC", "s", "p", "nTot", "E_mu_phys_est"]
)

In [26]:
# Feature matrix / target pairs for the s regressor.
_target_name = "s"
X_train, y_train = train_df.drop(features_to_drop, axis="columns"), train_df[_target_name]
X_test, y_test = test_df_lag.drop(features_to_drop, axis="columns"), test_df_lag[_target_name]

In [27]:
# Settings of the s regressor: RMSE objective, fixed seed, and a progress line
# every 100 iterations.
_s_params = {
    "iterations": 1000,
    "learning_rate": 0.05,
    "loss_function": "RMSE",
    "verbose": 100,
    "random_state": 42,
}
model_s_p = CatBoostRegressor(**_s_params)

In [28]:
# Train on the training frame. The Test (Val) frame is supplied as eval_set,
# so the log reports the metric on it next to the training metric.
_s_eval = (X_test, y_test)
model_s_p.fit(X_train, y_train, eval_set=_s_eval)

0:	learn: 558.9339684	test: 591.7890506	best: 591.7890506 (0)	total: 101ms	remaining: 1m 41s
100:	learn: 64.4777870	test: 28.4997641	best: 28.4997641 (100)	total: 9.13s	remaining: 1m 21s
200:	learn: 41.2572805	test: 19.7867404	best: 19.7859032 (199)	total: 17s	remaining: 1m 7s
300:	learn: 33.7064493	test: 16.4288174	best: 16.4288174 (300)	total: 24.1s	remaining: 55.9s
400:	learn: 30.3542365	test: 14.6947364	best: 14.6947364 (400)	total: 31s	remaining: 46.4s
500:	learn: 28.0619947	test: 13.8897064	best: 13.8890374 (499)	total: 38.1s	remaining: 37.9s
600:	learn: 26.1203773	test: 13.3781017	best: 13.3759261 (599)	total: 45.5s	remaining: 30.2s
700:	learn: 24.4481130	test: 13.0095027	best: 13.0095019 (699)	total: 53s	remaining: 22.6s
800:	learn: 23.0890416	test: 12.7451333	best: 12.7448984 (799)	total: 1m	remaining: 15s
900:	learn: 21.8665083	test: 12.4420333	best: 12.4420333 (900)	total: 1m 8s	remaining: 7.51s
999:	learn: 20.9274134	test: 12.1262106	best: 12.1201943 (981)	total: 1m 15s	rem

<catboost.core.CatBoostRegressor at 0x718fb0876c60>

In [29]:
# Continuous predictions of s, then rounded to the nearest integer for the
# submission file.
s_pred_float = model_s_p.predict(X_test)
s_pred_int = s_pred_float.round().astype(int)

In [30]:
# p is not predicted by a model of its own; it is taken as the remainder of a
# fixed total after subtracting the rounded s prediction.
# NOTE(review): assumes s + p == 4800 for every frame — TODO confirm against
# the data description.
# s_pred_int is not clipped, so p can come out negative or above 4800 when the
# regressor predicts outside [0, 4800].
p_pred_int = 4800 - s_pred_int

In [31]:
# Collect the four prediction arrays column-wise and write them without a
# header row or index column.
_submission = pd.DataFrame(
    {
        "E_mu_Z": E_mu_Z_pred,
        "R": R_pred,
        "s": s_pred_int,
        "p": p_pred_int,
    }
)
_submission.to_csv("submission.csv", header=False, index=False)