In [28]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from scipy.signal import argrelextrema
import os 
from collections import defaultdict
import math
import catboost
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier

def extract_graph_features(df, start_time, end_time,mon = 0):
    
    df = df[(df['time'] >= start_time) & (df['time'] <= end_time)]
    
    # Смещение графика в начало координат
    x = df["time"].values - df["time"].values[0]
    y = df["pressure"].values - df["pressure"].values[0]
    
    # 1. Статистические признаки
    mean_y = np.mean(y)
    std_y = np.std(y)
    range_y = np.max(y) - np.min(y)
    cv_y = std_y / mean_y if mean_y != 0 else 0
    skewness_y = skew(y)
    kurtosis_y = kurtosis(y)

    # 2. Новые признаки
    height = np.max(y) - np.min(y)  # Высота графика
    diff_first_last = y[-1] - y[0]  # Разница между последним и первым значением
    
    # 3. Формообразующие признаки
    num_maxima = len(argrelextrema(y, np.greater)[0])
    num_minima = len(argrelextrema(y, np.less)[0])
    num_extrema = num_maxima + num_minima
    
    angles = np.arctan(np.diff(y) / np.diff(x))
    mean_angle = np.mean(angles)

    segment_lengths = np.abs(np.diff(y)) / range_y if range_y != 0 else np.zeros(len(y)-1)
    mean_segment_length = np.mean(segment_lengths)

    # 4. Частотные признаки
    fft_values = np.fft.fft(y)
    fft_amplitudes = np.abs(fft_values)
    dominant_freq = np.argmax(fft_amplitudes[1:]) + 1

    energy_y = np.sum(y**2)

    # 5. MSE отклонения от ln(t)
    ln_x = np.log(x + 1e-6)  # добавляем малое число, чтобы избежать log(0)
    mse_ln = np.mean((ln_x - y) ** 2)

    # 6. Статистика по производным (дельты)
    deltas = np.diff(y)  # Разности y[i+1] - y[i]
    
    # Статистика по разностям
    mean_delta = np.mean(deltas)
    median_delta = np.median(deltas)
    max_delta = np.max(deltas)
    min_delta = np.min(deltas)
    var_delta = np.var(deltas)
    std_delta = np.std(deltas)
    
    # Количество смен знака (переход от положительного к отрицательному и наоборот)
    sign_changes = np.sum(np.diff(np.sign(deltas)) != 0)

    # Создаем DataFrame с новыми признаками
    features_df = pd.DataFrame({
        "mean": [mean_y],
        "std": [std_y],
        "range": [range_y],
        "cv": [cv_y],
        "skewness": [skewness_y],
        "kurtosis": [kurtosis_y],
        "num_extrema": [num_extrema],
        "mean_angle": [mean_angle],
        "mean_segment_length": [mean_segment_length],
        "dominant_freq": [dominant_freq],
        "energy": [energy_y],
        "height": [height],
        "diff_first_last": [diff_first_last],
        "mse_ln": [mse_ln],
        
        # Статистика по производным (дельтам)
        "mean_delta": [mean_delta],
        "median_delta": [median_delta],
        "max_delta": [max_delta],
        "min_delta": [min_delta],
        "var_delta": [var_delta],
        "std_delta": [std_delta],
        "sign_changes": [sign_changes],
        "mon" : [mon]
    })
    
    return features_df

In [29]:
def detect_monotonic_segments_ignore_outliers(
    csv_file_path,
    smoothing_window=5,   # окно сглаживания
    outlier_factor=1000.0,   # фактор для определения выбросов (через MAD)
    noise_tolerance=1,  # допустимая "зона шума" при смене тренда
    min_points=20,        # минимальное количество точек в сегменте
    maxval_proc=80,       # процент от максимального давления, который должен быть в сегменте
    cnt_log_cycles=2,     # параметр, определяющий разрывы во времени в лог-шкале
    plot=0,               # флаг построения графика
    show_autor_patterns=0,# флаг отображения "авторской" разметки из столбца "class"
    scatter_plot=0,       # 0 -> все графики через scatter, 1 -> все графики через plot (авторская разметка всё равно scatter)
    graph_mad=0           # 0 -> рисуем df_clean["pressure"], 1 -> рисуем df_clean["pressure_smooth"]
):
    """
    Определение участков монотонного возрастания/убывания в данных, игнорируя «истинные» выбросы.
    
    Параметры:
    -----------
    graph_mad : int
        0 -> При построении графика использовать исходный столбец 'pressure' (но без выбросов).
        1 -> При построении графика использовать сглаженный столбец 'pressure_smooth'.
    """

    # Шорткат-функции для отрисовки основного графика
    def plot_data_main(x, y, label=None, color=None):
        """Рисует основной трек (в зависимости от graph_mad)."""
        if scatter_plot == 0:
            plt.scatter(x, y, label=label, c=color)
        else:
            plt.plot(x, y, label=label, color=color, alpha=0.7)

    # Функция для отрисовки авторской разметки (всегда scatter)
    def plot_data_autor(x, y, label=None, color=None):
        plt.scatter(x, y, label=label, c=color)

    # 1) Считываем и сортируем данные
    df = pd.read_csv(csv_file_path)
    df = df.sort_values(by="time").reset_index(drop=True)

    # 2) Определяем глобальный максимум давления и вычисляем порог для maxval_proc
    max_pressure = df["pressure"].max()
    threshold_pressure = (maxval_proc / 100.0) * max_pressure

    # 3) Сглаживаем давление (rolling median)
    df["pressure_smooth"] = (
        df["pressure"]
        .rolling(window=smoothing_window, center=True, min_periods=1)
        .median()
    )

    # 4) Определяем выбросы по порогу (outlier_factor * MAD)
    mad = np.median(np.abs(df["pressure"] - df["pressure_smooth"]))
    threshold = outlier_factor * mad
    df["is_outlier"] = np.abs(df["pressure"] - df["pressure_smooth"]) > threshold

    # 5) Оставляем только «чистые» точки (не выбросы)
    df_clean = df[~df["is_outlier"]].reset_index(drop=True)
    if len(df_clean) < 2:
        return {1: [], -1: []}

    # Выбираем, какие значения будем рисовать на графике
    # (для поиска сегментов логика всё равно использует 'pressure_smooth',
    #  но визуализация может идти по 'pressure' или 'pressure_smooth')
    pressure_col_for_plot = "pressure_smooth" if graph_mad == 1 else "pressure"

    # Вспомогательная функция: определяет текущий знак изменения и был ли переход
    def get_sign_and_change(old_trend, diff, noise_tolerance):
        """
        Возвращает (new_sign, changed):
          new_sign  = +1, если diff > 0
                      -1, если diff < 0
                       0, если diff == 0
          changed   = True, если при переходе от old_trend к new_sign
                          действительно считаем, что тренд "сменился"
        """
        if diff > 0:
            raw_sign = 1
        elif diff < 0:
            raw_sign = -1
        else:
            raw_sign = 0

        changed = False
        if old_trend is None:
            return raw_sign, False

        if old_trend != 0 and raw_sign != 0:
            if (raw_sign != old_trend) and (abs(diff) > noise_tolerance):
                changed = True

        return raw_sign, changed

    segments = {1: [], -1: []}
    current_trend = None
    current_start_idx = None
    last_valid_index = None
    n = len(df_clean)

    # Порог логарифмического разрыва
    log_threshold = cnt_log_cycles * math.log(10)

    # Функция для «закрытия» текущего сегмента
    def close_segment(end_idx, trend):
        """
        Проверяем, что сегмент достаточно длинный (min_points)
        и что в нём есть хотя бы одна точка >= threshold_pressure.
        """
        seg_length = end_idx - current_start_idx + 1
        if seg_length >= min_points:
            segment_data = df_clean.iloc[current_start_idx : end_idx + 1]
            # Проверка на наличие точки, превышающей порог:
            if (segment_data["pressure"] >= threshold_pressure).any():
                start_time = segment_data.iloc[0]["time"]
                end_time = segment_data.iloc[-1]["time"]
                segments[trend].append((start_time, end_time))

    # Основной цикл по точкам
    for i in range(1, n):
        time_prev = df_clean.loc[i - 1, "time"]
        time_curr = df_clean.loc[i, "time"]

        if time_prev <= 0 or time_curr <= 0:
            log_diff = 0
        else:
            log_diff = math.log(time_curr) - math.log(time_prev)

        # Смотрим изменение именно сглаженного давления (логика поиска трендов)
        diff = df_clean.loc[i, "pressure_smooth"] - df_clean.loc[i - 1, "pressure_smooth"]
        new_sign, changed = get_sign_and_change(current_trend, diff, noise_tolerance)

        if current_trend is None:
            if new_sign != 0:
                current_trend = new_sign
                current_start_idx = i - 1
                last_valid_index = i - 1
            continue

        log_exceeded = (log_diff > log_threshold)

        if changed or log_exceeded:
            close_segment(last_valid_index, current_trend)

            if log_exceeded:
                current_trend = new_sign if new_sign != 0 else None
                current_start_idx = i
                last_valid_index = i
            else:
                if new_sign != 0:
                    current_trend = new_sign
                    current_start_idx = i - 1
                    last_valid_index = i - 1
                else:
                    current_trend = None
                    current_start_idx = None
                    last_valid_index = None
        else:
            last_valid_index = i
            if new_sign != 0:
                current_trend = new_sign

    # Закрываем «висящий» сегмент
    if current_trend is not None and last_valid_index is not None and current_start_idx is not None:
        close_segment(last_valid_index, current_trend)

    # Рисуем, если нужно
    if plot == 1:
        plt.figure(figsize=(12, 6))

        # Рисуем выбранную колонку (raw или smooth)
        plot_data_main(
            x=df_clean["time"],
            y=df_clean[pressure_col_for_plot],
            label=("Сглаженное давление (clean)" if graph_mad == 1 else "Исходное давление (clean)"),
            color="gray"
        )

        # Авторская разметка, если включена
        if show_autor_patterns == 1 and "class" in df.columns:
            df_inc = df[df["class"] == 1]  # рост (авторская разметка)
            if not df_inc.empty:
                plot_data_autor(
                    x=df_inc["time"],
                    y=df_inc["pressure"],
                    label="возрастание (авт. разм.)",
                    color="yellow"
                )
            df_dec = df[df["class"] == 2]  # убывание (авторская разметка)
            if not df_dec.empty:
                plot_data_autor(
                    x=df_dec["time"],
                    y=df_dec["pressure"],
                    label="убывание (авт. разм.)",
                    color="blue"
                )

        # Отмечаем найденные монотонные сегменты
        for trend, seg_list in segments.items():
            color = "green" if trend == 1 else "red"
            label = "Возрастание" if trend == 1 else "Убывание"
            for (start_time, end_time) in seg_list:
                mask = (df_clean["time"] >= start_time) & (df_clean["time"] <= end_time)
                plot_data_main(
                    x=df_clean.loc[mask, "time"],
                    y=df_clean.loc[mask, pressure_col_for_plot],
                    label=label,
                    color=color
                )

        plt.xlabel("Время")
        plt.ylabel("Давление")
        plt.title(
            f"window: {smoothing_window}, outlier_factor: {outlier_factor}, noise_tolerance: {noise_tolerance},\n"
            f"min_points: {min_points}, maxval_proc: {maxval_proc}, cnt_log_cycles: {cnt_log_cycles}, graph_mad: {graph_mad}"
        )

        # Убираем дублирующиеся метки в легенде
        handles, labels = plt.gca().get_legend_handles_labels()
        by_label = dict(zip(labels, handles))
        plt.legend(by_label.values(), by_label.keys())
        plt.grid(True)
        plt.show()

    return segments


In [30]:
def submittion_file(razmetka_path):
    df = pd.read_csv(razmetka_path)
    df_ans = pd.DataFrame()
    for i in range(df.shape[0]): 
        mezh_path = df.loc[i,'filename']
        path = os.path.join('/Users/savinovsvatoslav/Code/skvazhina_hack/SiamHack/marked_data' , mezh_path)
        mini_data = pd.read_csv(path)
        mezh_df = extract_graph_features(mini_data, df.loc[i, 'time_start'], df.loc[i, 'time_stop'])
        mezh_df['target'] = df.loc[i,'class']
        mezh_df['time_start'] = df.loc[i,'time_start']
        mezh_df['time_stop']   = df.loc[i,'time_stop']
        df_ans = pd.concat((df_ans, mezh_df ), axis=0, ignore_index=True)
    return df_ans

In [31]:
submittion_file('/Users/savinovsvatoslav/Code/skvazhina_hack/SiamHack/razmetka.csv').to_csv('razmetka_features.csv',index=False)

In [None]:
data_train_all = pd.read_csv('razmetka_features.csv')
data_train_all = data_train_all[data_train_all['target'] != 3 ]
data_train_all = data_train_all.drop(['time_start','time_stop','mon'],axis=1)
data_train_recovery =  data_train_all[(data_train_all['target'] == 1 )|  (data_train_all['target'] == 0)]
print(data_train_recovery.shape)
data_train_recovery = data_train_recovery[data_train_recovery['mean_angle'] >= 0]
print(data_train_recovery.shape)
data_train_drop = data_train_all[(data_train_all['target'] == 2 )|  (data_train_all['target'] == 0)]
print(data_train_drop.shape)
data_train_drop = data_train_drop[data_train_drop['mean_angle'] < 0 ]
print(data_train_drop.shape)

y_train_recovery = data_train_recovery['target']
x_train_recovery = data_train_recovery.drop('target',axis=1)

y_train_drop = data_train_drop['target']
x_train_drop= data_train_drop.drop('target',axis=1)

model_recovery=  CatBoostClassifier()
model_drop = CatBoostClassifier()

model_recovery.fit(x_train_recovery,y_train_recovery)

model_drop.fit(x_train_drop,y_train_drop)

(210, 22)
(61, 22)
(260, 22)
(213, 22)
Learning rate set to 0.003121
0:	learn: 0.6898586	total: 741us	remaining: 741ms
1:	learn: 0.6870313	total: 1.51ms	remaining: 756ms
2:	learn: 0.6834362	total: 2.79ms	remaining: 927ms
3:	learn: 0.6799319	total: 3.39ms	remaining: 844ms
4:	learn: 0.6773585	total: 4.09ms	remaining: 814ms
5:	learn: 0.6745291	total: 4.84ms	remaining: 801ms
6:	learn: 0.6717309	total: 5.41ms	remaining: 767ms
7:	learn: 0.6687928	total: 6.27ms	remaining: 777ms
8:	learn: 0.6654029	total: 7.23ms	remaining: 796ms
9:	learn: 0.6626532	total: 7.77ms	remaining: 769ms
10:	learn: 0.6603199	total: 8.32ms	remaining: 748ms
11:	learn: 0.6574679	total: 9.34ms	remaining: 769ms
12:	learn: 0.6547249	total: 9.88ms	remaining: 750ms
13:	learn: 0.6521289	total: 10.4ms	remaining: 732ms
14:	learn: 0.6491010	total: 10.9ms	remaining: 717ms
15:	learn: 0.6464607	total: 11.9ms	remaining: 729ms
16:	learn: 0.6434014	total: 12.4ms	remaining: 716ms
17:	learn: 0.6402493	total: 12.9ms	remaining: 703ms
18:	le

<catboost.core.CatBoostClassifier at 0x13b7b9670>

In [70]:
folder_path = '/Users/savinovsvatoslav/Downloads/full_test_df 2'
df = pd.DataFrame()
for filename in os.listdir(folder_path):
    full_path = os.path.join(folder_path,  filename)
    mini_data = pd.read_csv(full_path)
    seg = detect_monotonic_segments_ignore_outliers(full_path)
    for el in seg[1]:
        mezh_df = extract_graph_features(mini_data, el[0], el[1],mon=1) 
        mezh_df['time_start'] = el[0]
        mezh_df['time_stop']   = el[1]
        mezh_df['filename'] = filename
        df = pd.concat((df,mezh_df))
    for el in seg[-1]:
        mezh_df = extract_graph_features(mini_data, el[0], el[1],mon=-1) 
        mezh_df['time_start'] = el[0]
        mezh_df['time_stop']   = el[1]
        mezh_df['filename'] = filename
        df = pd.concat((df,mezh_df))
df.to_csv('test_data.csv',index=False)
df

Unnamed: 0,mean,std,range,cv,skewness,kurtosis,num_extrema,mean_angle,mean_segment_length,dominant_freq,...,median_delta,max_delta,min_delta,var_delta,std_delta,sign_changes,mon,time_start,time_stop,filename
0,1.102030,66.219202,180.985840,60.088368,0.955176,-0.787256,2,0.182692,0.001590,1,...,0.000000,3.871355,-2.903516,0.328284,0.572960,219,1,16655.419444,17010.819167,5dbc4812-135e-4e6b-9a14-e2cde50489f5.csv
0,-177.891898,18.116002,194.535582,-0.101837,3.860130,24.783399,3,-0.050675,0.000417,1,...,0.000000,0.967839,-7.742709,0.183084,0.427883,230,1,17010.819167,18071.354444,5dbc4812-135e-4e6b-9a14-e2cde50489f5.csv
0,-112.650775,36.194769,150.982840,-0.321301,1.029449,0.468074,0,-0.561494,0.007353,136,...,0.000000,0.000000,-10.646226,3.795465,1.948195,63,-1,3738.412500,3835.795556,5dbc4812-135e-4e6b-9a14-e2cde50489f5.csv
0,20.767836,60.587905,141.304801,2.917391,-0.039200,-1.882242,6,0.053725,0.000868,1,...,0.000000,1.935683,-6.774888,0.162782,0.403463,253,1,258.802222,1505.274722,7dc0adee-fbfb-46a0-803d-54275561d1eb.csv
0,-131.698230,22.513140,147.111848,-0.170945,2.727262,8.525897,5,-0.128175,0.004064,1,...,0.000000,22.260346,-17.421140,5.172873,2.274395,121,1,1505.274722,1868.242778,7dc0adee-fbfb-46a0-803d-54275561d1eb.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,41.900158,16.189414,74.689082,0.386381,-1.072888,0.967805,10,0.337610,0.029795,1,...,0.219699,23.879485,-7.749485,25.820858,5.081423,10,1,7829.836389,7860.220556,31210932-757a-44c9-ad24-e4d44bb7344b.csv
0,31.221443,22.524117,52.380399,0.721431,-0.322263,-1.879688,12,0.058740,0.007769,1,...,0.030003,38.749358,-0.480048,10.360431,3.218762,24,1,7906.067222,8141.406389,31210932-757a-44c9-ad24-e4d44bb7344b.csv
0,-88.060689,10.957418,96.399640,-0.124430,5.412391,38.141923,44,-0.121081,0.004830,1,...,-0.020325,17.699835,-65.120060,15.404197,3.924818,92,1,8141.406389,8486.603611,31210932-757a-44c9-ad24-e4d44bb7344b.csv
0,-34.935042,10.286752,49.310414,-0.294454,0.886071,1.153852,2,-0.622360,0.016687,1,...,-0.415203,0.030003,-10.959805,2.602481,1.613221,11,-1,7860.220556,7906.067222,31210932-757a-44c9-ad24-e4d44bb7344b.csv


In [71]:
drop_to_pred_data = df[df['mon'] == -1]
drop_to_pred_data= drop_to_pred_data.reset_index(drop=True)
drop_to_pred_data_clear = drop_to_pred_data.drop(['filename','time_start','time_stop','mon'],axis=1)
drop_to_pred_data_clear

Unnamed: 0,mean,std,range,cv,skewness,kurtosis,num_extrema,mean_angle,mean_segment_length,dominant_freq,...,height,diff_first_last,mse_ln,mean_delta,median_delta,max_delta,min_delta,var_delta,std_delta,sign_changes
0,-112.650775,36.194769,150.982840,-0.321301,1.029449,0.468074,0,-0.561494,0.007353,136,...,150.982840,-150.982840,14860.749907,-1.110168,0.000000,0.000000,-10.646226,3.795465,1.948195,63
1,-108.486189,26.112860,127.755026,-0.240702,2.450143,5.779533,1,-0.278473,0.004917,1,...,127.755026,-126.787184,13456.239026,-0.609554,0.000000,0.967842,-11.614094,2.497138,1.580234,58
2,-126.024368,24.369067,151.883305,-0.193368,2.943943,10.688926,5,-0.259324,0.005651,1,...,151.883305,-143.879258,17710.396222,-0.737842,-0.135498,8.391183,-29.945004,8.739852,2.956324,17
3,-2.123280,0.446081,2.913194,-0.210091,0.912883,4.418009,38,-0.040462,0.039281,85,...,2.913194,-2.835767,25.342998,-0.033362,0.000000,0.338744,-1.229155,0.036048,0.189863,50
4,-37.113448,1.776080,44.114089,-0.047855,18.183077,382.115699,882,-0.012852,0.002896,1,...,44.114089,-37.135972,1819.227839,-0.023341,0.000000,6.136097,-17.566273,0.362794,0.602324,1155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,-21.068539,7.413523,43.424631,-0.351876,2.985761,8.914696,32,-0.103822,0.000682,1,...,43.424631,-25.363932,711.753094,-0.010835,0.000000,9.770534,-1.677781,0.053243,0.230745,401
67,-28.234391,9.040923,39.082168,-0.320210,0.920887,0.098183,0,-0.931393,0.004739,1,...,39.082168,-39.082168,1007.087163,-0.185224,-0.098698,0.000000,-2.269927,0.054600,0.233667,43
68,-31.741620,7.539629,39.279549,-0.237531,1.678512,2.743336,3,-0.313308,0.001328,766,...,39.279549,-38.786087,1290.858312,-0.050635,0.000000,0.296065,-2.171243,0.016802,0.129623,274
69,-57.854868,10.357321,66.780872,-0.179022,1.953929,3.755869,6,-0.010885,0.000397,1,...,66.780872,-65.813033,4305.355180,-0.014538,0.000000,0.967839,-4.839194,0.031240,0.176748,225


In [72]:
pred_drop = model_drop.predict(drop_to_pred_data_clear)

In [73]:
drop_to_pred_data['target'] = pred_drop
drop_to_pred_data  = drop_to_pred_data[drop_to_pred_data['target'] == 2]
drop_to_pred_data.to_csv('drop_ans.csv',index=False)

In [75]:
recovery_to_pred_data = df[df['mon'] == 1]
recovery_to_pred_data = recovery_to_pred_data.reset_index(drop=True)
recovery_to_pred_data_clear = recovery_to_pred_data.drop(['filename', 'time_start', 'time_stop', 'mon'], axis=1)
recovery_to_pred_data_clear


Unnamed: 0,mean,std,range,cv,skewness,kurtosis,num_extrema,mean_angle,mean_segment_length,dominant_freq,...,height,diff_first_last,mse_ln,mean_delta,median_delta,max_delta,min_delta,var_delta,std_delta,sign_changes
0,1.102030,66.219202,180.985840,60.088368,0.955176,-0.787256,2,0.182692,0.001590,1,...,180.985840,131.626066,4329.096582,0.161703,0.000000,3.871355,-2.903516,0.328284,0.572960,219
1,-177.891898,18.116002,194.535582,-0.101837,3.860130,24.783399,3,-0.050675,0.000417,1,...,194.535582,-192.599905,34145.543671,-0.063817,0.000000,0.967839,-7.742709,0.183084,0.427883,230
2,20.767836,60.587905,141.304801,2.917391,-0.039200,-1.882242,6,0.053725,0.000868,1,...,141.304801,90.977064,3802.377965,0.055951,0.000000,1.935683,-6.774888,0.162782,0.403463,253
3,-131.698230,22.513140,147.111848,-0.170945,2.727262,8.525897,5,-0.128175,0.004064,1,...,147.111848,-43.552850,19183.114823,-0.089985,0.000000,22.260346,-17.421140,5.172873,2.274395,121
4,11.156959,15.666520,153.886736,1.404193,4.343569,20.812293,77,0.027692,0.001264,4,...,153.886736,153.886736,258.906037,0.081164,0.000000,50.327737,-0.967842,1.729006,1.314917,419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,-30.475293,22.319891,95.578913,-0.732393,3.078487,8.182636,31,-0.048361,0.009501,1,...,95.578913,56.719220,1706.919262,0.370714,0.009678,60.050522,-7.439777,26.364478,5.134635,65
100,41.900158,16.189414,74.689082,0.386381,-1.072888,0.967805,10,0.337610,0.029795,1,...,74.689082,59.640157,1806.369478,1.325337,0.219699,23.879485,-7.749485,25.820858,5.081423,10
101,31.221443,22.524117,52.380399,0.721431,-0.322263,-1.879688,12,0.058740,0.007769,1,...,52.380399,52.380399,1189.309070,0.356329,0.030003,38.749358,-0.480048,10.360431,3.218762,24
102,-88.060689,10.957418,96.399640,-0.124430,5.412391,38.141923,44,-0.121081,0.004830,1,...,96.399640,-56.729867,8868.778697,-0.182411,-0.020325,17.699835,-65.120060,15.404197,3.924818,92


In [76]:
pred_recovery = model_recovery.predict(recovery_to_pred_data_clear)
pred_recovery

array([1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1])

In [77]:
recovery_to_pred_data['target'] = pred_recovery
recovery_to_pred_data = recovery_to_pred_data[recovery_to_pred_data['target'] == 1]
recovery_to_pred_data.to_csv('recovery_ans.csv', index=False)

In [95]:
def create_submission(
    submission_file: str,
    recovery_file: str,
    drop_file: str,
    output_file: str = "SABMISHEN.csv"
):
    """
    Функция читает готовый submission-файл (где уже есть колонка 'file' без .csv)
    и обновляет в нём столбцы drop и recovery из файлов drop_ans.csv и recovery_ans.csv.
    
    ВАЖНО:
      - Порядок строк не меняется
      - Колонка file не меняется
      - Если для file нет записей в drop_ans.csv/recovery_ans.csv, ставим []
      - Новых строк не добавляем, старых не удаляем
    """
    # 1. Считываем уже имеющийся submission-файл
    df_sub = pd.read_csv(submission_file)

    # 2. Считываем данные recovery_ans.csv, drop_ans.csv
    df_recovery = pd.read_csv(recovery_file)
    df_drop = pd.read_csv(drop_file)

    # 3. Формируем словари, где ключ = имя файла c расширением ".csv", значение = список [[start, stop], ...]
    drop_dict = {}
    for _, row in df_drop.iterrows():
        fname = row["filename"]
        drop_dict.setdefault(fname, []).append([row["time_start"], row["time_stop"]])

    recovery_dict = {}
    for _, row in df_recovery.iterrows():
        fname = row["filename"]
        recovery_dict.setdefault(fname, []).append([row["time_start"], row["time_stop"]])

    # 4. Обновляем в df_sub столбцы drop и recovery
    #    Идём ровно по строкам старого submission, не меняя их порядок
    if "drop" not in df_sub.columns:
        df_sub["drop"] = "[]"
    if "recovery" not in df_sub.columns:
        df_sub["recovery"] = "[]"

    for idx, row in df_sub.iterrows():
        # Ищем соответствие file → file.csv
        filename_with_ext = row["file"] + ".csv"

        # Получаем интервалы drop и recovery
        d_list = drop_dict.get(filename_with_ext, [])
        r_list = recovery_dict.get(filename_with_ext, [])

        # Записываем их в виде строк, например "[[start1, stop1], [start2, stop2]]"
        df_sub.at[idx, "drop"] = str(d_list) if d_list else "[]"
        df_sub.at[idx, "recovery"] = str(r_list) if r_list else "[]"

    # 5. Сохраняем результат. Порядок не меняем, индексы не пишем
    df_sub.to_csv(output_file, index=False)
    print(f"Обновлённый submission сохранён в {output_file}")


In [96]:
submission_file = '/Users/savinovsvatoslav/Code/skvazhina_hack/SiamHack/submission_clear.csv'
recovery_file = '/Users/savinovsvatoslav/Code/skvazhina_hack/SiamHack/recovery_ans.csv'
drop_file = '/Users/savinovsvatoslav/Code/skvazhina_hack/SiamHack/drop_ans.csv'
create_submission(submission_file,recovery_file,drop_file)

Обновлённый submission сохранён в SABMISHEN.csv
