In [71]:
import pandas as pd
import os
import numpy as np

In [72]:
def feature_adder_fast(filepath, window_size=10):
    df = pd.read_csv(filepath)

    print("Shape before processing:", df.shape)
    
    # Чтобы не было log(0) и деления на 0 при p_grad:
    # Заменяем все неположительные X на 1 (или небольшой эпсилон > 0).
    df.loc[df["X"] <= 0, "X"] = 1.0

    # Считаем смещения от первого значения
    df['X'] = df['X'] - df['X'].iloc[0]
    df['Y'] = df['Y'] - df['Y'].iloc[0]
    df['Y'] = df['Y'].abs()  # Если нужно брать абсолютную разницу

    # Логарифм по оси времени (X уже гарантированно > 0)
    df["ln_t"] = np.log(df["X"])

    # --- Векторизованное вычисление p_grad ---
    # p_grad = (Y[i+1] - Y[i-1]) / (ln_t[i+1] - ln_t[i-1])
    shift_fwd = df["Y"].shift(-1)
    shift_bwd = df["Y"].shift(1)
    ln_fwd    = df["ln_t"].shift(-1)
    ln_bwd    = df["ln_t"].shift(1)

    numerator   = shift_fwd - shift_bwd
    denominator = ln_fwd - ln_bwd

    # Если denominator=0, заменяем его на маленькое число (например 1e-9),
    # чтобы избежать бесконечностей:
    denominator_safe = denominator.copy()
    denominator_safe.replace(0, 1e-9, inplace=True)

    df["p_grad"] = numerator / denominator_safe

    # Заполним все оставшиеся NaN и inf в p_grad нулями
    df["p_grad"].replace([np.inf, -np.inf], np.nan, inplace=True)
    df["p_grad"].fillna(0, inplace=True)

    # Для удобства сразу и остальные пропуски в Y тоже заполним (на случай сдвигов)
    df["Y"].fillna(0, inplace=True)

    mean_y = df["Y"].mean()
    mean_p_grad = df["p_grad"].mean()

    # --- Создаём сдвинутые колонки ---
    new_cols = {}
    prev_y_cols = []
    next_y_cols = []

    for i in range(1, window_size + 1):
        # Сдвиги по Y
        new_cols[f"prev{i}_y"] = df["Y"].shift(i)
        new_cols[f"next{i}_y"] = df["Y"].shift(-i)

        # Сдвиги по p_grad
        new_cols[f"prev{i}_p_grad"] = df["p_grad"].shift(i)
        new_cols[f"next{i}_p_grad"] = df["p_grad"].shift(-i)

        # Запомним названия колонок Y (для групповых расчетов)
        prev_y_cols.append(f"prev{i}_y")
        next_y_cols.append(f"next{i}_y")

    # Склеиваем
    new_cols_df = pd.DataFrame(new_cols, index=df.index)

    # Заполняем пропуски (NaN) средними значениями
    for col in new_cols_df.columns:
        if col.endswith("_y"):
            new_cols_df[col].fillna(mean_y, inplace=True)
        elif col.endswith('_p_grad'):
            new_cols_df[col].fillna(mean_p_grad, inplace=True)

    # Объединяем с исходным df
    df_result = pd.concat([df, new_cols_df], axis=1)

    # --- Дополнительные признаки ---
    df_result['delta_P'] = (df_result['Y'] - df_result['prev1_y']).abs().astype(float)
    df_result.loc[0, 'delta_P'] = 0  # как в исходном коде

    df_result['avg_y_2n']     = df_result[prev_y_cols + next_y_cols].mean(axis=1)
    df_result['median_y_2n']  = df_result[prev_y_cols + next_y_cols].median(axis=1)
    df_result['avg_y_prevn']  = df_result[prev_y_cols].mean(axis=1)
    df_result['avg_y_nextn']  = df_result[next_y_cols].mean(axis=1)
    df_result['median_y_prevn'] = df_result[prev_y_cols].median(axis=1)
    df_result['median_y_nextn'] = df_result[next_y_cols].median(axis=1)

    df_result['max-now']           = (df_result['Y'] - df_result['Y'].max()).abs()
    df_result["diff_from_avg_prev"] = df_result["Y"] - df_result["avg_y_prevn"]
    df_result["diff_from_avg_next"] = df_result["avg_y_nextn"] - df_result["Y"]

    combined_cols = prev_y_cols + next_y_cols

    # Статистические показатели
    df_result['combined_mean']   = df_result[combined_cols].mean(axis=1)
    df_result['combined_median'] = df_result[combined_cols].median(axis=1)
    df_result['combined_std']    = df_result[combined_cols].std(axis=1)
    df_result['combined_skew']   = df_result[combined_cols].skew(axis=1)
    df_result['combined_kurt']   = df_result[combined_cols].kurtosis(axis=1)
    df_result['combined_range']  = df_result[combined_cols].max(axis=1) - df_result[combined_cols].min(axis=1)

    df_result['combined_max']    = df_result[combined_cols].max(axis=1)
    df_result['combined_min']    = df_result[combined_cols].min(axis=1)

    # Отношения к max/min
    # Чтобы не было деления на 0, проверяем делитель. Если 0 - ставим 0
    df_result['ratio_to_max_combined'] = np.where(
        df_result['combined_max'] == 0,
        0,
        df_result['Y'] / df_result['combined_max']
    )

    df_result['ratio_to_min_combined'] = np.where(
        df_result['combined_min'] == 0,
        0,
        df_result['Y'] / df_result['combined_min']
    )

    # Флаг нестабильности
    std_threshold = df_result['combined_std'].mean()
    df_result['unstable_flag_combined'] = (df_result['combined_std'] > std_threshold).astype(int)

    # На всякий случай ещё раз убираем возможные NaN/inf, если где-то возникли
    df_result.replace([np.inf, -np.inf], 0, inplace=True)
    df_result.fillna(0, inplace=True)

    print("Shape after processing:", df_result.shape)
    
    return df_result


In [73]:
feature_adder_fast('/Users/savinovsvatoslav/Code/skvazhina_hack/train_data.csv').to_csv('proba.csv',index=False)
# feature_adder('/Users/savinovsvatoslav/Code/skvazhina_hack/test')

Shape before processing: (1574917, 3)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Shape after processing: (1574917, 66)


In [74]:
df = pd.read_csv('proba.csv')
df.shape

(1574917, 66)

In [75]:
df

Unnamed: 0,X,Y,class,ln_t,p_grad,prev1_y,next1_y,prev1_p_grad,next1_p_grad,prev2_y,...,combined_median,combined_std,combined_skew,combined_kurt,combined_range,combined_max,combined_min,ratio_to_max_combined,ratio_to_min_combined,unstable_flag_combined
0,0.000000,0.000000,0,0.000000,0.000000,56.955336,0.000000,-1300.536112,0.000000,56.955336,...,28.511543,29.198609,-0.000001,-2.235293,56.955336,56.955336,0.000000,0.000000,0.000000,1
1,-0.973889,0.000000,0,0.000000,0.000000,0.000000,0.038713,0.000000,0.000000,56.955336,...,0.082267,29.049468,0.217685,-2.182609,56.955336,56.955336,0.000000,0.000000,0.000000,1
2,-0.445000,0.038713,0,0.000000,0.000000,0.000000,0.038713,0.000000,0.000000,0.000000,...,0.082267,28.605279,0.442121,-2.017972,56.955336,56.955336,0.000000,0.000680,0.000000,1
3,-0.410556,0.038713,0,0.000000,0.000000,0.038713,0.038713,0.000000,0.000000,0.000000,...,0.072588,27.849116,0.681158,-1.719456,56.955336,56.955336,0.000000,0.000680,0.000000,1
4,0.262500,0.038713,0,-1.337504,0.000000,0.038713,0.009679,0.000000,-0.023653,0.038713,...,0.072588,26.755517,0.945296,-1.241831,56.955336,56.955336,0.000000,0.000680,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1574912,8614.170000,74.933853,0,9.061164,-21559.872113,75.140970,74.723832,-5552.456879,12105.603226,75.041283,...,74.944015,8.472376,-0.943985,-1.242317,18.325003,75.280339,56.955336,0.995397,1.315660,1
1574913,8614.253333,74.723832,0,9.061173,12105.603226,74.933853,75.168070,-21559.872113,17258.134304,75.140970,...,74.938208,8.826045,-0.680144,-1.719310,18.325003,75.280339,56.955336,0.992608,1.311972,1
1574914,8614.336667,75.168070,0,9.061183,17258.134304,74.723832,75.057736,12105.603226,-11505.620434,74.933853,...,74.738833,9.048590,-0.441132,-2.017340,18.325003,75.280339,56.955336,0.998509,1.319772,1
1574915,8614.420000,75.057736,0,9.061193,-11505.620434,75.168070,74.945467,17258.134304,0.000000,74.723832,...,74.618337,9.178668,-0.216891,-2.181727,18.212734,75.168070,56.955336,0.998532,1.317835,1
