In [37]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.fftpack import fft
from scipy.signal import find_peaks

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

import joblib

from sklearn.preprocessing import StandardScaler

In [10]:
import pickle

# Load the pickled list of feature names that a later cell displays.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open 'features.pkl' if it comes from a trusted source.
with open('features.pkl', 'rb') as f:
    loaded_features_list = pickle.load(f)

In [11]:
# Read the test set; later cells use its 'id' and 'values' columns.
test_df = pd.read_parquet('../data/test.parquet')

In [12]:
def fillna_in_lists(column):
    """
    Replace missing entries inside every sequence cell with that cell's median.

    Parameters
    ----------
    column : pandas.Series
        Series whose cells are Python lists or 1-D NumPy arrays of numbers.
        Cells of any other type are passed through untouched.

    Returns
    -------
    pandas.Series
        Series of the same length. A list cell comes back as a list and an
        array cell as an array; missing entries (``None``/``NaN``) are replaced
        by the median of the cell's non-missing entries, or by ``0`` when the
        cell has no non-missing entry at all.
    """
    def fill_list(lst):
        is_array = isinstance(lst, np.ndarray)
        # Array cells are accepted as well as lists: list-typed parquet columns
        # are commonly materialised as NumPy arrays, which a list-only check
        # would skip without any warning.
        if not (isinstance(lst, list) or (is_array and lst.ndim == 1)):
            return lst

        values = list(lst)
        present = [x for x in values if pd.notnull(x)]
        # With nothing to take a median of, fall back to 0.
        fill_value = np.median(present) if present else 0
        filled = [x if pd.notnull(x) else fill_value for x in values]

        if is_array:
            # Keep the numeric dtype of the input; object arrays are re-inferred.
            dtype = None if lst.dtype == object else lst.dtype
            return np.asarray(filled, dtype=dtype)
        return filled

    return column.apply(fill_list)

In [13]:
# Run fillna_in_lists over the 'values' column and overwrite it in place.
test_df['values'] = fillna_in_lists(test_df['values'])

### Generate features

In [14]:
# Display the feature names loaded from 'features.pkl'.
loaded_features_list

['mean',
 'median_value',
 'q1_value',
 'kurtosis_value',
 'peaks_value',
 'autocorr_sign',
 'mean_amplitude']

In [28]:
def detect_autocorrelation(data, lag=1, threshold=0.5):
    """
    Classify every row's series by its autocorrelation at the given lag.

    The result is stored in a new ``autocorr_sign`` column:
    1  - positive autocorrelation above ``threshold``,
    0  - no autocorrelation (including constant series),
    -1 - negative autocorrelation below ``-threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``values`` column holding one sequence per row.
    lag : int
        Lag passed to ``pandas.Series.autocorr``.
    threshold : float
        Absolute autocorrelation that must be exceeded for a non-zero label.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the ``autocorr_sign`` column added; the input
        frame is not modified.
    """
    def classify(series):
        samples = np.array(series)

        # A constant series has no defined correlation; label it as "none".
        if np.std(samples) == 0:
            return 0

        coefficient = pd.Series(samples).autocorr(lag=lag)
        if coefficient > threshold:
            return 1
        if coefficient < -threshold:
            return -1
        # Also reached when the coefficient is NaN (both comparisons are False).
        return 0

    labelled = data.copy()
    labelled['autocorr_sign'] = labelled['values'].apply(classify)
    return labelled


def extract_features(data):
    """
    Add a ``mean_amplitude`` column derived from each row's real FFT.

    For every sequence in the ``values`` column the real-input FFT is taken,
    the magnitudes are floor-divided by 2, and the mean of the result is stored.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``values`` column holding one numeric sequence per row.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the ``mean_amplitude`` column added; the input
        frame is not modified.
    """
    def _mean_amplitude(series):
        # NOTE(review): `// 2` is floor division, so every half-magnitude is
        # truncated to a whole number. Left unchanged because the saved model
        # presumably saw the same feature during training - confirm before
        # switching to `/ 2`.
        return np.mean(np.abs(np.fft.rfft(series)) // 2)

    data_copy = data.copy()
    data_copy['mean_amplitude'] = data_copy['values'].apply(_mean_amplitude)
    return data_copy


def fill_missing_values_with_median(data, feature):
    """
    Fill missing entries inside sequence cells with a per-label median.

    For every distinct value of the ``label`` column, the median of each row's
    sequence is computed (ignoring missing entries), and the median of those
    row medians becomes the fill value for that label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``label`` column and a ``feature`` column whose cells are
        lists or NumPy arrays.
    feature : str
        Name of the column to fill.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which every ``None``/``NaN`` entry of a sequence
        cell is replaced by its label's fill value. Filled cells are returned
        as lists; non-sequence cells are left as they are.
    """
    def _is_sequence(cell):
        return isinstance(cell, (list, np.ndarray))

    def _row_median(cell):
        # Non-sequence and empty cells contribute 0.
        if not _is_sequence(cell) or len(cell) == 0:
            return 0
        # Missing entries must be dropped first: np.median raises on None and
        # yields NaN when NaN is present.
        present = [v for v in cell if pd.notnull(v)]
        # A cell with nothing but missing entries should not influence the
        # label-wide median, so it yields NaN (skipped by Series.median).
        return np.median(present) if present else np.nan

    data_filled = data.copy()

    for label in data_filled['label'].unique():
        mask = data_filled['label'] == label
        if not mask.any():
            # e.g. a NaN label never compares equal to itself
            continue

        median_value = data_filled.loc[mask, feature].apply(_row_median).median()
        if pd.isnull(median_value):
            # No usable observation for this label at all.
            median_value = 0

        data_filled.loc[mask, feature] = data_filled.loc[mask, feature].apply(
            lambda x: [median_value if pd.isnull(v) else v for v in x] if _is_sequence(x) else x
        )

    return data_filled

In [22]:
# Per-series summary statistics, one column per feature name.
stat_features = {
    'mean': test_df['values'].apply(np.mean),
    'median_value': test_df['values'].apply(np.median),
    # first quartile
    'q1_value': test_df['values'].apply(lambda series: np.percentile(series, 25)),
    'kurtosis_value': test_df['values'].apply(kurtosis),
    # number of local maxima reported by scipy.signal.find_peaks
    'peaks_value': test_df['values'].apply(lambda series: len(find_peaks(series)[0])),
}

stat_features_df = pd.DataFrame(stat_features)

In [25]:
# Label lag-1 autocorrelation with a 0.3 threshold, then attach it to the
# statistical features by row index.
autocorr_test = detect_autocorrelation(test_df, lag=1, threshold=0.3)
merged_test_df = (
    stat_features_df
    .join(autocorr_test[['autocorr_sign']])
    # Fold the "negative" class (-1) into 0, leaving a 0/1 indicator.
    .assign(autocorr_sign=lambda frame: frame['autocorr_sign'].replace(-1, 0))
)

In [30]:
# Compute the FFT-based feature and left-join it on the row index; any value
# still missing after the join is replaced with 0.
test_df_amplit = extract_features(test_df)
merged_test_df_fd = (
    merged_test_df
    .merge(test_df_amplit[['mean_amplitude']], left_index=True, right_index=True, how='left')
    .fillna(0)
)

In [40]:
# Attach the engineered features to the ids (left join on the shared row index).
final_test_df = test_df[['id']].merge(merged_test_df_fd, left_index=True, right_index=True, how='left')

# NOTE(review): the scaler is fitted on the test set itself. If the model was
# trained on features scaled with training-set statistics, that fitted scaler
# should be persisted and loaded here, with only `transform` applied - confirm
# against the training notebook.
scaler = StandardScaler()

# Select the model inputs by name so that the column order and the labels given
# to the scaled frame below always agree; a missing feature raises KeyError
# instead of being silently mislabelled.
scaled_test = scaler.fit_transform(final_test_df[loaded_features_list])

scaled_test_df = pd.DataFrame(scaled_test, columns=loaded_features_list)

In [34]:
# Load the persisted model used for scoring in the next cell.
# NOTE(review): joblib.load unpickles the file, so it must come from a trusted source.
clf_loaded = joblib.load('stacking_classifier_model.pkl')

In [41]:
# Column 1 of predict_proba is the probability of the second entry in the
# classifier's classes_.
y_test_pred_prob = clf_loaded.predict_proba(scaled_test_df)[:, 1]

# Pair every id with its score (assigned positionally) and write the file.
submission = test_df[['id']].assign(score=y_test_pred_prob)
submission.to_csv('submission.csv', index=False)

print("Файл submission.csv успешно создан!")

Файл submission.csv успешно создан!
