In [1]:
import pandas as pd
import numpy as np
from typing import List, Tuple, TypeVar, Union

In [2]:
# Raw inputs: the measurement table comes from a parquet file,
# the label table from a CSV file (both paths are relative to the notebook).
data = pd.read_parquet(path='data.parquet')
labels = pd.read_csv(filepath_or_buffer='labels.csv')

In [3]:
def check_constant_and_nan(df: pd.DataFrame) -> Tuple[List[str], List[str]]:
    """
    Report the columns of a DataFrame that are constant or entirely NaN.

    A column is treated as constant only when every entry is the same
    non-missing value. A column that mixes a single value with NaN is NOT
    constant, and a column made up solely of NaN is reported in the NaN list
    only. The findings are printed and also returned as two lists.

    Note: for a frame with zero rows, ``all()`` is vacuously True, so every
    column is listed as consisting only of NaN values.

    Parameters:
    df (pd.DataFrame): The pandas DataFrame to check.

    Returns:
    constant_cols (list of str): A list of column names that consist only of one unique value.
    nan_cols (list of str): A list of column names that consist only of NaN values.
    """
    # Columns in which every entry is missing
    is_nan = df.isnull().all()

    # nunique() drops NaN by default, which would make a column such as
    # [1, NaN, 1] look constant. Counting NaN as a value (dropna=False) fixes
    # that; all-NaN columns then have exactly one "value" as well, so they are
    # excluded explicitly to keep the two lists disjoint.
    is_constant = (df.nunique(dropna=False) == 1) & ~is_nan

    # Get column names
    constant_cols = is_constant[is_constant].index.tolist()
    nan_cols = is_nan[is_nan].index.tolist()

    if constant_cols:
        print(f'Columns consisting of only one value: {constant_cols}')
    else:
        print('No columns consisting of only one value.')

    if nan_cols:
        print(f'Columns consisting only of NaN values: {nan_cols}')
    else:
        print('No columns consisting only of NaN values.')

    return constant_cols, nan_cols


# Run the constant / all-NaN check on the measurement table first ...
print('Check columns for "data" dataset')
check_constant_and_nan(data)

# ... then on the label table, separated from the first report by a blank line.
# The call is the cell's last expression, so its return value is displayed.
print('\nCheck columns for "labels" dataset')
check_constant_and_nan(labels)

Check columns for "data" dataset
No columns consisting of only one value.
No columns consisting only of NaN values.

Check columns for "labels" dataset
No columns consisting of only one value.
No columns consisting only of NaN values.


([], [])

In [4]:
data.head()

Unnamed: 0,MachineId,MeasurementId,Pressure
0,0_0_0,0,0.0
1,0_0_0,0,0.0
2,0_0_0,0,0.0
3,0_0_0,0,0.0
4,0_0_0,0,0.0


In [5]:
labels.head()

Unnamed: 0,MachineId,MeasurementId,PumpFailed,SlowStart,SlowEnd
0,0_0_0,0,False,False,False
1,0_0_1,-1,,,
2,0_0_2,0,False,False,False
3,0_0_3,0,True,False,False
4,0_1_0,-1,,,


In [6]:
T = TypeVar('T', bound='pd.core.series.Series')

def _pressure_stats(pressure: pd.Series) -> dict:
    """Return mean, median, std, max and the largest absolute step between consecutive values."""
    return {
        'mean': pressure.mean(),
        'median': pressure.median(),
        'std': pressure.std(),
        'max': pressure.max(),
        # Largest jump between two neighbouring observations
        'max_diff': pressure.diff().abs().max(),
    }


def calculate_features(group: pd.DataFrame) -> pd.Series:
    """
    Summarise the 'Pressure' column of one group, separately for each half.

    The rows are split by position: the first ``len(group) // 2`` rows form the
    first half and the remaining rows the second half, so for an odd row count
    the second half is one row longer. For each half the mean, median, standard
    deviation, maximum and largest absolute difference between consecutive
    values are computed.

    NOTE(review): the positional split presumably relies on the rows being in
    chronological order within the group -- confirm against the data source.

    Parameters:
    group (pd.DataFrame): The DataFrame to calculate the features from.
                          It should contain a 'Pressure' column.

    Returns:
    features (pd.Series): A Series object with the ten calculated features.
                          A statistic is NaN where a half has too few rows
                          for it (e.g. an empty first half for a 1-row group).
    """
    pressure = group['Pressure']

    # Size of the first half of the cycle; the second half takes the rest
    half_cycle_size = len(pressure) // 2
    first = _pressure_stats(pressure.iloc[:half_cycle_size])
    second = _pressure_stats(pressure.iloc[half_cycle_size:])

    # Key order is kept exactly as before (note max precedes std in the second
    # half) because it determines the column order of the resulting frame.
    return pd.Series({
        'mean_pressure_first_half': first['mean'],
        'median_pressure_first_half': first['median'],
        'std_pressure_first_half': first['std'],
        'max_pressure_first_half': first['max'],
        'max_diff_first_half': first['max_diff'],
        'mean_pressure_second_half': second['mean'],
        'median_pressure_second_half': second['median'],
        'max_pressure_second_half': second['max'],
        'std_pressure_second_half': second['std'],
        'max_diff_second_half': second['max_diff'],
    })
# Build one feature row per (MachineId, MeasurementId) pair. Only 'Pressure' is
# handed to calculate_features (the only column it reads), so the grouping keys
# are not passed to apply. This avoids the pandas >= 2.2 DeprecationWarning about
# apply operating on the grouping columns and leaves the result unchanged.
grouped = (
    data
    .groupby(['MachineId', 'MeasurementId'])[['Pressure']]
    .apply(calculate_features)
    .reset_index()
)

In [7]:
grouped.head()

Unnamed: 0,MachineId,MeasurementId,mean_pressure_first_half,median_pressure_first_half,std_pressure_first_half,max_pressure_first_half,max_diff_first_half,mean_pressure_second_half,median_pressure_second_half,max_pressure_second_half,std_pressure_second_half,max_diff_second_half
0,0_0_0,-1,0.230289,0.0,0.415121,2.439888,2.172414,0.187364,0.0,2.143523,0.380598,1.915191
1,0_0_0,0,0.605873,0.864078,0.582492,1.584466,0.720388,0.144275,0.0,1.449515,0.355725,0.590291
2,0_0_0,215,0.710727,0.435973,0.729057,1.83871,0.677419,0.0,0.0,0.0,0.0,0.0
3,0_0_0,237,0.97754,1.337876,0.706656,1.927632,1.081767,0.023251,0.0,0.575188,0.088071,0.218985
4,0_0_0,353,0.344599,0.0,0.390084,1.186589,0.455782,0.615127,0.755102,1.275996,0.381526,0.677357


In [8]:
grouped.shape

(27385, 12)

In [9]:
# Attach the computed features to the labels on the shared key pair.
# how='inner' is the pandas default, spelled out so the join type is explicit.
# validate='one_to_one' makes pandas raise MergeError if either table repeats a
# (MachineId, MeasurementId) pair, instead of silently duplicating rows.
df = pd.merge(
    labels,
    grouped,
    on=['MachineId', 'MeasurementId'],
    how='inner',
    validate='one_to_one',
)

In [10]:
df.head()

Unnamed: 0,MachineId,MeasurementId,PumpFailed,SlowStart,SlowEnd,mean_pressure_first_half,median_pressure_first_half,std_pressure_first_half,max_pressure_first_half,max_diff_first_half,mean_pressure_second_half,median_pressure_second_half,max_pressure_second_half,std_pressure_second_half,max_diff_second_half
0,0_0_0,0,False,False,False,0.605873,0.864078,0.582492,1.584466,0.720388,0.144275,0.0,1.449515,0.355725,0.590291
1,0_0_1,-1,,,,0.310074,0.0,0.483122,2.833178,1.67754,0.249919,0.0,5.005592,0.400231,5.005592
2,0_0_2,0,False,False,False,0.444347,0.625709,0.425363,1.205104,0.435728,0.443484,0.613422,1.132325,0.440333,0.615312
3,0_0_3,0,True,False,False,0.588216,0.918919,0.541255,1.410603,0.426195,0.324494,0.0,1.497921,0.503955,0.753638
4,0_1_0,-1,,,,0.168185,0.0,0.362249,2.386766,1.955266,0.195165,0.0,2.553588,0.399479,2.150047


In [11]:
df.shape

(27385, 15)

In [12]:
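# Sanity-check the merge: for a single machine, the number of merged rows
# should equal the number of distinct MeasurementId values.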

In [13]:
# Keep only the rows that belong to machine '0_0_0'.
is_machine_0_0_0 = df['MachineId'] == '0_0_0'
df_filtered = df.loc[is_machine_0_0_0]

In [14]:
df_filtered.head()

Unnamed: 0,MachineId,MeasurementId,PumpFailed,SlowStart,SlowEnd,mean_pressure_first_half,median_pressure_first_half,std_pressure_first_half,max_pressure_first_half,max_diff_first_half,mean_pressure_second_half,median_pressure_second_half,max_pressure_second_half,std_pressure_second_half,max_diff_second_half
0,0_0_0,0,False,False,False,0.605873,0.864078,0.582492,1.584466,0.720388,0.144275,0.0,1.449515,0.355725,0.590291
33,0_0_0,-1,,,,0.230289,0.0,0.415121,2.439888,2.172414,0.187364,0.0,2.143523,0.380598,1.915191
568,0_0_0,215,False,False,False,0.710727,0.435973,0.729057,1.83871,0.677419,0.0,0.0,0.0,0.0,0.0
627,0_0_0,237,False,False,True,0.97754,1.337876,0.706656,1.927632,1.081767,0.023251,0.0,0.575188,0.088071,0.218985
845,0_0_0,353,False,False,False,0.344599,0.0,0.390084,1.186589,0.455782,0.615127,0.755102,1.275996,0.381526,0.677357


In [15]:
df_filtered.shape

(570, 15)

In [16]:
df_filtered['MeasurementId'].nunique()

570