In [1]:
import pandas as pd 
import numpy as np
from glob import glob 
# For file handling, specifically to retrieve file paths matching a specified pattern
import os
# For interacting with the operating system, such as file path manipulations

In [2]:
# Retrieve all CSV file paths from the specified directory
files = glob('/Users/sarithavuppula/Downloads/Data_Challenge_PHM2022_training_data 2/*.csv')

In [3]:
# Filter the files to categorize them by their measurement types: pin, po, and pdmp
pin_files = [file for file in files if 'data_pin' in file]
po_files = [file for file in files if 'data_po' in file]
pdmp_files = [file for file in files if 'data_pdmp' in file]

In [4]:
# Function to read a file and process its data into a DataFrame
def read_file(file_path, column='pin'):
    lines = [] # List to store each line from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            lines.append(line.strip()) # Read and strip each line, then append to the list
            
    # Extract the fault mode from each line
    fault_mode_list = [line.split(',')[0] for line in lines] 
    # Initialize lists to store measurements, cycles, modes, and conditions
    measurement_list = []
    cycle_list = []
    mode_list = []
    condition_list = []
    cycle = 1
    for line in lines:
        for mode in np.unique(fault_mode_list): # Iterate over each unique fault mode
            if line.split(',')[0] == mode:
                # Append measurements, cycle number, mode, and condition to respective lists
                measurement_list += line.split(',')[1:]
                cycle_list += [cycle]*len(line.split(',')[1:])
                mode_list += [mode]*len(line.split(',')[1:])
                condition_list += [file_path[-5]] * len(line.split(',')[1:])
            
                cycle+=1 # Increment cycle counter
    # Create a DataFrame from the lists, with specified columns    
    data = pd.DataFrame(np.concatenate([np.array(cycle_list).reshape(-1,1), np.array(mode_list).reshape(-1,1),  np.array(condition_list).reshape(-1,1), np.array(measurement_list).reshape(-1,1)], axis=1), columns=['cycle', 'mode', 'condition', column])                

    return data

In [5]:
# Initialize empty DataFrames to store integrated measurements
pin_measurement_integration = pd.DataFrame()
po_measurement_integration = pd.DataFrame()
pdmp_measurement_integration = pd.DataFrame()

# Process and integrate measurements from the first 5 files for each type
for i in range(5):  # Assume we are processing the first 5 files
    pin_measurement = read_file(pin_files[i], column='pin')  # Read and process pin file
    po_measurement = read_file(po_files[i], column='po')  # Read and process po file
    pdmp_measurement = read_file(pdmp_files[i], column='pdmp')  # Read and process pdmp file

    # Concatenate the measurements into the respective integrated DataFrames
    pin_measurement_integration = pd.concat([pin_measurement_integration, pin_measurement], axis=0)
    po_measurement_integration = pd.concat([po_measurement_integration, po_measurement], axis=0)
    pdmp_measurement_integration = pd.concat([pdmp_measurement_integration, pdmp_measurement], axis=0)

# Reset index to ensure proper alignment when merging
pin_measurement_integration.reset_index(drop=True, inplace=True)
po_measurement_integration.reset_index(drop=True, inplace=True)
pdmp_measurement_integration.reset_index(drop=True, inplace=True)

# Merge the DataFrames based on common columns (cycle, mode, condition)
data = pin_measurement_integration[['cycle', 'mode', 'condition']]
data['pin'] = pin_measurement_integration['pin']
data['po'] = po_measurement_integration['po']
data['pdmp'] = pdmp_measurement_integration['pdmp']

# Fill missing values using interpolation
data.interpolate(method='linear', axis=0, inplace=True)

# Convert the data types of specific columns to integers and floats
data[['cycle', 'mode', 'condition']] = data[['cycle', 'mode', 'condition']].astype(int)
data[['pin', 'po', 'pdmp']] = data[['pin', 'po', 'pdmp']].astype(float)

# Save the final integrated DataFrame to a CSV file
data.to_csv('/Users/sarithavuppula/Downloads/raw_data(train).csv', index=None)

TypeError: Cannot interpolate with all object-dtype columns in the DataFrame. Try setting at least one column to a numeric dtype.