In [1]:
import pandas as pd 
import numpy as np

# Step 1: Load the DataFrames
# Every table is read from its own CSV file (paths are relative to the
# current working directory), in the order the target names are listed.
(
    systems_df,
    sensors_df,
    measurements_df,
    control_actions_df,
    signal_data_df,
    signal_characteristics_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'systems.csv',
        'sensors.csv',
        'measurements.csv',
        'control_actions.csv',
        'signal_data.csv',
        'signal_characteristics.csv',
    )
)

# Step 2: Data Cleaning

# Function to handle missing values intelligently
def clean_data(df):
    """Fill missing values in ``df`` in place and return it.

    Numeric columns are linearly interpolated; gaps at the very start or end
    of a column take the nearest valid value. Object columns are filled with
    their most frequent value. The per-column count of missing values is
    printed before and after cleaning.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    print(f"Initial missing values in {df.columns.tolist()}:")
    print(df.isnull().sum())

    # Interpolate missing numerical data for now, can look at it later again.
    # limit_direction='both' matters: the default only fills forward, which
    # leaves NaNs at the start of a column untouched.
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].interpolate(method='linear', limit_direction='both')

    # Fill categorical columns with the mode for now, can look at it later again.
    for col in df.select_dtypes(include=[object]).columns:
        modes = df[col].mode()
        if modes.empty:
            # No non-missing value exists, so there is no mode to fill with;
            # leave the column alone instead of failing on modes[0].
            continue
        # Assign the result back rather than calling fillna(inplace=True) on
        # df[col]: that chained in-place call acts on a temporary and does
        # not update df when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(modes.iloc[0])

    # Report what is left; a column with no valid values at all stays empty.
    print(f"Missing values after cleaning in {df.columns.tolist()}:")
    print(df.isnull().sum())

    return df

# Clean each DataFrame
# The frames are passed through clean_data one after another, in the order
# listed, and each name is rebound to the value clean_data returns for it.
(
    systems_df,
    sensors_df,
    measurements_df,
    control_actions_df,
    signal_data_df,
    signal_characteristics_df,
) = map(
    clean_data,
    (
        systems_df,
        sensors_df,
        measurements_df,
        control_actions_df,
        signal_data_df,
        signal_characteristics_df,
    ),
)

# Step 3: Check Data Types
def check_data_types(df):
    """Print the dtype of every column in ``df``."""
    column_dtypes = df.dtypes
    print(column_dtypes)

# Check data types for each DataFrame
# Each entry pairs the heading line to print with the frame whose column
# dtypes are printed right after it.
for heading, frame in (
    ("Data types for SYSTEMS DataFrame:", systems_df),
    ("Data types for SENSORS DataFrame:", sensors_df),
    ("Data types for MEASUREMENTS DataFrame:", measurements_df),
    ("Data types for CONTROL_ACTIONS DataFrame:", control_actions_df),
    ("Data types for SIGNAL_DATA DataFrame:", signal_data_df),
    ("Data types for SIGNAL_CHARACTERISTICS DataFrame:", signal_characteristics_df),
):
    print(heading)
    check_data_types(frame)


Initial missing values in ['system_id', 'system_name', 'system_type', 'description']:
system_id      1
system_name    1
system_type    0
description    0
dtype: int64
Missing values after cleaning in ['system_id', 'system_name', 'system_type', 'description']:
system_id      0
system_name    0
system_type    0
description    0
dtype: int64
Initial missing values in ['sensor_id', 'sensor_name', 'sensor_type', 'unit', 'system_id']:
sensor_id      0
sensor_name    0
sensor_type    0
unit           0
system_id      0
dtype: int64
Missing values after cleaning in ['sensor_id', 'sensor_name', 'sensor_type', 'unit', 'system_id']:
sensor_id      0
sensor_name    0
sensor_type    0
unit           0
system_id      0
dtype: int64
Initial missing values in ['measurement_id', 'sensor_id', 'timestamp', 'value']:
measurement_id    0
sensor_id         0
timestamp         0
value             0
dtype: int64
Missing values after cleaning in ['measurement_id', 'sensor_id', 'timestamp', 'value']:
measuremen