In [8]:
import pandas as pd 
import numpy as np

# Step 1: Load the DataFrames
systems_df = pd.read_csv('systems.csv')
sensors_df = pd.read_csv('sensors.csv')
measurements_df = pd.read_csv('measurements.csv')
control_actions_df = pd.read_csv('control_actions.csv')
signal_data_df = pd.read_csv('signal_data.csv')
signal_characteristics_df = pd.read_csv('signal_characteristics.csv')

# Step 2: Clean the data
def clean_data(df, df_name):
    print(f"Initial missing values in {df_name} DataFrame:")
    print(df.isnull().sum())
    
    if df_name == "SYSTEMS":
        # Interpolate missing system_id values
        if df['system_id'].isnull().any():
            df['system_id'] = df['system_id'].interpolate()
        
        # Use str.extract() to split the system_name into text and numeric parts
        split_names = df['system_name'].str.extract(r'(.+?)(\d+)$')
        df['text_part'] = split_names[0]
        df['num_part'] = pd.to_numeric(split_names[1], errors='coerce')

        # Interpolate the numeric part
        df['num_part'] = df['num_part'].interpolate()

        # Fill missing numeric parts with interpolated values and keep text parts intact
        for i in range(len(df)):
            if np.isnan(df['num_part'].iloc[i]) and np.isnan(df['system_id'].iloc[i]):
                # If both num_part and system_id are NaN, use the last known text and interpolate the number
                last_known_num = df['num_part'].ffill().iloc[i]
                df['num_part'].iloc[i] = last_known_num
        
        # Recombine the text and interpolated numeric parts
        df['system_name'] = df['text_part'] + ' ' + df['num_part'].fillna(0).astype(int).astype(str)

        # Drop temporary columns used for the operation
        df.drop(columns=['text_part', 'num_part'], inplace=True)
        

    # Confirm no missing values remain after specific cleaning
    print(f"Missing values after cleaning in {df_name} DataFrame:")
    print(df.isnull().sum())
    
    return df

# Clean the SYSTEMS DataFrame
systems_df = clean_data(systems_df, "SYSTEMS")
print(systems_df)

# Step 3: Check Data Types
def check_data_types(df):
    print(df.dtypes)

# Check data types for each DataFrame
print("Data types for SYSTEMS DataFrame:")
check_data_types(systems_df)

print("Data types for SENSORS DataFrame:")
check_data_types(sensors_df)

print("Data types for MEASUREMENTS DataFrame:")
check_data_types(measurements_df)

print("Data types for CONTROL_ACTIONS DataFrame:")
check_data_types(control_actions_df)

print("Data types for SIGNAL_DATA DataFrame:")
check_data_types(signal_data_df)

print("Data types for SIGNAL_CHARACTERISTICS DataFrame:")
check_data_types(signal_characteristics_df)

# Step 4: Convert Data Types if Necessary
def convert_data_types(df, df_name):
    # Convert system_id in SYSTEMS DataFrame
    if df_name == "SYSTEMS":
        df['system_id'] = df['system_id'].astype('Int64')  # Using Int64 for nullable integers
    # Convert timestamp columns to datetime
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    
    return df

# Convert data types for each DataFrame with names
systems_df = convert_data_types(systems_df, "SYSTEMS")
sensors_df = convert_data_types(sensors_df, "SENSORS")
measurements_df = convert_data_types(measurements_df, "MEASUREMENTS")
control_actions_df = convert_data_types(control_actions_df, "CONTROL_ACTIONS")
signal_data_df = convert_data_types(signal_data_df, "SIGNAL_DATA")
signal_characteristics_df = convert_data_types(signal_characteristics_df, "SIGNAL_CHARACTERISTICS")

# Check final data types
print("Final data types for SYSTEMS DataFrame:")
check_data_types(systems_df)
print("Final data types for SENSORS DataFrame:")
check_data_types(sensors_df)
print("Final data types for MEASUREMENTS DataFrame:")
check_data_types(measurements_df)
print("Final data types for CONTROL_ACTIONS DataFrame:")
check_data_types(control_actions_df)
print("Final data types for SIGNAL_DATA DataFrame:")
check_data_types(signal_data_df)
print("Final data types for SIGNAL_CHARACTERISTICS DataFrame:")
check_data_types(signal_characteristics_df)


Initial missing values in SYSTEMS DataFrame:
system_id      1
system_name    1
system_type    0
description    0
dtype: int64
Missing values after cleaning in SYSTEMS DataFrame:
system_id      0
system_name    1
system_type    0
description    0
dtype: int64
    system_id                  system_name       system_type  \
0         1.0   Advanced Control System  1       Fuzzy Logic   
1         2.0   Advanced Control System  2               PID   
2         3.0   Advanced Control System  3       Fuzzy Logic   
3         4.0   Advanced Control System  4          Adaptive   
4         5.0   Advanced Control System  5               PID   
5         6.0   Advanced Control System  6      Sliding Mode   
6         7.0   Advanced Control System  7               PID   
7         8.0   Advanced Control System  8               PID   
8         9.0   Advanced Control System  9               PID   
9        10.0  Advanced Control System  10       Fuzzy Logic   
10       11.0  Advanced Control Syste