In [5]:
import pandas as pd
import numpy as np

def calculate_fluid_statistics(root_dir):
    """
    Calculate statistics for fluid experiments from CSV files.

    This function loads data from fluids.csv, experiments.csv, and fluid_measurements.csv, merges the data, and calculates mean, median, and standard deviation of pressure, velocity, temperature, and flow_rate for each unique fluid.

    Parameters:
    root_dir (str): The root directory containing the CSV files.

    Returns:
    np.array: A structured NumPy array containing the calculated statistics for each fluid.
        The array has the following fields:
        - fluid_id (int): The unique identifier for each fluid.
        - fluid_name (str): The name of the fluid.
        - pressure_mean, pressure_median, pressure_std (float): Statistics for pressure.
        - velocity_mean, velocity_median, velocity_std (float): Statistics for velocity.
        - temperature_mean, temperature_median, temperature_std (float): Statistics for temperature.
        - flow_rate_mean, flow_rate_median, flow_rate_std (float): Statistics for flow rate.
    """

    # Load the CSV files
    fluids = pd.read_csv(f'fluids.csv')
    experiments = pd.read_csv(f'experiments.csv')
    fluid_measurements = pd.read_csv(f'fluid_measurements.csv')

    # Merge the dataframes to get fluid_name associated with each measurement
    merged_data = fluid_measurements.merge(experiments, on='experiment_id').merge(fluids, on='fluid_id')

    # Group the data by fluid_id and fluid_name
    fluid_groups = merged_data.groupby(['fluid_id', 'fluid_name'])

    # List to hold statistics
    statistics = []

    # Iterate through each fluid group and calculate statistics
    for fluid_key, fluid_data in fluid_groups:
        fluid_id, fluid_name = fluid_key

        # Calculate mean, median, and std deviation for pressure, velocity, temperature, and flow_rate
        pressure_mean, pressure_median, pressure_std = fluid_data['pressure'].mean(), fluid_data['pressure'].median(), fluid_data['pressure'].std()
        velocity_mean, velocity_median, velocity_std = fluid_data['velocity'].mean(), fluid_data['velocity'].median(), fluid_data['velocity'].std()
        temperature_mean, temperature_median, temperature_std = fluid_data['temperature'].mean(), fluid_data['temperature'].median(), fluid_data['temperature'].std()
        flow_rate_mean, flow_rate_median, flow_rate_std = fluid_data['flow_rate'].mean(), fluid_data['flow_rate'].median(), fluid_data['flow_rate'].std()

        # Append calculated statistics for this fluid
        statistics.append((fluid_id, fluid_name,
                           pressure_mean, pressure_median, pressure_std,
                           velocity_mean, velocity_median, velocity_std,
                           temperature_mean, temperature_median, temperature_std,
                           flow_rate_mean, flow_rate_median, flow_rate_std))

    # Convert list of results to a structured NumPy array
    dtype = [('fluid_id', 'int'), ('fluid_name', 'U50'),
             ('pressure_mean', 'float'), ('pressure_median', 'float'), ('pressure_std', 'float'),
             ('velocity_mean', 'float'), ('velocity_median', 'float'), ('velocity_std', 'float'),
             ('temperature_mean', 'float'), ('temperature_median', 'float'), ('temperature_std', 'float'),
             ('flow_rate_mean', 'float'), ('flow_rate_median', 'float'), ('flow_rate_std', 'float')]

    result_array = np.array(statistics, dtype=dtype)

    return result_array
    pass

 # Call the function and print the results
result_array = calculate_fluid_statistics(root_dir='exercise_data') # change root_dir to where your data for this exercise is
print(result_array)


[( 1, 'Water', 7.60826500e+03, 7.608265e+03, 3.29175884e+03, 2.255     , 2.255, 1.66170094, 98.105     , 98.105,  2.66579257, 2.25500000e+00,  2.255, 1.66170094e+00)
 ( 2, 'Air', 7.27333333e+00, 8.050000e+00, 1.66674333e+00, 2.45666667, 1.48 , 2.21301454, 56.45      , 57.61 ,  5.54181378, 3.33333333e-03,  0.   , 5.77350269e-03)
 ( 3, 'Oil', 4.73930000e+03, 4.509840e+03, 1.59797794e+03, 2.745     , 2.455, 1.21804293, 47.13375   , 31.215, 32.14529467, 2.33375000e+00,  2.09 , 1.03549764e+00)
 ( 6, 'Mercury', 6.09956075e+04, 6.099207e+04, 1.50032666e+04, 2.3575    , 2.79 , 1.15707606, 58.9325    , 53.525, 18.58726692, 3.19350000e+01, 37.79 , 1.56723929e+01)
 ( 7, 'Acetone', 5.22426429e+03, 5.345930e+03, 2.50313074e+03, 2.49      , 1.99 , 1.86653869, 49.85428571, 51.72 , 23.57545253, 1.95142857e+00,  1.56 , 1.46147675e+00)
 ( 8, 'Benzene', 1.87583333e+03, 7.946300e+02, 2.07502941e+03, 3.17333333, 4.3  , 2.27035974, 45.77666667, 46.17 , 25.46227864, 2.78000000e+00,  3.77 , 1.98924609e+00)
 (

In [6]:
def get_experiments_and_correlation(root_dir, fluid_id):
    """
    Retrieves experiment IDs for a given fluid and calculates the correlation matrix of measurements.

    Parameters:
    root_dir (str): The root directory containing the CSV files.
    fluid_id (int): The ID of the fluid to analyze.

    Returns:
    tuple: A tuple containing two elements:
        - numpy.ndarray: An array of experiment IDs associated with the given fluid_id.
        - pandas.DataFrame: A correlation matrix of pressure, velocity, temperature, and flow_rate for the experiments associated with the given fluid_id.
    """
    # Load the CSV files
    experiments = pd.read_csv(f'experiments.csv')
    fluid_measurements = pd.read_csv(f'fluid_measurements.csv')
    
    # Filter experiments for the given fluid_id
    relevant_experiments = experiments[experiments['fluid_id'] == fluid_id]
    experiment_ids = relevant_experiments['experiment_id'].values
    
    # Filter fluid_measurements for the relevant experiment_ids
    relevant_measurements = fluid_measurements[fluid_measurements['experiment_id'].isin(experiment_ids)]
    
    # Calculate the correlation matrix for pressure, velocity, temperature, and flow_rate
    correlation_matrix = relevant_measurements[['pressure', 'velocity', 'temperature', 'flow_rate']].corr()
    
    return experiment_ids, correlation_matrix

root_dir = 'Part 1' 
fluid_id = 1

# Get experiment IDs and correlation matrix
experiment_ids, correlation_matrix = get_experiments_and_correlation(root_dir, fluid_id)

# Print results
print(f"Experiment IDs for fluid_id {fluid_id}:")
print(experiment_ids)
print("\nCorrelation Matrix:")
print(correlation_matrix)

Experiment IDs for fluid_id 1:
[ 8 13 20]

Correlation Matrix:
             pressure  velocity  temperature  flow_rate
pressure          1.0       1.0         -1.0        1.0
velocity          1.0       1.0         -1.0        1.0
temperature      -1.0      -1.0          1.0       -1.0
flow_rate         1.0       1.0         -1.0        1.0
