In [2]:
import pandas as pd
import numpy as np

def calculate_fluid_statistics(root_dir):
    """
    Calculate statistics for fluid experiments from CSV files.

    This function loads data from fluids.csv, experiments.csv, and fluid_measurements.csv, merges the data, and calculates mean, median, and standard deviation of pressure, velocity, temperature, and flow_rate for each unique fluid.

    Parameters:
    root_dir (str): The root directory containing the CSV files.

    Returns:
    np.array: A structured NumPy array containing the calculated statistics for each fluid.
        The array has the following fields:
        - fluid_id (int): The unique identifier for each fluid.
        - fluid_name (str): The name of the fluid.
        - pressure_mean, pressure_median, pressure_std (float): Statistics for pressure.
        - velocity_mean, velocity_median, velocity_std (float): Statistics for velocity.
        - temperature_mean, temperature_median, temperature_std (float): Statistics for temperature.
        - flow_rate_mean, flow_rate_median, flow_rate_std (float): Statistics for flow rate.
    """

    # Load the CSV files
    fluids = pd.read_csv(f'fluids.csv')
    experiments = pd.read_csv(f'experiments.csv')
    fluid_measurements = pd.read_csv(f'fluid_measurements.csv')

    # Merge the dataframes to get fluid_name associated with each measurement
    merged_data = fluid_measurements.merge(experiments, on='experiment_id').merge(fluids, on='fluid_id')

    # Group the data by fluid_id and fluid_name
    fluid_groups = merged_data.groupby(['fluid_id', 'fluid_name'])

    # List to hold statistics
    statistics = []

    # Iterate through each fluid group and calculate statistics
    for fluid_key, fluid_data in fluid_groups:
        fluid_id, fluid_name = fluid_key

        # Calculate mean, median, and std deviation for pressure, velocity, temperature, and flow_rate
        pressure_mean, pressure_median, pressure_std = fluid_data['pressure'].mean(), fluid_data['pressure'].median(), fluid_data['pressure'].std()
        velocity_mean, velocity_median, velocity_std = fluid_data['velocity'].mean(), fluid_data['velocity'].median(), fluid_data['velocity'].std()
        temperature_mean, temperature_median, temperature_std = fluid_data['temperature'].mean(), fluid_data['temperature'].median(), fluid_data['temperature'].std()
        flow_rate_mean, flow_rate_median, flow_rate_std = fluid_data['flow_rate'].mean(), fluid_data['flow_rate'].median(), fluid_data['flow_rate'].std()

        # Append calculated statistics for this fluid
        statistics.append((fluid_id, fluid_name,
                           pressure_mean, pressure_median, pressure_std,
                           velocity_mean, velocity_median, velocity_std,
                           temperature_mean, temperature_median, temperature_std,
                           flow_rate_mean, flow_rate_median, flow_rate_std))

    # Convert list of results to a structured NumPy array
    dtype = [('fluid_id', 'int'), ('fluid_name', 'U50'),
             ('pressure_mean', 'float'), ('pressure_median', 'float'), ('pressure_std', 'float'),
             ('velocity_mean', 'float'), ('velocity_median', 'float'), ('velocity_std', 'float'),
             ('temperature_mean', 'float'), ('temperature_median', 'float'), ('temperature_std', 'float'),
             ('flow_rate_mean', 'float'), ('flow_rate_median', 'float'), ('flow_rate_std', 'float')]

    result_array = np.array(statistics, dtype=dtype)

    return result_array
    pass

 # Call the function and print the results
result_array = calculate_fluid_statistics(root_dir='exercise_data') # change root_dir to where your data for this exercise is
print(result_array)


[( 1, 'Water', 7.60826500e+03, 7.608265e+03, 3.29175884e+03, 2.255     , 2.255, 1.66170094, 98.105     , 98.105,  2.66579257, 2.25500000e+00,  2.255, 1.66170094e+00)
 ( 2, 'Air', 7.27333333e+00, 8.050000e+00, 1.66674333e+00, 2.45666667, 1.48 , 2.21301454, 56.45      , 57.61 ,  5.54181378, 3.33333333e-03,  0.   , 5.77350269e-03)
 ( 3, 'Oil', 4.73930000e+03, 4.509840e+03, 1.59797794e+03, 2.745     , 2.455, 1.21804293, 47.13375   , 31.215, 32.14529467, 2.33375000e+00,  2.09 , 1.03549764e+00)
 ( 6, 'Mercury', 6.09956075e+04, 6.099207e+04, 1.50032666e+04, 2.3575    , 2.79 , 1.15707606, 58.9325    , 53.525, 18.58726692, 3.19350000e+01, 37.79 , 1.56723929e+01)
 ( 7, 'Acetone', 5.22426429e+03, 5.345930e+03, 2.50313074e+03, 2.49      , 1.99 , 1.86653869, 49.85428571, 51.72 , 23.57545253, 1.95142857e+00,  1.56 , 1.46147675e+00)
 ( 8, 'Benzene', 1.87583333e+03, 7.946300e+02, 2.07502941e+03, 3.17333333, 4.3  , 2.27035974, 45.77666667, 46.17 , 25.46227864, 2.78000000e+00,  3.77 , 1.98924609e+00)
 (

In [3]:
def get_experiments_and_correlation(root_dir, fluid_id):
    """
    Retrieves experiment IDs for a given fluid and calculates the correlation matrix of measurements.

    Parameters:
    root_dir (str): The root directory containing the CSV files.
    fluid_id (int): The ID of the fluid to analyze.

    Returns:
    tuple: A tuple containing two elements:
        - numpy.ndarray: An array of experiment IDs associated with the given fluid_id.
        - pandas.DataFrame: A correlation matrix of pressure, velocity, temperature, and flow_rate for the experiments associated with the given fluid_id.
    """
    # Load the CSV files
    experiments = pd.read_csv(f'experiments.csv')
    fluid_measurements = pd.read_csv(f'fluid_measurements.csv')
    
    # Filter experiments for the given fluid_id
    relevant_experiments = experiments[experiments['fluid_id'] == fluid_id]
    experiment_ids = relevant_experiments['experiment_id'].values
    
    # Filter fluid_measurements for the relevant experiment_ids
    relevant_measurements = fluid_measurements[fluid_measurements['experiment_id'].isin(experiment_ids)]
    
    # Calculate the correlation matrix for pressure, velocity, temperature, and flow_rate
    correlation_matrix = relevant_measurements[['pressure', 'velocity', 'temperature', 'flow_rate']].corr()
    
    return experiment_ids, correlation_matrix

root_dir = 'Part 1' 
fluid_id = 1

# Get experiment IDs and correlation matrix
experiment_ids, correlation_matrix = get_experiments_and_correlation(root_dir, fluid_id)

# Print results
print(f"Experiment IDs for fluid_id {fluid_id}:")
print(experiment_ids)
print("\nCorrelation Matrix:")
print(correlation_matrix)

Experiment IDs for fluid_id 1:
[ 8 13 20]

Correlation Matrix:
             pressure  velocity  temperature  flow_rate
pressure          1.0       1.0         -1.0        1.0
velocity          1.0       1.0         -1.0        1.0
temperature      -1.0      -1.0          1.0       -1.0
flow_rate         1.0       1.0         -1.0        1.0


In [4]:
def create_normalized_fluid_matrix(root_dir):
    """
    Create a normalized 5x3 matrix of fluid properties.

    This function reads fluid data from a CSV file, selects the first 5 fluids, and creates a matrix of their density, viscosity, and specific heat properties. The matrix is then normalized using min-max normalization.

    Parameters:
    root_dir (str): The root directory containing the fluids.csv file.

    Returns:
    numpy.ndarray: A 5x3 normalized matrix where each row represents a fluid and each column represents a normalized property (density, viscosity, specific_heat).
    """
    # Load the fluids.csv file
    fluids = pd.read_csv(f'fluids.csv')
    
    # Select the first 5 fluids and extract density, viscosity, and specific_heat columns
    fluid_properties = fluids[['density', 'viscosity', 'specific_heat']].head(5).values
    
    # Min-max normalization for each column (density, viscosity, specific_heat)
    min_values = fluid_properties.min(axis=0)
    max_values = fluid_properties.max(axis=0)
    
    normalized_matrix = (fluid_properties - min_values) / (max_values - min_values)
    
    return normalized_matrix

root_dir = 'Part 1'  # change root_dir to where your data for this exercise is
result_matrix = create_normalized_fluid_matrix(root_dir)

# Print the result
print("Normalized 5x3 Fluid Property Matrix:")
print(result_matrix)

# Print with column names for clarity
column_names = ['density', 'viscosity', 'specific_heat']
result_df = pd.DataFrame(result_matrix, columns=column_names)
print("\nNormalized Matrix with Column Names:")
print(result_df)


Normalized 5x3 Fluid Property Matrix:
[[7.93449981e-01 6.54607899e-04 1.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.74286509e-01 6.65468030e-03 3.12991507e-01]
 [6.25826697e-01 7.87942841e-04 4.38817238e-01]
 [1.00000000e+00 1.00000000e+00 4.38817238e-01]]

Normalized Matrix with Column Names:
    density  viscosity  specific_heat
0  0.793450   0.000655       1.000000
1  0.000000   0.000000       0.000000
2  0.674287   0.006655       0.312992
3  0.625827   0.000788       0.438817
4  1.000000   1.000000       0.438817


In [5]:
def analyze_fluid_properties(normalized_matrix):
    """
    Analyze the already normalized fluid properties by calculating the correlation matrix and performing eigenvalue decomposition.

    Parameters:
    normalized_matrix (numpy.ndarray): A pre-normalized matrix of fluid properties.

    Returns:
    tuple: A tuple containing:
        - correlation_matrix (numpy.ndarray): The correlation matrix of the normalized data.
        - eigenvalues (numpy.ndarray): The eigenvalues of the correlation matrix.
        - eigenvectors (numpy.ndarray): The eigenvectors of the correlation matrix.
    """
    # Calculate the correlation matrix from normalized data
    correlation_matrix = np.corrcoef(normalized_matrix.T)
    
    # Eigenvalue decomposition of the correlation matrix
    eigenvalues, eigenvectors = np.linalg.eig(correlation_matrix)
    
    return correlation_matrix, eigenvalues, eigenvectors

# Main execution
root_dir = 'Part 1'  # change this to your actual data directory

# Get the normalized matrix once
normalized_matrix = create_normalized_fluid_matrix(root_dir)

# Perform analysis on the normalized matrix
correlation_matrix, eigenvalues, eigenvectors = analyze_fluid_properties(normalized_matrix)

# Print results
print("Normalized 5x3 Fluid Property Matrix:")
print(normalized_matrix)

print("\nCorrelation Matrix:")
print(correlation_matrix)

print("\nEigenvalues:")
print(eigenvalues)

print("\nEigenvectors:")
print(eigenvectors)

# Interpret the results
total_variance = np.sum(eigenvalues)
explained_variance_ratio = eigenvalues / total_variance

print("\nExplained Variance Ratio:")
for i, ratio in enumerate(explained_variance_ratio):
    print(f"Principal Component {i+1}: {ratio:.4f}")

# Determine which property or combination explains the most variance
properties = ['density', 'viscosity', 'specific_heat']
max_component = np.argmax(np.abs(eigenvectors[:, 0]))
max_contribution = eigenvectors[max_component, 0]

print(f"\nThe property that contributes most to the first principal component is: {properties[max_component]}")
print(f"Its contribution is: {max_contribution:.4f}")

# If the contribution is not overwhelmingly large, print the combination
if max_contribution < 0.8:
    print("\nThe combination of properties that explains the most variance is:")
    for i, prop in enumerate(properties):
        print(f"{prop}: {eigenvectors[i, 0]:.4f}")

Normalized 5x3 Fluid Property Matrix:
[[7.93449981e-01 6.54607899e-04 1.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.74286509e-01 6.65468030e-03 3.12991507e-01]
 [6.25826697e-01 7.87942841e-04 4.38817238e-01]
 [1.00000000e+00 1.00000000e+00 4.38817238e-01]]

Correlation Matrix:
[[1.00000000e+00 5.70589790e-01 6.68644167e-01]
 [5.70589790e-01 1.00000000e+00 3.52574102e-04]
 [6.68644167e-01 3.52574102e-04 1.00000000e+00]]

Eigenvalues:
[1.87918363 0.12116456 0.99965181]

Eigenvectors:
[[ 7.07036753e-01  7.07176800e-01  6.30797704e-05]
 [ 4.59082256e-01 -4.58923489e-01 -7.60679114e-01]
 [ 5.37905672e-01 -5.37857049e-01  6.49128094e-01]]

Explained Variance Ratio:
Principal Component 1: 0.6264
Principal Component 2: 0.0404
Principal Component 3: 0.3332

The property that contributes most to the first principal component is: density
Its contribution is: 0.7070

The combination of properties that explains the most variance is:
density: 0.7070
viscosity: 0.4591
specific_h

In [6]:
def cosine_similarity(v1, v2):
    """Calculate cosine similarity between two vectors"""
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def find_most_similar_fluids(root_dir):
    """
    Find the two most similar fluids based on their properties.

    This function loads fluid data from a CSV file, computes the cosine similarity between each pair of fluids based on their density, viscosity, and specific heat,
    and returns the two most similar fluids along with their similarity score.

    Parameters:
    root_dir (str): The directory path where the fluids.csv file is located.

    Returns:
    tuple: A tuple containing three elements:
        - fluid1 (pandas.Series): The first fluid of the most similar pair.
        - fluid2 (pandas.Series): The second fluid of the most similar pair.
        - max_similarity (float): The cosine similarity between the two most similar fluids.
    """
    # Load fluid data
    fluids = pd.read_csv(f'fluids.csv')
    
    # Select properties (density, viscosity, specific_heat) and create matrix
    fluid_properties = fluids[['density', 'viscosity', 'specific_heat']].values
    n_fluids = fluid_properties.shape[0]
    
    max_similarity = -1
    fluid1 = None
    fluid2 = None
    
    # Iterate over all pairs of fluids to compute cosine similarity
    for i in range(n_fluids):
        for j in range(i + 1, n_fluids):
            similarity = cosine_similarity(fluid_properties[i], fluid_properties[j])
            if similarity > max_similarity:
                max_similarity = similarity
                fluid1 = fluids.iloc[i]
                fluid2 = fluids.iloc[j]
    
    return fluid1, fluid2, max_similarity

# Call the function
root_dir = 'Part 1'  # change root_dir to where your data for this exercise is
fluid1, fluid2, similarity = find_most_similar_fluids(root_dir)

# Print results
print(f"The two most similar fluids are:")
print(f"1. {fluid1['fluid_name']} (ID: {fluid1['fluid_id']})")
print(f"2. {fluid2['fluid_name']} (ID: {fluid2['fluid_id']})")
print(f"Cosine similarity: {similarity:.4f}")

print("\nTheir properties are:")
print(f"{'Property':<15} {'Fluid 1':<15} {'Fluid 2':<15}")
print("-" * 45)
for prop in ['density', 'viscosity', 'specific_heat']:
    print(f"{prop:<15} {fluid1[prop]:<15.4f} {fluid2[prop]:<15.4f}")


The two most similar fluids are:
1. Oil (ID: 3)
2. Propylene Glycol (ID: 9)
Cosine similarity: 0.9998

Their properties are:
Property        Fluid 1         Fluid 2        
---------------------------------------------
density         850.0000        1030.0000      
viscosity       10.0000         60.0000        
specific_heat   2000.0000       2500.0000      
