In [1]:
!pip -q install pandas scikit-learn joblib

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

import joblib


In [2]:
DATA_PATH = "/content/flow2.txt"

df = pd.read_csv(DATA_PATH, sep="\t", engine="python")
print("Shape:", df.shape)
print("Columns:", list(df.columns))
df.head()


Shape: (271965, 18)
Columns: ['X [m]', 'Y [m]', 'Z [m]', 'Volume [m^3]', 'Surface [m^2]', 'Density (Fluid) [kg/m^3]', 'Pressure [Pa]', 'Temperature [K]', 'Temperature (Fluid) [K]', 'Velocity [m/s]', 'Velocity (X) [m/s]', 'Velocity (Y) [m/s]', 'Velocity (Z) [m/s]', 'Mach Number [ ]', 'Surface Heat Flux [W/m^2]', 'Total Enthalpy Flux [W/m^2]', 'Volumetric Heat Generation Rate [W/m^3]', 'Unnamed: 17']


Unnamed: 0,X [m],Y [m],Z [m],Volume [m^3],Surface [m^2],Density (Fluid) [kg/m^3],Pressure [Pa],Temperature [K],Temperature (Fluid) [K],Velocity [m/s],Velocity (X) [m/s],Velocity (Y) [m/s],Velocity (Z) [m/s],Mach Number [ ],Surface Heat Flux [W/m^2],Total Enthalpy Flux [W/m^2],Volumetric Heat Generation Rate [W/m^3],Unnamed: 17
0,-0.35383,-0.097752,-0.219093,7.284224e-07,,0.746102,101320.315,473.005384,473.005384,600.026442,-600.022794,-1.630521,-1.311166,1.381746,,,,
1,-0.347172,-0.097752,-0.219093,7.281935e-07,,0.746199,101335.586,473.015109,473.015109,599.981297,-599.977677,-1.660842,-1.258873,1.381628,,,,
2,-0.35383,-0.087779,-0.219093,6.915939e-07,,0.746071,101324.925,473.046498,473.046498,599.889562,-599.870719,-0.919991,4.664886,1.381372,,,,
3,-0.347172,-0.087779,-0.219093,6.913766e-07,,0.746047,101321.731,473.046988,473.046988,599.927335,-599.917816,-0.816421,3.279565,1.381459,,,,
4,-0.35383,-0.097752,-0.208666,6.924015e-07,,0.746234,101345.175,473.037712,473.037712,599.938288,-599.917981,4.775171,-1.250171,1.381497,,,,


In [3]:
# Clean column names (optional but helps)
df.columns = [c.strip() for c in df.columns]

# Convert everything to numeric where possible
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors="coerce")

print(df.isna().sum().sort_values(ascending=False).head(10))
df.describe().T.head(10)


Volumetric Heat Generation Rate [W/m^3]    271965
Unnamed: 17                                271965
Surface Heat Flux [W/m^2]                  257547
Total Enthalpy Flux [W/m^2]                250113
Surface [m^2]                              221277
Volume [m^3]                                50688
Mach Number [ ]                             25999
Velocity [m/s]                              25999
Density (Fluid) [kg/m^3]                    25999
Pressure [Pa]                               25999
dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
X [m],271965.0,-0.07493697,0.1692732,-0.3571597,-0.2140346,-0.1008675,0.06651456,0.243903
Y [m],271965.0,0.03065993,0.07869623,-0.1028678,-0.01757424,0.008849903,0.08565402,0.203782
Z [m],271965.0,-0.002856601,0.1132715,-0.2244386,-0.08895077,-0.003292548,0.08236589,0.215007
Volume [m^3],221277.0,3.660426e-07,1.703459e-07,4.555263e-20,2.924556e-07,3.595372e-07,4.568669e-07,1e-06
Surface [m^2],50688.0,3.04117e-05,2.438753e-05,9.684185e-14,8.611168e-06,4.236248e-05,4.45583e-05,0.000152
Density (Fluid) [kg/m^3],245966.0,0.7259162,0.07578338,0.1730612,0.7174443,0.746143,0.7547129,1.229078
Pressure [Pa],245966.0,99608.61,10421.49,23522.03,97275.23,101325.0,103500.1,194657.893
Temperature [K],245966.0,484.721,39.71551,379.8662,470.6792,473.0028,480.1877,665.541063
Temperature (Fluid) [K],245966.0,484.721,39.71551,379.8662,470.6792,473.0028,480.1877,665.541063
Velocity [m/s],245966.0,557.6876,146.5878,0.0,587.5535,599.9956,603.9207,741.365029


In [4]:
# Basic stats
print("Missing % per column:")
print((df.isna().mean() * 100).sort_values(ascending=False).round(2))

# Correlation with Temperature (if exists)
target_col = "Temperature [K]"
if target_col in df.columns:
    corr = df.corr(numeric_only=True)[target_col].sort_values(ascending=False)
    print("\nTop correlations with Temperature [K]:")
    print(corr.head(10))


Missing % per column:
Volumetric Heat Generation Rate [W/m^3]    100.00
Unnamed: 17                                100.00
Surface Heat Flux [W/m^2]                   94.70
Total Enthalpy Flux [W/m^2]                 91.97
Surface [m^2]                               81.36
Volume [m^3]                                18.64
Mach Number [ ]                              9.56
Velocity [m/s]                               9.56
Density (Fluid) [kg/m^3]                     9.56
Pressure [Pa]                                9.56
Velocity (Y) [m/s]                           9.56
Velocity (Z) [m/s]                           9.56
Temperature [K]                              9.56
Temperature (Fluid) [K]                      9.56
Velocity (X) [m/s]                           9.56
X [m]                                        0.00
Y [m]                                        0.00
Z [m]                                        0.00
dtype: float64

Top correlations with Temperature [K]:
Temperature [K]        

In [5]:
target_col = "Temperature [K]"

# Keep only numeric columns
num_df = df.select_dtypes(include=[np.number]).copy()

# Basic safety checks
if target_col not in num_df.columns:
    raise ValueError(f"Target column not found: {target_col}")

X = num_df.drop(columns=[target_col])
y = num_df[target_col]

print("X shape:", X.shape, "y shape:", y.shape)
print("Target range:", y.min(), "to", y.max())


X shape: (271965, 17) y shape: (271965,)
Target range: 379.866231 to 665.541063


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [8]:
model = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("rf", RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ))
])

# Drop rows where y_train is NaN, and apply the same index to X_train
valid_train_indices = y_train.dropna().index
X_train_cleaned = X_train.loc[valid_train_indices]
y_train_cleaned = y_train.loc[valid_train_indices]

model.fit(X_train_cleaned, y_train_cleaned)



In [10]:
pred = model.predict(X_test)

# Filter out NaN values from y_test and corresponding predictions
valid_test_indices = y_test.dropna().index
y_test_cleaned = y_test.loc[valid_test_indices]
pred_cleaned = pred[y_test.index.isin(valid_test_indices)]

mae = mean_absolute_error(y_test_cleaned, pred_cleaned)
r2 = r2_score(y_test_cleaned, pred_cleaned)

print("MAE:", mae)
print("R2 :", r2)



MAE: 0.001060042321491605
R2 : 0.9999999193042425


In [12]:
rf = model.named_steps["rf"]

# Identify columns that were skipped by the imputer due to being all NaN.
# These columns will not have corresponding feature importances.
skipped_cols = ['Volumetric Heat Generation Rate [W/m^3]', 'Unnamed: 17'] # From the Imputer warning

# Create an index for feature importances by excluding the skipped columns from X.columns
feature_names_for_importances = X.columns.drop(skipped_cols)

importances = pd.Series(rf.feature_importances_, index=feature_names_for_importances).sort_values(ascending=False)

print("Top 10 important features:")
print(importances.head(10))

Top 10 important features:
Temperature (Fluid) [K]     9.999886e-01
Mach Number [ ]             7.533339e-06
Velocity [m/s]              2.752949e-06
Velocity (X) [m/s]          4.193057e-07
Density (Fluid) [kg/m^3]    3.907647e-07
Pressure [Pa]               2.112125e-07
Z [m]                       7.165047e-08
Volume [m^3]                1.739312e-08
Velocity (Z) [m/s]          7.980393e-09
X [m]                       7.244209e-09
dtype: float64


In [13]:
joblib.dump(
    {"model": model, "feature_columns": list(X.columns), "target": target_col},
    "temperature_model.joblib"
)
print("Saved: temperature_model.joblib")


Saved: temperature_model.joblib


In [14]:
bundle = joblib.load("temperature_model.joblib")
loaded_model = bundle["model"]
feature_cols = bundle["feature_columns"]

# Example: take first 5 rows from original data as "new data"
new_data = num_df[feature_cols].head(5)

pred_temp = loaded_model.predict(new_data)
print("Predicted Temperature [K] (first 5):", pred_temp)


Predicted Temperature [K] (first 5): [473.0053804  473.01510635 473.04649362 473.04698505 473.03768488]




In [16]:
# Change target
target_col = "Pressure [Pa]"
num_df = df.select_dtypes(include=[np.number]).copy()

X = num_df.drop(columns=[target_col])
y = num_df[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Filter out NaN values from y_train and corresponding X_train
valid_train_indices = y_train.dropna().index
X_train_cleaned = X_train.loc[valid_train_indices]
y_train_cleaned = y_train.loc[valid_train_indices]

model.fit(X_train_cleaned, y_train_cleaned)

# Filter out NaN values from y_test and corresponding predictions for evaluation
valid_test_indices = y_test.dropna().index
pred = model.predict(X_test.loc[valid_test_indices]) # Predict only for non-NaN y_test rows
y_test_cleaned = y_test.loc[valid_test_indices]

mae = mean_absolute_error(y_test_cleaned, pred)
r2 = r2_score(y_test_cleaned, pred)

print("MAE:", mae)
print("R2 :", r2)



MAE: 125.92168579098143
R2 : 0.9946817545110431
