In [37]:
import pandas as pd
import json
import numpy as np
from scipy.interpolate import interp1d
from scipy.signal import savgol_filter



In [57]:
# Load JSON
with open('combined_papers.json') as f:
    data = json.load(f)
num_preview = 2  # change this to print more or fewer curves

if DEBUG:
    print("Top-level keys:", list(data.keys()))
    print("Total curves:", len(data.get('curves', [])))

    curves = data.get('curves', [])[:num_preview]

    for i, curve in enumerate(curves, start=1):
        print(f"\n=== Curve {i} / {num_preview} ===")

        # Show available keys at this curve level
        print("Curve keys:", list(curve.keys()))

        # Try to display alloy composition if present
        if 'alloy_composition' in curve:
            print("Alloy composition:", curve['alloy_composition'])

        # Safely get raw curve data
        raw = curve.get('curve_raw_data', {}).get('data', [])
        print("Raw points count:", len(raw))

        # Print first 5 (x, y) points
        print("First 5 raw points:")
        for p in raw[:5]:
            print(p)


# Convert to DataFrame (if you want tabular format)
#df = pd.json_normalize(data['curves'])

Top-level keys: ['curves']
Total curves: 392

=== Curve 1 / 2 ===
Curve keys: ['id', 'curve_id', 'curve_label', 'alloy_composition', 'curve_raw_data', 'Kocks–Mecking_hardening_parameters']
Alloy composition: {'Co': 20.0, 'Cr': 20.0, 'Fe': 20.0, 'Mn': 20.0, 'Ni': 20.0}
Raw points count: 50
First 5 raw points:
{'x': 0.011, 'y': 592.329}
{'x': 0.019, 'y': 614.075}
{'x': 0.031, 'y': 635.802}
{'x': 0.041, 'y': 661.89}
{'x': 0.054, 'y': 683.61}

=== Curve 2 / 2 ===
Curve keys: ['id', 'curve_id', 'curve_label', 'alloy_composition', 'curve_raw_data', 'Kocks–Mecking_hardening_parameters']
Alloy composition: {'Co': 20.0, 'Cr': 20.0, 'Fe': 20.0, 'Mn': 20.0, 'Ni': 20.0}
Raw points count: 35
First 5 raw points:
{'x': 0.009, 'y': 359.309}
{'x': 0.019, 'y': 383.219}
{'x': 0.031, 'y': 400.59}
{'x': 0.04, 'y': 415.796}
{'x': 0.051, 'y': 430.996}


In [61]:
# STEP 1: Define the feature extraction function FIRST
def extract_features_from_curve(curve_data):
    """
    Extract comprehensive features from a stress-strain curve.
    """
    # Check if the curve data exists and has enough points to compute features.
    # If the curve is missing (None or empty) or contains fewer than 6 data points,
    # return None to skip processing — this prevents errors when calculating
    # slopes, toughness, or other curve parameters that require multiple points.    
    if not curve_data or len(curve_data) < 6:
        return None
    
    #Convert the list of dictionaries into NumPy arrays for efficient numerical computation.
    # Each element in 'curve_data' contains {'x': strain, 'y': stress}.
    # This comprehension extracts all 'x' values (strain) and all 'y' values (stress) into separate arrays
    strains_unsorted = np.array([point['x'] for point in curve_data])
    stresses_unsorted = np.array([point['y'] for point in curve_data])

    

      
    # Sort by strain
    sort_idx = np.argsort(strains_unsorted)
    strains = strains_unsorted[sort_idx]
    stresses = stresses_unsorted[sort_idx]
    
    features = {}
    
    # === Basic Mechanical Properties ===
    features['ultimate_tensile_strength'] = np.max(stresses)
    features['max_strain'] = np.max(strains)
    features['uts_strain'] = strains[np.argmax(stresses)]
    
    # Yield strength approximation
    if strains[0] <= 0.002 <= strains[-1]:
        f_interp = interp1d(strains, stresses, kind='linear', fill_value='extrapolate')
        features['yield_strength_002'] = float(f_interp(0.002))
    else:
        features['yield_strength_002'] = stresses[0]
    
    # === Stress at specific strain points ===
    strain_points = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5]
    for sp in strain_points:
        if strains[0] <= sp <= strains[-1]:
            f_interp = interp1d(strains, stresses, kind='linear', fill_value='extrapolate')
            features[f'stress_at_{sp}'] = float(f_interp(sp))
        else:
            features[f'stress_at_{sp}'] = np.nan
    
    # === Work Hardening Analysis ===
    if len(strains) > 5:
        window = min(5, len(stresses) if len(stresses) % 2 == 1 else len(stresses) - 1)
        if window >= 3:
            stresses_smooth = savgol_filter(stresses, window, 2)
        else:
            stresses_smooth = stresses
        
        d_stress = np.diff(stresses_smooth)
        d_strain = np.diff(strains)
        hardening_rate = d_stress / (d_strain + 1e-10)
        
        features['avg_hardening_rate'] = np.mean(hardening_rate)
        features['max_hardening_rate'] = np.max(hardening_rate)
        features['min_hardening_rate'] = np.min(hardening_rate)
        features['std_hardening_rate'] = np.std(hardening_rate)
        
        n_points = len(hardening_rate)
        features['hardening_rate_early'] = np.mean(hardening_rate[:n_points//3])
        features['hardening_rate_mid'] = np.mean(hardening_rate[n_points//3:2*n_points//3])
        features['hardening_rate_late'] = np.mean(hardening_rate[2*n_points//3:])
    
    # === Energy Metrics ===
    features['toughness'] = np.trapz(stresses, strains)
    elastic_idx = int(len(strains) * 0.2)
    features['resilience'] = np.trapz(stresses[:elastic_idx], strains[:elastic_idx])
    
    # === Statistical Features ===
    features['stress_mean'] = np.mean(stresses)
    features['stress_std'] = np.std(stresses)
    features['stress_median'] = np.median(stresses)
    features['stress_25_percentile'] = np.percentile(stresses, 25)
    features['stress_75_percentile'] = np.percentile(stresses, 75)
    features['stress_range'] = np.max(stresses) - np.min(stresses)
    
    # === Curve Shape Features ===
    features['num_data_points'] = len(strains)
    features['strain_range'] = np.max(strains) - np.min(strains)
    
    n = len(strains)
    early_slope = (stresses[n//4] - stresses[0]) / (strains[n//4] - strains[0] + 1e-10)
    mid_slope = (stresses[n//2] - stresses[n//4]) / (strains[n//2] - strains[n//4] + 1e-10)
    late_slope = (stresses[-1] - stresses[n//2]) / (strains[-1] - strains[n//2] + 1e-10)
    
    features['early_slope'] = early_slope
    features['mid_slope'] = mid_slope
    features['late_slope'] = late_slope
    
    return features



In [62]:
# Extract features into DataFrame for ML (tabular)
feature_list = []
composition_list = []

for curve in data['curves']:
    # Extract features (this makes it tabular)
    features = extract_features_from_curve(curve['curve_raw_data']['data'])
    feature_list.append(features)
    composition_list.append(curve['alloy_composition'])

# NOW you have tabular data for ML
X = pd.DataFrame(feature_list)  # Features (tabular)
y = pd.DataFrame(composition_list)  # Target (tabular)

  features['toughness'] = np.trapz(stresses, strains)
  features['resilience'] = np.trapz(stresses[:elastic_idx], strains[:elastic_idx])
  slope = (y_hi - y_lo) / (x_hi - x_lo)[:, None]
  y_new = slope*(x_new - x_lo)[:, None] + y_lo


In [63]:
print("unsorted strain",strains_unsorted[:3])

NameError: name 'strains_unsorted' is not defined

In [None]:
import pandas as pd
import json
import numpy as np
from scipy.interpolate import interp1d
from scipy.signal import savgol_filter

# Load JSON
with open('combined_papers.json') as f:
    data = json.load(f)

# Define the feature extraction function
def extract_features_from_curve(curve_data):
    """Extract comprehensive features from a stress-strain curve."""
    if not curve_data or len(curve_data) < 3:
        return None
    
    # Convert to arrays
    strains = np.array([point['x'] for point in curve_data])
    stresses = np.array([point['y'] for point in curve_data])
    
    # Sort by strain
    sort_idx = np.argsort(strains)
    strains = strains[sort_idx]
    stresses = stresses[sort_idx]
    
    features = {}
    
    # === Basic Mechanical Properties ===
    features['ultimate_tensile_strength'] = np.max(stresses)
    features['max_strain'] = np.max(strains)
    features['uts_strain'] = strains[np.argmax(stresses)]
    
    # Yield strength approximation
    if strains[0] <= 0.002 <= strains[-1]:
        f_interp = interp1d(strains, stresses, kind='linear', fill_value='extrapolate')
        features['yield_strength_002'] = float(f_interp(0.002))
    else:
        features['yield_strength_002'] = stresses[0]
    
    # === Stress at specific strain points ===
    strain_points = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5]
    for sp in strain_points:
        if strains[0] <= sp <= strains[-1]:
            f_interp = interp1d(strains, stresses, kind='linear', fill_value='extrapolate')
            features[f'stress_at_{sp}'] = float(f_interp(sp))
        else:
            features[f'stress_at_{sp}'] = np.nan
    
    # === Work Hardening Analysis ===
    if len(strains) > 5:
        window = min(5, len(stresses) if len(stresses) % 2 == 1 else len(stresses) - 1)
        if window >= 3:
            stresses_smooth = savgol_filter(stresses, window, 2)
        else:
            stresses_smooth = stresses
        
        d_stress = np.diff(stresses_smooth)
        d_strain = np.diff(strains)
        hardening_rate = d_stress / (d_strain + 1e-10)
        
        features['avg_hardening_rate'] = np.mean(hardening_rate)
        features['max_hardening_rate'] = np.max(hardening_rate)
        features['min_hardening_rate'] = np.min(hardening_rate)
        features['std_hardening_rate'] = np.std(hardening_rate)
        
        n_points = len(hardening_rate)
        features['hardening_rate_early'] = np.mean(hardening_rate[:n_points//3])
        features['hardening_rate_mid'] = np.mean(hardening_rate[n_points//3:2*n_points//3])
        features['hardening_rate_late'] = np.mean(hardening_rate[2*n_points//3:])
    
    # === Energy Metrics ===
    features['toughness'] = np.trapz(stresses, strains)
    elastic_idx = int(len(strains) * 0.2)
    features['resilience'] = np.trapz(stresses[:elastic_idx], strains[:elastic_idx])
    
    # === Statistical Features ===
    features['stress_mean'] = np.mean(stresses)
    features['stress_std'] = np.std(stresses)
    features['stress_median'] = np.median(stresses)
    features['stress_25_percentile'] = np.percentile(stresses, 25)
    features['stress_75_percentile'] = np.percentile(stresses, 75)
    features['stress_range'] = np.max(stresses) - np.min(stresses)
    
    # === Curve Shape Features ===
    features['num_data_points'] = len(strains)
    features['strain_range'] = np.max(strains) - np.min(strains)
    
    n = len(strains)
    early_slope = (stresses[n//4] - stresses[0]) / (strains[n//4] - strains[0] + 1e-10)
    mid_slope = (stresses[n//2] - stresses[n//4]) / (strains[n//2] - strains[n//4] + 1e-10)
    late_slope = (stresses[-1] - stresses[n//2]) / (strains[-1] - strains[n//2] + 1e-10)
    
    features['early_slope'] = early_slope
    features['mid_slope'] = mid_slope
    features['late_slope'] = late_slope
    
    return features

# Extract features for all curves
feature_list = []
composition_list = []

for curve in data['curves']:
    features = extract_features_from_curve(curve['curve_raw_data']['data'])
    if features is not None:
        feature_list.append(features)
        composition_list.append(curve['alloy_composition'])

# Create DataFrames
X = pd.DataFrame(feature_list)
y = pd.DataFrame(composition_list)

print(f"✓ Feature extraction complete!")
print(f"Feature matrix shape: {X.shape}")
print(f"Target matrix shape: {y.shape}")