<a href="https://colab.research.google.com/github/bhattadeb34/FACT_Data_Extraction_Scientific_Papers/blob/main/LLm_embeddings_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas sentence-transformers scikit-learn matplotlib seaborn umap-learn

In [None]:
!pip install openai

In [None]:
# Cell 1: Imports and Configuration (Modified)

import os
import re
import pandas as pd
import numpy as np
from openai import OpenAI
import google.generativeai as genai  # NEW: Import the Google AI library
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import matplotlib.pyplot as plt
#from tqdm.notebook import tqdm
from tqdm import tqdm
from collections import Counter

# --- Configuration ---
# Input CSV read by the loading cell, and the PNG written by the static PCA plot.
CSV_PATH = "master_conductivity_data.csv"
OUTPUT_FILENAME = "conductivity_pca_analysis.png"

# Choose your embedding provider here ("openai" or "gemini").
EMBEDDING_PROVIDER = "gemini"

# Select the model based on the provider. An unsupported provider is rejected
# right here; otherwise EMBEDDING_MODEL would stay undefined and the mistake
# would only show up later as a NameError in an unrelated cell.
if EMBEDDING_PROVIDER == "openai":
    EMBEDDING_MODEL = "text-embedding-ada-002"
elif EMBEDDING_PROVIDER == "gemini":
    EMBEDDING_MODEL = "gemini-embedding-001"
else:
    raise ValueError(
        f"Unknown provider: {EMBEDDING_PROVIDER}. Choose 'openai' or 'gemini'."
    )

# Embedding dimensionality per model. The embedding helpers use it to size the
# all-zero placeholder rows emitted for failed batches, and the Gemini helper
# passes it as the requested output dimensionality.
MODEL_CONFIG = {
    "text-embedding-ada-002": 1536,
    "gemini-embedding-001": 1536  # We will request this size for consistency
}

In [None]:
# Cell 2: API Key and Client Initialization (Modified)

def _load_api_key(secret_name, env_vars, label):
    """Look up an API key in Colab secrets first, then in environment variables.

    Args:
        secret_name: Name of the secret stored in the Colab secrets panel.
        env_vars: Environment variable names to try, in order, as a fallback.
        label: Human-readable provider name used in the status messages.

    Returns:
        The key string, or None when no source provides a non-empty value.
    """
    try:
        from google.colab import userdata
    except ImportError:
        userdata = None

    if userdata is None:
        print(f"Not in Colab, checking environment variable for {' / '.join(env_vars)}...")
    else:
        key = None
        try:
            key = userdata.get(secret_name)
        except Exception as exc:
            # Colab reports a missing or access-denied secret through its own
            # exception classes (SecretNotFoundError / NotebookAccessError),
            # which are not KeyError. Catching broadly here is deliberate so
            # the environment-variable fallback below is still attempted.
            print(f"Could not read '{secret_name}' from Colab secrets: {exc!r}")
        if key:
            print(f"Retrieved {label} API key from Colab secrets.")
            return key
        print(f"Colab secret unavailable, checking environment variable for {' / '.join(env_vars)}...")

    for env_var in env_vars:
        key = os.environ.get(env_var)
        if key:
            return key
    return None


api_key = None
client = None  # Only the OpenAI path creates a client object; Gemini is configured globally.

if EMBEDDING_PROVIDER == "openai":
    api_key = _load_api_key('OPENAI_API_KEY', ('OPENAI_API_KEY',), "OpenAI")
    if not api_key:
        raise ValueError("OpenAI API key not found.")
    client = OpenAI(api_key=api_key)
    print("OpenAI client initialized.")

elif EMBEDDING_PROVIDER == "gemini":
    # The Colab secret is called GOOGLE_API_KEY while the original environment
    # fallback used GOOGLE_GEMINI_API_KEY; accept either name from the
    # environment so both setups work.
    api_key = _load_api_key(
        'GOOGLE_API_KEY',
        ('GOOGLE_GEMINI_API_KEY', 'GOOGLE_API_KEY'),
        "Google Gemini",
    )
    if not api_key:
        raise ValueError("Google Gemini API key not found.")
    genai.configure(api_key=api_key)
    print("Google Gemini client configured.")

In [None]:
# Cell 3: Helper Functions (Updated)

def _parse_temperature_value(value):
    """
    Parses a single temperature string into a numerical Celsius value.
    """
    if pd.isna(value): return np.nan
    value_str = str(value).lower()
    if 'rt' in value_str or 'room' in value_str: return 25.0
    if 'assumed' in value_str or 'not specified' in value_str: return np.nan
    numbers = re.findall(r'[-+]?\d*\.\d+|\d+', value_str)
    if numbers:
        temp = float(numbers[0])
        if temp > 200: return temp - 273.15
        return temp
    return np.nan

def clean_and_standardize_temperature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive a numeric 'temperature_celsius' column from the raw 'temperature' column.

    Each raw entry is run through _parse_temperature_value. The new column is
    written onto the frame that was passed in, and that same frame is returned.
    A short progress report is printed along the way.
    """
    print("Standardizing temperature column...")

    parsed_temperatures = df['temperature'].apply(_parse_temperature_value)
    df['temperature_celsius'] = parsed_temperatures

    total_rows = len(df)
    parsed_count = df['temperature_celsius'].notna().sum()
    print(f"  -> Successfully parsed {parsed_count} / {total_rows} temperature entries.")

    return df

# In Cell 3 (Helper Functions)

def analyze_pca_by_word_correlation(df: pd.DataFrame, pca: PCA, provider: str, model_name: str, num_components: int = 2):
    """
    Analyzes PCA components by correlating them directly with the embeddings of
    individual words found within the dataset.

    A vocabulary is built from the 'feature_text' of every row whose
    'material_description' does not contain "Cited Work". Each word is embedded
    with the same provider/model, projected onto each principal axis, and the
    five highest- and lowest-scoring words per component are printed.

    Args:
        df: Frame with 'material_description' and 'feature_text' columns.
        pca: Fitted PCA model whose components_ are interpreted.
        provider: Embedding provider name, forwarded to get_embeddings.
        model_name: Embedding model name, forwarded to get_embeddings.
        num_components: How many leading components to describe. Values larger
            than the number of fitted components are reduced to that number.

    Returns:
        None. Results are printed.
    """
    print("\n--- Interpreting PCA via Word-Component Correlation ---")

    stop_words = set(ENGLISH_STOP_WORDS)
    # Words injected by the feature-text template itself carry no information.
    custom_stop_words = {'material', 'description', 'processing', 'method', 'na'}
    stop_words.update(custom_stop_words)

    interpretation_df = df[~df['material_description'].str.contains("Cited Work", na=False)]
    if interpretation_df.empty:
        print("Warning: No primary data found for interpretation.")
        return

    # dropna/astype guard against missing or non-string entries breaking the join.
    all_text = ' '.join(interpretation_df['feature_text'].dropna().astype(str))
    words = set(re.findall(r'\b[a-zA-Z]{3,}\b', all_text.lower()))
    vocabulary = sorted(words - stop_words)
    print(f"Built a vocabulary of {len(vocabulary)} unique terms from the data.")
    if not vocabulary:
        print("Warning: Vocabulary is empty; nothing to correlate with the PCA components.")
        return

    vocab_embeddings = np.asarray(get_embeddings(provider, model_name, vocabulary))

    # The embedding helpers emit all-zero rows for failed batches. Such rows
    # would always score exactly 0 and could be reported as pole words, so
    # they are excluded from the ranking.
    usable = np.any(vocab_embeddings != 0, axis=1)
    if not usable.any():
        print("Warning: No vocabulary embeddings could be generated; skipping interpretation.")
        return
    vocab_embeddings = vocab_embeddings[usable]
    vocabulary = [word for word, keep in zip(vocabulary, usable) if keep]

    # Never index past the components the PCA model actually holds.
    n_available = pca.components_.shape[0]
    if num_components > n_available:
        print(f"Warning: Only {n_available} PCA components available; reducing from {num_components}.")
        num_components = n_available

    for i in range(num_components):
        component_vector = pca.components_[i]
        # Raw projection onto the component axis. PCA's mean offset is the same
        # constant for every word, so it does not affect the ordering.
        alignment_scores = np.dot(vocab_embeddings, component_vector)
        sorted_indices = np.argsort(alignment_scores)

        positive_pole_words = [vocabulary[j] for j in sorted_indices[-5:][::-1]]
        negative_pole_words = [vocabulary[j] for j in sorted_indices[:5]]

        print(f"\n[+] Component {i+1} appears to distinguish between:")
        print(f"    - High values (Positive Pole) are defined by: {positive_pole_words}")
        print(f"    - Low values (Negative Pole) are defined by: {negative_pole_words}")

In [None]:
# Cell 3: Helper Functions (Modified)

# --- (Your _parse_temperature_value and clean_and_standardize_temperature functions remain here) ---

def get_openai_embeddings(texts: list[str], client: OpenAI, model_name: str, batch_size: int = 500) -> np.ndarray:
    """Generates embeddings using OpenAI's API.

    Texts are sent in batches of `batch_size`. If a batch fails, the error is
    printed and one all-zero row per text in that batch is emitted instead, so
    the output stays aligned with `texts`; callers filter those rows out.

    Args:
        texts: Strings to embed.
        client: Initialized OpenAI client.
        model_name: Embedding model to request.
        batch_size: Number of texts per API request.

    Returns:
        2-D array with one row per input text. For empty input the shape is
        (0, embedding_dim), so axis=1 reductions in callers still work.
    """
    embedding_dim = MODEL_CONFIG.get(model_name, 1536) # Default to 1536 if unknown
    if len(texts) == 0:
        return np.empty((0, embedding_dim))

    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Generating {model_name} Embeddings"):
        batch = texts[i:i + batch_size]
        try:
            response = client.embeddings.create(input=batch, model=model_name)
            embeddings = [item.embedding for item in response.data]
            all_embeddings.extend(embeddings)
        except Exception as e:
            # Deliberate best-effort: keep going and mark the batch as failed.
            print(f"An error occurred: {e}")
            all_embeddings.extend([[0.0] * embedding_dim for _ in batch])
    return np.array(all_embeddings)

def get_gemini_embeddings(texts: list[str], model_name: str, batch_size: int = 100) -> np.ndarray:
    """
    Generates embeddings using Gemini's API, following recommended practices.

    Texts are sent in batches of `batch_size` with task type
    SEMANTIC_SIMILARITY and the dimensionality from MODEL_CONFIG. Returned
    vectors are L2-normalized. If a batch fails, the error is printed and one
    all-zero row per text in that batch is emitted instead, so the output
    stays aligned with `texts`; callers filter those rows out.

    Args:
        texts: Strings to embed.
        model_name: Gemini embedding model, with or without the "models/" prefix.
        batch_size: Number of texts per API request.

    Returns:
        2-D array with one row per input text. For empty input the shape is
        (0, embedding_dim).
    """
    embedding_dim = MODEL_CONFIG.get(model_name, 1536)
    if len(texts) == 0:
        return np.empty((0, embedding_dim))

    # NOTE(review): google.generativeai expects fully qualified model names
    # ("models/..." or "tunedModels/..."). With a bare name every request would
    # end up in the except branch below and the whole run would silently yield
    # zero vectors. Confirm against the installed SDK version.
    api_model_name = model_name if "/" in model_name else f"models/{model_name}"

    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc=f"Generating {model_name} Embeddings"):
        batch = texts[i:i + batch_size]
        try:
            # Recommended practice: Specify task_type and output_dimensionality
            result = genai.embed_content(
                model=api_model_name,
                content=batch,
                task_type="SEMANTIC_SIMILARITY",
                output_dimensionality=embedding_dim
            )

            raw_vectors = result['embedding']
            if len(raw_vectors) != len(batch):
                raise ValueError(
                    f"Expected {len(batch)} embeddings, received {len(raw_vectors)}."
                )

            # Recommended practice: Normalize embeddings for dimensions other than 3072.
            # Vectors are collected per batch and appended only once the whole
            # batch succeeded, so a mid-batch error cannot leave extra rows
            # behind and shift the alignment with `texts`.
            batch_vectors = []
            for embedding in raw_vectors:
                embedding_np = np.array(embedding, dtype=float)
                norm = np.linalg.norm(embedding_np)
                if norm > 0:  # avoid 0/0 -> NaN for a degenerate vector
                    embedding_np = embedding_np / norm
                batch_vectors.append(embedding_np)
            all_embeddings.extend(batch_vectors)

        except Exception as e:
            # Deliberate best-effort: keep going and mark the batch as failed.
            print(f"An error occurred: {e}")
            all_embeddings.extend([np.zeros(embedding_dim) for _ in batch])

    return np.array(all_embeddings)

def get_embeddings(provider: str, model_name: str, texts: list[str]) -> np.ndarray:
    """Route an embedding request to the backend selected by `provider`.

    "openai" uses get_openai_embeddings with the module-level `client`;
    "gemini" uses get_gemini_embeddings. Any other value raises ValueError.
    """
    # Reject unsupported providers up front, then dispatch.
    if provider not in ("openai", "gemini"):
        raise ValueError(f"Unknown provider: {provider}. Choose 'openai' or 'gemini'.")

    if provider == "gemini":
        return get_gemini_embeddings(texts, model_name)
    return get_openai_embeddings(texts, client, model_name)


In [None]:
# Load the master CSV (path taken from CSV_PATH) into `df` and report how many
# rows were read.
df = pd.read_csv(CSV_PATH)
print(f"Loaded {len(df)} total data points from CSV.")

In [None]:
# Preview the first five rows of the loaded data.
df.head(5)

In [None]:
# Print every distinct value of the 'paper_title' column, one per line.
unique_paper_titles=df['paper_title'].unique()
for title in unique_paper_titles:
    print(title)

In [None]:
# Display the raw 'paper_title' column (one entry per row, duplicates included).
df['paper_title']

In [None]:
# --- STEP 1: Deduplicate papers by title ---
# Deduplication is keyed on 'paper_title' (not on a DOI column).
# Rows without a title are removed from `df` in place first.
df.dropna(subset=['paper_title'], inplace=True)

# keep='first' retains exactly one row, the first encountered, for every
# distinct title; .copy() detaches the result from `df`.
df_unique_papers = df.drop_duplicates(subset=['paper_title'], keep='first').copy()
unique_papers_before = df['paper_title'].nunique()

print(
    f"Filtered down to {len(df_unique_papers)} entries representing "
    f"{unique_papers_before} unique papers based on paper title."
)
# ---

In [None]:
# Display the deduplicated frame (one row per distinct paper title).
df_unique_papers

In [None]:
# --- STEP 2: Standardize temperatures on the deduplicated data ---
# Parses the free-text 'temperature' column into a numeric 'temperature_celsius'
# column. The helper writes the column onto the frame it receives and returns
# that same object, so df_cleaned and df_unique_papers refer to one DataFrame.
df_cleaned = clean_and_standardize_temperature(df_unique_papers)


In [None]:
# Keep only rows measured at room temperature, i.e. 20-30 °C with both bounds
# included. Rows whose temperature could not be parsed (NaN) fail both
# comparisons and are dropped as well.
rt_df = df_cleaned.loc[
    (df_cleaned['temperature_celsius'] >= 20)
    & (df_cleaned['temperature_celsius'] <= 30)
].copy()
print(f"Further filtered to {len(rt_df)} data points measured at Room Temperature (20-30°C).")

In [None]:
# Sanity check: list the distinct Celsius values that survived the room-temperature filter.
print(rt_df['temperature_celsius'].unique())

In [None]:
if not rt_df.empty:
    # Create the feature text for embedding using only the RT data.
    # Missing fields become empty strings so the concatenation never yields NaN.
    rt_df['feature_text'] = (
        "Material: " + rt_df['full_name'].fillna('') +
        ". Description: " + rt_df['material_description'].fillna('') +
        ". Processing: " + rt_df['processing_method'].fillna('')
    )

    # Generate embeddings using the dispatcher function
    embeddings = get_embeddings(
        provider=EMBEDDING_PROVIDER,
        model_name=EMBEDDING_MODEL,
        texts=rt_df['feature_text'].tolist()
    )

    # Filter out rows whose embedding failed. The embedding helpers mark a
    # failure with an all-zero row, so a row is valid when ANY component is
    # non-zero. (Requiring ALL components to be non-zero would also discard a
    # genuine embedding that merely contains a single 0.0.)
    valid_mask = np.any(embeddings != 0, axis=1)
    if not valid_mask.all():
        print(f"Warning: {int((~valid_mask).sum())} embeddings failed (all zeros). Removing them.")
    rt_df = rt_df[valid_mask].copy()
    embeddings = embeddings[valid_mask]

    print(f"\nSuccessfully generated embeddings for {len(rt_df)} room temperature data points.")
else:
    print("\nNo room temperature data found after filtering. Cannot proceed with embedding.")

# The 'rt_df' DataFrame is now ready for the next cell (PCA and visualization)

In [None]:
# Cell 5: Perform PCA and Interpret Results

# Perform PCA on the embeddings of the filtered room-temperature data
print("\nReducing embedding dimensions with PCA...")
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings)

# Add PCA columns to the 'rt_df' DataFrame
rt_df['pca_1'] = embeddings_2d[:, 0]
rt_df['pca_2'] = embeddings_2d[:, 1]

# --- Log10 of the conductivity ---
# log10(0) is -inf and log10(negative) is NaN. Zeros are handled by adding a
# tiny constant to every value before taking the log (same approach as before).
# Negative values have no logarithm: they stay NaN, but are now reported
# explicitly instead of surfacing only as a NumPy RuntimeWarning.
_conductivity = rt_df['ionic_conductivity_S_per_cm']
_has_zero = bool((_conductivity == 0).any())
_n_negative = int((_conductivity < 0).sum())

if _has_zero:
    print("Warning: Zero or non-positive conductivity values found. Adding a small constant for log scaling.")
    _conductivity = _conductivity + 1e-15

with np.errstate(divide='ignore', invalid='ignore'):
    log_values = np.log10(_conductivity.values)
rt_df['log_conductivity'] = log_values

if _n_negative:
    print(f"Warning: {_n_negative} negative conductivity value(s) have no logarithm; log_conductivity is NaN for them.")

# Run the analysis function, passing in the correct DataFrame and arguments
analyze_pca_by_word_correlation(rt_df, pca, EMBEDDING_PROVIDER, EMBEDDING_MODEL)

In [None]:
# Inspect the first five room-temperature rows, now including the PCA and log columns.
print(rt_df.head(5))

In [None]:
# Cell 6: Visualize and Save the Plot

# Scatter of the two principal components, colored by log10 conductivity,
# written to OUTPUT_FILENAME and then shown inline.
print(f"\nCreating visualization and saving to '{OUTPUT_FILENAME}'...")
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(14, 10))

scatter = ax.scatter(
    x=rt_df['pca_1'], y=rt_df['pca_2'], c=rt_df['log_conductivity'],
    cmap='viridis', s=50, alpha=0.8
)

cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('Log10(Ionic Conductivity [S/cm]) @ Room Temp', fontsize=12)

# The title names the model that actually produced the embeddings; it used to
# hard-code "ada-002" even when another provider was configured.
ax.set_title(
    f'PCA of Room Temperature Materials ({EMBEDDING_MODEL}) vs. Ionic Conductivity',
    fontsize=16, pad=20
)
ax.set_xlabel('Principal Component 1', fontsize=12)
ax.set_ylabel('Principal Component 2', fontsize=12)

# Save before show(): some backends clear the figure once it has been shown.
plt.savefig(OUTPUT_FILENAME, dpi=300, bbox_inches='tight')
print("Done.")
plt.show()

In [None]:
!pip install plotly

In [None]:
# Cell 6: Create an Interactive Visualization with Wrapped Text (Corrected)
import plotly.express as px
import textwrap

def wrap_text_for_plotly(text: str, width: int = 80) -> str:
    """Break `text` into lines of at most `width` characters, joined by HTML <br> tags."""
    # Split into display lines first, then glue them with the HTML line break.
    wrapped_lines = textwrap.wrap(text, width=width)
    html_break = '<br>'
    return html_break.join(wrapped_lines)

# --- Prepare Data for Hovering ---
# The tooltip renders HTML, so keep a <br>-wrapped copy of every feature text
# in a dedicated column.
rt_df['feature_text_wrapped'] = rt_df['feature_text'].apply(wrap_text_for_plotly)

print("\nCreating interactive visualization with multi-line tooltips...")

# Columns exposed to the tooltip. Their order defines the customdata[...]
# indices used by the hover template further down.
hover_columns = [
    'feature_text_wrapped',        # customdata[0]
    'acronym',                     # customdata[1]
    'ionic_conductivity_S_per_cm', # customdata[2]
    'paper_title',                 # customdata[3]
]
axis_labels = {
    "pca_1": "Principal Component 1",
    "pca_2": "Principal Component 2",
    "log_conductivity": "Log10(Conductivity)",
}

# Create the interactive scatter plot
fig = px.scatter(
    data_frame=rt_df,
    x='pca_1',
    y='pca_2',
    color='log_conductivity',
    color_continuous_scale=px.colors.sequential.Viridis,
    custom_data=hover_columns,
    labels=axis_labels,
    title='Interactive PCA of Room Temperature Materials',
)

# --- Define the Custom Hover Template ---
# HTML snippet that references the custom_data columns by index.
hover_template = (
    "<b>%{customdata[1]}</b><br><br>"                       # acronym in bold
    "<b>Description:</b><br>%{customdata[0]}<br><br>"       # wrapped feature text
    "<b>Conductivity:</b> %{customdata[2]:.2e} S/cm<br>"    # conductivity
    "<b>Paper:</b> %{customdata[3]}"                        # paper title
    "<extra></extra>"                                       # hides the default trace name
)
fig.update_traces(hovertemplate=hover_template)

# Improve the layout
fig.update_layout(width=900, height=700, title_font_size=20)

# Show the interactive plot directly in the notebook
fig.show()

# Also write the plot to an HTML file
html_output_filename = "interactive_conductivity_pca_wrapped.html"
fig.write_html(html_output_filename)
print(f"Interactive plot also saved to '{html_output_filename}'")

# From that alloy example-Leon

In [None]:
import pandas as pd
import numpy as np

def create_enhanced_feature_texts(df, strategy='comprehensive'):
    """
    Build one descriptive text per row, to be used as embedding input.

    Args:
        df: DataFrame with material data (your rt_df)
        strategy: Which template to use
            - 'comprehensive': name, acronym, class, description, processing, source
            - 'material_only': name, acronym, class
            - 'processing_focused': name, processing method, class
            - 'description_focused': name, description, class

    Returns:
        Series with feature text for each material

    Raises:
        ValueError: if `strategy` is not one of the four names above.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Creating feature text using '{strategy}' strategy")
    print(rule)

    def filled(column, placeholder):
        # Column with missing values replaced, so string concatenation never yields NaN.
        return df[column].fillna(placeholder)

    def build_comprehensive():
        return (
            "Material: " + filled('full_name', 'Unknown')
            + " (" + filled('acronym', '')
            + "). Class: " + filled('material_class', 'Unknown')
            + ". Description: " + filled('material_description', 'No description')
            + ". Processing: " + filled('processing_method', 'Not specified')
            + ". Source: " + filled('source_location', 'Unknown')
        )

    def build_material_only():
        return (
            "Material: " + filled('full_name', 'Unknown')
            + " (" + filled('acronym', '')
            + "). Material class: " + filled('material_class', 'Unknown')
        )

    def build_processing_focused():
        return (
            "Material: " + filled('full_name', 'Unknown')
            + ". Processing method: " + filled('processing_method', 'Not specified')
            + ". Material class: " + filled('material_class', 'Unknown')
        )

    def build_description_focused():
        return (
            "Material: " + filled('full_name', 'Unknown')
            + ". Description: " + filled('material_description', 'No description')
            + ". Class: " + filled('material_class', 'Unknown')
        )

    recipes = (
        ('comprehensive', build_comprehensive),
        ('material_only', build_material_only),
        ('processing_focused', build_processing_focused),
        ('description_focused', build_description_focused),
    )
    builder = next((build for name, build in recipes if strategy == name), None)
    if builder is None:
        raise ValueError(f"Unknown strategy: {strategy}. Choose from: 'comprehensive', 'material_only', 'processing_focused', 'description_focused'")

    # Only the selected template touches the frame, so columns used solely by
    # other strategies do not have to exist.
    feature_text = builder()

    # Show a few examples
    print(f"\n📊 Created {len(feature_text)} feature texts")
    print("\n📝 Example of first 3 materials:\n")
    for position, text in enumerate(feature_text.head(3), start=1):
        print(f"Material {position}:")
        print(f"  {text[:200]}...")  # First 200 chars
        print()

    # Summary of text lengths
    text_lengths = feature_text.str.len()
    print("📏 Text length statistics:")
    print(f"  Mean length: {text_lengths.mean():.0f} characters")
    print(f"  Min length:  {text_lengths.min():.0f} characters")
    print(f"  Max length:  {text_lengths.max():.0f} characters")

    return feature_text


# ============================================================================
# TEST THE FUNCTION
# ============================================================================

# Demo: build the feature text with each of the four strategies and keep the
# results in text_comprehensive / text_material / text_processing /
# text_description.
# NOTE(review): presumably this guard is true when the cell runs in a notebook
# kernel, in which case rt_df must already exist from the earlier cells — confirm.
if __name__ == '__main__':
    print("="*60)
    print("TESTING FEATURE TEXT CREATION")
    print("="*60)

    # Assuming you have your rt_df loaded, test each strategy
    print("\n🧪 Test 1: Comprehensive strategy")
    text_comprehensive = create_enhanced_feature_texts(rt_df, strategy='comprehensive')

    print("\n" + "="*60)
    print("\n🧪 Test 2: Material-only strategy")
    text_material = create_enhanced_feature_texts(rt_df, strategy='material_only')

    print("\n" + "="*60)
    print("\n🧪 Test 3: Processing-focused strategy")
    text_processing = create_enhanced_feature_texts(rt_df, strategy='processing_focused')

    print("\n" + "="*60)
    print("\n🧪 Test 4: Description-focused strategy")
    text_description = create_enhanced_feature_texts(rt_df, strategy='description_focused')

    print("\n" + "="*60)
    print("✅ All tests completed!")
    # The last two lines only print a usage hint; nothing is assigned to rt_df here.
    print(f"\nYou can now use these feature texts for embedding generation.")
    print(f"Example: rt_df['feature_text'] = create_enhanced_feature_texts(rt_df, 'comprehensive')")

In [None]:
# Build the 'comprehensive' feature text for rt_df.
text_comprehensive = create_enhanced_feature_texts(rt_df, strategy='comprehensive')

# Print the first row of the existing 'feature_text' column next to the first
# row of the new text, for a side-by-side comparison.
print("\nYour current feature text:")
print(rt_df['feature_text'].iloc[0])

print("\nNew comprehensive text:")
print(text_comprehensive.iloc[0])

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def generate_embeddings_with_pca(df, embedding_provider, embedding_model,
                                 feature_text_column='feature_text',
                                 n_pca_components=50):
    """
    Generate embeddings from text and apply PCA reduction with proper standardization.

    Args:
        df: DataFrame with material data
        embedding_provider: Provider name accepted by get_embeddings ('openai' or 'gemini')
        embedding_model: Model name
        feature_text_column: Column name containing the text to embed
        n_pca_components: Number of PCA components to keep (default: 50). Reduced
            automatically to min(n_samples, n_features), the most PCA can compute.

    Returns:
        tuple: (df_with_pca, pca_model, scaler_model, original_embeddings)
            - df_with_pca: Rows with a valid embedding, plus Embedding_PC1, Embedding_PC2, ... columns
            - pca_model: Fitted PCA model (for later analysis)
            - scaler_model: Fitted StandardScaler (for consistency)
            - original_embeddings: Raw embedding array before scaling/PCA (valid rows only)

    Raises:
        ValueError: if fewer than two valid embeddings are available.
    """

    print("="*80)
    print("GENERATING EMBEDDINGS WITH PCA REDUCTION")
    print("="*80)

    # Step 1: Generate embeddings
    print(f"\n🔄 Generating embeddings using {embedding_provider}/{embedding_model}...")
    embeddings = np.asarray(get_embeddings(
        provider=embedding_provider,
        model_name=embedding_model,
        texts=df[feature_text_column].tolist()
    ))

    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError("No embeddings were generated; cannot run PCA.")

    print(f"✅ Generated embeddings shape: {embeddings.shape}")
    print(f"   Samples: {embeddings.shape[0]}")
    print(f"   Original dimensions: {embeddings.shape[1]}")

    # Step 2: Filter out failed embeddings. A failure is an all-zero row, so a
    # row is valid when ANY component is non-zero. (Requiring ALL components to
    # be non-zero would also drop a genuine embedding containing a single 0.0.)
    valid_mask = np.any(embeddings != 0, axis=1)
    n_failed = (~valid_mask).sum()

    if n_failed > 0:
        print(f"⚠️  Warning: {n_failed} embeddings failed (all zeros). Removing them.")

    df_filtered = df[valid_mask].copy()
    embeddings_filtered = embeddings[valid_mask]

    print(f"✅ Valid embeddings: {len(df_filtered)}")

    if len(df_filtered) < 2:
        raise ValueError(
            f"Need at least 2 valid embeddings for PCA, got {len(df_filtered)}."
        )

    # Step 3: Standardize embeddings (IMPORTANT for PCA!)
    print(f"\n📊 Standardizing embeddings (zero mean, unit variance)...")
    scaler = StandardScaler()
    embeddings_scaled = scaler.fit_transform(embeddings_filtered)

    print(f"   Mean after scaling: {embeddings_scaled.mean():.6f}")
    print(f"   Std after scaling:  {embeddings_scaled.std():.6f}")

    # Step 4: Apply PCA. The component count is bounded by BOTH the number of
    # samples and the number of features; asking for more raises in scikit-learn.
    max_components = min(embeddings_filtered.shape)
    if n_pca_components > max_components:
        n_pca_components = max_components
        print(f"⚠️  Reduced n_components to {n_pca_components} (max available)")

    print(f"\n🔬 Applying PCA reduction: {embeddings_filtered.shape[1]} → {n_pca_components} dimensions...")
    pca = PCA(n_components=n_pca_components, random_state=42)
    embeddings_pca = pca.fit_transform(embeddings_scaled)

    # Step 5: Create DataFrame with PCA components, indexed like the filtered rows
    pca_columns = [f'Embedding_PC{i+1}' for i in range(n_pca_components)]
    pca_df = pd.DataFrame(embeddings_pca, columns=pca_columns, index=df_filtered.index)

    # Combine with original dataframe
    df_with_pca = pd.concat([df_filtered, pca_df], axis=1)

    # Step 6: Print PCA statistics
    print(f"\n✅ PCA completed!")
    print(f"   Final shape: {df_with_pca.shape}")
    print(f"   PCA components: {n_pca_components}")

    print(f"\n📈 Variance explained by first 10 components:")
    for i in range(min(10, n_pca_components)):
        print(f"   PC{i+1}: {pca.explained_variance_ratio_[i]:.4f} ({pca.explained_variance_ratio_[i]*100:.2f}%)")

    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    print(f"\n📊 Cumulative variance explained:")
    for threshold in [0.5, 0.7, 0.8, 0.9, 0.95]:
        reached = cumulative_variance >= threshold
        if reached.any():
            # argmax returns the first True position
            n_components_needed = int(np.argmax(reached)) + 1
            print(f"   {threshold*100:.0f}% variance: {n_components_needed} components")
        else:
            # argmax over all-False is 0, which used to be reported as "1 component"
            print(f"   {threshold*100:.0f}% variance: not reached with {n_pca_components} components")

    print(f"\n   Total variance explained by all {n_pca_components} components: {cumulative_variance[-1]:.4f}")

    # Step 7: Create variance explained plot
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Plot 1: Scree plot (individual variance), at most the first 20 components
    n_shown = min(20, n_pca_components)
    ax1.bar(range(1, n_shown + 1),
            pca.explained_variance_ratio_[:n_shown],
            alpha=0.7, color='steelblue', edgecolor='black')
    ax1.set_xlabel('Principal Component', fontsize=12)
    ax1.set_ylabel('Variance Explained Ratio', fontsize=12)
    ax1.set_title('Scree Plot: Variance per Component (First 20)', fontsize=13, fontweight='bold')
    ax1.grid(True, alpha=0.3)

    # Plot 2: Cumulative variance
    ax2.plot(range(1, n_pca_components+1), cumulative_variance, 'o-', linewidth=2, markersize=4)
    ax2.axhline(y=0.9, color='r', linestyle='--', label='90% variance')
    ax2.axhline(y=0.95, color='orange', linestyle='--', label='95% variance')
    ax2.set_xlabel('Number of Components', fontsize=12)
    ax2.set_ylabel('Cumulative Variance Explained', fontsize=12)
    ax2.set_title('Cumulative Variance Explained', fontsize=13, fontweight='bold')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"\n💡 TIP: You can now use 'Embedding_PC1', 'Embedding_PC2', etc. as features in your models")
    print(f"         or for visualization (e.g., scatter plot with PC1 vs PC2)")

    return df_with_pca, pca, scaler, embeddings_filtered


# ============================================================================
# TEST THE FUNCTION
# ============================================================================

# Demo: build the comprehensive feature text, embed it and reduce it with PCA.
# Leaves df_with_embeddings, pca_model, scaler_model and raw_embeddings defined.
if __name__ == '__main__':
    print("="*80)
    print("TESTING EMBEDDING + PCA PIPELINE")
    print("="*80)

    # First, create feature text (using the function from before)
    print("\n📝 Step 1: Creating feature text...")
    rt_df['feature_text_new'] = create_enhanced_feature_texts(rt_df, strategy='comprehensive')

    # Generate embeddings with PCA
    print("\n🚀 Step 2: Generating embeddings and applying PCA...")
    df_with_embeddings, pca_model, scaler_model, raw_embeddings = generate_embeddings_with_pca(
        df=rt_df,
        embedding_provider=EMBEDDING_PROVIDER,
        embedding_model=EMBEDDING_MODEL,
        feature_text_column='feature_text_new',
        n_pca_components=50  # Start with 50, adjust based on variance plot
    )

    print("\n" + "="*80)
    print("✅ EMBEDDING + PCA COMPLETED!")
    print("="*80)

    print(f"\n📋 New columns added:")
    embedding_cols = [col for col in df_with_embeddings.columns if col.startswith('Embedding_PC')]
    # Count the remainder from what is actually previewed, so fewer than five
    # components no longer prints a negative "more" count.
    preview_cols = embedding_cols[:5]
    print(f"   {preview_cols} ... (and {len(embedding_cols) - len(preview_cols)} more)")

    print(f"\n🎯 You now have:")
    print(f"   • df_with_embeddings: Your data with PCA components")
    print(f"   • pca_model: The fitted PCA model")
    print(f"   • scaler_model: The fitted StandardScaler")
    print(f"   • raw_embeddings: Original embedding vectors (before PCA)")

In [None]:
def visualize_pca_embeddings(df_with_embeddings, target_column='log_conductivity', pca_model=None):
    """
    Create comprehensive visualizations of PCA-reduced embeddings.

    Draws a 2x3 figure (PC1 vs PC2 by target, PC1 vs PC2 by material class,
    PC1 vs PC3 by target, target by material class, PC1 histogram, PC1 vs
    target with a linear trend), saves it to 'pca_embeddings_visualization.png',
    shows it, and prints correlation and per-class statistics.

    Args:
        df_with_embeddings: DataFrame with Embedding_PC columns and a
            'material_class' column
        target_column: Column to use for coloring (default: 'log_conductivity')
        pca_model: Optional fitted PCA model. When given, the axis labels of
            the PC scatter plots include the variance each component actually
            explains; when omitted, no percentage is shown (the labels used to
            hard-code percentages from one particular run).

    Returns:
        matplotlib figure
    """

    print("="*80)
    print("CREATING PCA VISUALIZATIONS")
    print("="*80)

    def _pc_label(pc_number):
        # Axis label for a component, with its explained variance when known.
        label = f'Principal Component {pc_number}'
        if pca_model is not None and pc_number <= len(pca_model.explained_variance_ratio_):
            ratio = pca_model.explained_variance_ratio_[pc_number - 1]
            label += f' ({ratio * 100:.1f}% var)'
        return label

    pc1 = df_with_embeddings['Embedding_PC1']
    target = df_with_embeddings[target_column]

    fig = plt.figure(figsize=(18, 12))

    # Plot 1: PC1 vs PC2 colored by conductivity
    ax1 = plt.subplot(2, 3, 1)
    scatter1 = ax1.scatter(pc1,
                          df_with_embeddings['Embedding_PC2'],
                          c=target,
                          cmap='viridis', s=100, alpha=0.7,
                          edgecolors='black', linewidth=0.5)
    cbar1 = plt.colorbar(scatter1, ax=ax1)
    cbar1.set_label('Log₁₀(Conductivity)', fontsize=10)
    ax1.set_xlabel(_pc_label(1), fontsize=11)
    ax1.set_ylabel(_pc_label(2), fontsize=11)
    ax1.set_title('PCA: Colored by Conductivity', fontsize=12, fontweight='bold')
    ax1.grid(True, alpha=0.3)

    # Plot 2: PC1 vs PC2 colored by material class
    ax2 = plt.subplot(2, 3, 2)
    # dropna: a NaN class never equals itself, so it would only add an empty legend entry
    material_classes = df_with_embeddings['material_class'].dropna().unique()
    colors = plt.cm.Set3(np.linspace(0, 1, len(material_classes)))

    for mat_class, color in zip(material_classes, colors):
        mask = df_with_embeddings['material_class'] == mat_class
        ax2.scatter(df_with_embeddings.loc[mask, 'Embedding_PC1'],
                   df_with_embeddings.loc[mask, 'Embedding_PC2'],
                   label=mat_class, color=color, s=100, alpha=0.7,
                   edgecolors='black', linewidth=0.5)

    ax2.set_xlabel('Principal Component 1', fontsize=11)
    ax2.set_ylabel('Principal Component 2', fontsize=11)
    ax2.set_title('PCA: Colored by Material Class', fontsize=12, fontweight='bold')
    if len(material_classes) > 0:
        ax2.legend(fontsize=9, loc='best')
    ax2.grid(True, alpha=0.3)

    # Plot 3: PC1 vs PC3 colored by conductivity (only if a third component exists)
    ax3 = plt.subplot(2, 3, 3)
    if 'Embedding_PC3' in df_with_embeddings.columns:
        scatter3 = ax3.scatter(pc1,
                              df_with_embeddings['Embedding_PC3'],
                              c=target,
                              cmap='plasma', s=100, alpha=0.7,
                              edgecolors='black', linewidth=0.5)
        cbar3 = plt.colorbar(scatter3, ax=ax3)
        cbar3.set_label('Log₁₀(Conductivity)', fontsize=10)
        ax3.set_xlabel(_pc_label(1), fontsize=11)
        ax3.set_ylabel(_pc_label(3), fontsize=11)
        ax3.grid(True, alpha=0.3)
    else:
        ax3.text(0.5, 0.5, 'Embedding_PC3 not available',
                 ha='center', va='center', transform=ax3.transAxes)
    ax3.set_title('PCA: PC1 vs PC3', fontsize=12, fontweight='bold')

    # Plot 4: Conductivity distribution by material class, ordered by median
    ax4 = plt.subplot(2, 3, 4)
    material_classes_sorted = df_with_embeddings.groupby('material_class')[target_column].median().sort_values().index

    # NaN targets are removed per class; boxplot statistics are undefined with NaN
    data_by_class = [df_with_embeddings[df_with_embeddings['material_class'] == mc][target_column].dropna().values
                     for mc in material_classes_sorted]

    if len(data_by_class) > 0:
        bp = ax4.boxplot(data_by_class, patch_artist=True)
        # Tick labels are set separately: the `labels=` keyword of boxplot is
        # deprecated in recent matplotlib releases.
        ax4.set_xticklabels([str(mc) for mc in material_classes_sorted])
        for patch, color in zip(bp['boxes'], plt.cm.Set3(np.linspace(0, 1, len(material_classes_sorted)))):
            patch.set_facecolor(color)

    ax4.set_ylabel('Log₁₀(Conductivity)', fontsize=11)
    ax4.set_title('Conductivity by Material Class', fontsize=12, fontweight='bold')
    ax4.tick_params(axis='x', rotation=45, labelsize=9)
    ax4.grid(True, alpha=0.3, axis='y')

    # Plot 5: PC1 distribution
    ax5 = plt.subplot(2, 3, 5)
    ax5.hist(pc1, bins=20, alpha=0.7,
            color='steelblue', edgecolor='black')
    ax5.axvline(pc1.mean(), color='red',
               linestyle='--', linewidth=2, label='Mean')
    ax5.set_xlabel('Principal Component 1', fontsize=11)
    ax5.set_ylabel('Frequency', fontsize=11)
    ax5.set_title('Distribution of PC1', fontsize=12, fontweight='bold')
    ax5.legend()
    ax5.grid(True, alpha=0.3)

    # Plot 6: Correlation between PC1 and conductivity
    ax6 = plt.subplot(2, 3, 6)
    ax6.scatter(pc1,
               target,
               alpha=0.6, s=80, color='coral', edgecolors='black', linewidth=0.5)

    # Add trend line, fitted on finite pairs only (NaN/inf would break polyfit)
    finite = np.isfinite(pc1) & np.isfinite(target)
    if finite.sum() >= 2:
        z = np.polyfit(pc1[finite], target[finite], 1)
        p = np.poly1d(z)
        x_line = np.linspace(pc1[finite].min(), pc1[finite].max(), 100)
        # "+.2f" prints the intercept with its own sign ("x-3.20" instead of "x+-3.20")
        ax6.plot(x_line, p(x_line), "r--", linewidth=2, label=f'Trend: y={z[0]:.2f}x{z[1]:+.2f}')
        ax6.legend(fontsize=9)

    # Calculate correlation
    corr = pc1.corr(target)
    ax6.set_xlabel('Principal Component 1', fontsize=11)
    ax6.set_ylabel('Log₁₀(Conductivity)', fontsize=11)
    ax6.set_title(f'PC1 vs Conductivity (r={corr:.3f})', fontsize=12, fontweight='bold')
    ax6.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('pca_embeddings_visualization.png', dpi=300, bbox_inches='tight')
    print("✅ Saved visualization to 'pca_embeddings_visualization.png'")
    plt.show()

    # Print correlation analysis (first 10 components, strongest first)
    print("\n" + "="*80)
    print("CORRELATION ANALYSIS: PCA COMPONENTS vs CONDUCTIVITY")
    print("="*80)

    pc_cols = [col for col in df_with_embeddings.columns if col.startswith('Embedding_PC')][:10]
    correlations = [(col, df_with_embeddings[col].corr(target))
                   for col in pc_cols]
    correlations.sort(key=lambda x: abs(x[1]), reverse=True)

    print("\nTop 10 PCs by correlation with conductivity:")
    for i, (pc, corr) in enumerate(correlations, 1):
        print(f"  {i:2d}. {pc:20s}: {corr:+.4f}")

    # Material class statistics
    print("\n" + "="*80)
    print("MATERIAL CLASS STATISTICS")
    print("="*80)

    class_stats = df_with_embeddings.groupby('material_class').agg({
        target_column: ['count', 'mean', 'std', 'min', 'max']
    }).round(3)
    print(class_stats)

    return fig


# ============================================================================
# TEST THE VISUALIZATION
# ============================================================================

# Demo: draw the PCA diagnostic figure for df_with_embeddings, colored by
# 'log_conductivity', and keep the returned figure in `fig`.
if __name__ == '__main__':
    print("="*80)
    print("VISUALIZING PCA EMBEDDINGS")
    print("="*80)

    # Create visualizations
    fig = visualize_pca_embeddings(df_with_embeddings, target_column='log_conductivity')

    print("\n✅ Visualization complete!")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

def quick_model_test(df_with_embeddings, n_pca_to_test=(5, 10, 20, 50),
                     target_column='log_conductivity', n_iterations=10):
    """
    Quick test: can embedding PCA components predict conductivity better than
    a predict-the-mean baseline?

    For every requested component count a Random Forest is trained on the first
    ``n`` ``Embedding_PC*`` columns over ``n_iterations`` random 80/20 splits.
    The same seeds (``42 + iteration``) are used for the baseline and for every
    Random Forest run, so all models are scored on identical test sets.

    Args:
        df_with_embeddings: DataFrame with ``Embedding_PC*`` columns and the
            target column.
        n_pca_to_test: Iterable of PCA component counts to test. Counts larger
            than the number of available components are skipped.
        target_column: Name of the target variable column.
        n_iterations: Number of random train/test splits (must be >= 1).

    Returns:
        Tuple ``(results_df, fig)``: one row per (model, iteration) with the
        columns ``n_components``, ``model``, ``iteration``, ``r2`` and ``mse``,
        and the Matplotlib figure that was also saved to
        ``embedding_model_test.png``.

    Raises:
        ValueError: If ``n_iterations`` is smaller than 1.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    banner = "=" * 80
    baseline_label = 'Baseline (mean)'

    print(banner)
    print("QUICK MODEL TEST: DO EMBEDDINGS HELP PREDICT CONDUCTIVITY?")
    print(banner)

    # Get available PCA columns
    all_pca_cols = [col for col in df_with_embeddings.columns if col.startswith('Embedding_PC')]
    max_available = len(all_pca_cols)

    print(f"\n📊 Testing with {n_iterations} random train/test splits (80/20)")
    print(f"   Available PCA components: {max_available}")

    results = []
    y = df_with_embeddings[target_column]

    # Baseline: predict the training-set mean (simplest possible model)
    print("\n🎯 Baseline: Always predict the mean...")
    for iteration in range(n_iterations):
        train_idx, test_idx = train_test_split(range(len(y)), test_size=0.2,
                                               random_state=42 + iteration)
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        y_pred_mean = np.full(len(y_test), y_train.mean())

        results.append({
            'n_components': 0,
            'model': baseline_label,
            'iteration': iteration + 1,
            'r2': r2_score(y_test, y_pred_mean),
            'mse': mean_squared_error(y_test, y_pred_mean)
        })

    # Only baseline records exist at this point.
    print(f"   Baseline R²: {np.mean([r['r2'] for r in results]):.4f}")

    # Test with different numbers of PCA components
    for n_pca in n_pca_to_test:
        if n_pca > max_available:
            print(f"\n⚠️  Skipping {n_pca} components (max available: {max_available})")
            continue

        print(f"\n🔬 Testing with {n_pca} PCA components...")

        X = df_with_embeddings[all_pca_cols[:n_pca]]

        config_records = []
        for iteration in range(n_iterations):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42 + iteration
            )

            rf = RandomForestRegressor(n_estimators=100, random_state=42 + iteration, n_jobs=-1)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_test)

            config_records.append({
                'n_components': n_pca,
                'model': f'RF ({n_pca} PCs)',
                'iteration': iteration + 1,
                'r2': r2_score(y_test, y_pred),
                'mse': mean_squared_error(y_test, y_pred)
            })

        results.extend(config_records)
        avg_r2 = np.mean([r['r2'] for r in config_records])
        avg_mse = np.mean([r['mse'] for r in config_records])
        print(f"   Avg R²: {avg_r2:.4f}, Avg MSE: {avg_mse:.4f}")

    results_df = pd.DataFrame(results)

    # Summary statistics
    print("\n" + banner)
    print("SUMMARY RESULTS")
    print(banner)

    summary = results_df.groupby(['model', 'n_components']).agg({
        'r2': ['mean', 'std'],
        'mse': ['mean', 'std']
    }).round(4)

    print("\n📊 Performance by model:")
    print(summary)

    # Create visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Plot 1: R² scores per iteration
    models = results_df['model'].unique()
    colors = plt.cm.tab10(np.linspace(0, 1, len(models)))

    for model, color in zip(models, colors):
        model_data = results_df[results_df['model'] == model]
        ax1.scatter(model_data['iteration'], model_data['r2'],
                    label=model, alpha=0.6, s=60, color=color)

    ax1.axhline(y=0, color='red', linestyle='--', linewidth=1, alpha=0.5)
    ax1.set_xlabel('Iteration', fontsize=11)
    ax1.set_ylabel('R² Score', fontsize=11)
    ax1.set_title('R² Performance Across Iterations', fontsize=12, fontweight='bold')
    ax1.legend(fontsize=9)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Boxplot comparison. Tick labels are set after plotting because the
    # boxplot `labels=` keyword is deprecated in recent Matplotlib releases.
    box_data = [results_df[results_df['model'] == model]['r2'].values for model in models]
    bp = ax2.boxplot(box_data, patch_artist=True)
    ax2.set_xticklabels([m.replace('RF (', '').replace(' PCs)', '') for m in models])

    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.6)

    ax2.axhline(y=0, color='red', linestyle='--', linewidth=1, alpha=0.5)
    ax2.set_ylabel('R² Score', fontsize=11)
    ax2.set_title('R² Distribution by Model', fontsize=12, fontweight='bold')
    ax2.tick_params(axis='x', rotation=45)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('embedding_model_test.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Interpretation
    print("\n" + banner)
    print("💡 INTERPRETATION")
    print(banner)

    mean_r2_by_model = results_df.groupby('model', sort=False)['r2'].mean()
    baseline_r2 = mean_r2_by_model[baseline_label]
    # The baseline must not compete for "best embedding model".
    embedding_r2 = mean_r2_by_model.drop(baseline_label)

    print(f"\n• Baseline (predicting mean): R² = {baseline_r2:.4f}")

    if embedding_r2.empty:
        print("\n⚠️  No embedding model was evaluated "
              "(every requested component count was skipped)")
        return results_df, fig

    best_model = embedding_r2.idxmax()
    best_r2 = embedding_r2.max()
    print(f"• Best embedding model: {best_model} with R² = {best_r2:.4f}")

    if best_r2 > 0.3:
        print("\n✅ GOOD: Embeddings can predict conductivity (R² > 0.3)")
        print("   The text descriptions contain useful information!")
    elif best_r2 > 0.1:
        print("\n⚠️  WEAK: Embeddings show weak predictive power (0.1 < R² < 0.3)")
        print("   Text has some signal, but may need additional features")
    else:
        print("\n❌ POOR: Embeddings don't predict well (R² < 0.1)")
        print("   The text descriptions may not capture conductivity drivers")

    print("\n🔍 Next steps:")
    if best_r2 < 0.3:
        print("   1. Add traditional features (temperature, processing indicators)")
        print("   2. Try different embedding strategies (material_only, processing_focused)")
        print("   3. Feature engineering from text (keyword extraction)")
    else:
        print("   1. Combine embeddings with traditional features")
        print("   2. Feature selection to find most important components")
        print("   3. Try different models (XGBoost, Neural Networks)")

    return results_df, fig


# ============================================================================
# RUN THE TEST
# ============================================================================

if __name__ == '__main__':
    # Evaluate Random Forests on the leading 5/10/20/50 PCs over 10 random splits.
    _quick_test_settings = {
        'n_pca_to_test': [5, 10, 20, 50],
        'target_column': 'log_conductivity',
        'n_iterations': 10,
    }
    results_df, fig = quick_model_test(df_with_embeddings, **_quick_test_settings)

In [None]:
def _element_pattern(symbol, keywords):
    """Compile a regex that detects one chemical element in a material name.

    A keyword identical to ``symbol`` (e.g. ``'Li'``) is matched
    case-sensitively and must not be followed by a lowercase letter, so it
    matches ``Li7La3Zr2O12`` or ``LiTFSI`` but not ordinary words. Every other
    keyword (element names, acronyms) is matched as a case-insensitive
    substring.
    """
    alternatives = []
    for keyword in keywords:
        escaped = re.escape(keyword)
        if keyword == symbol:
            alternatives.append(f"{escaped}(?![a-z])")
        else:
            alternatives.append(f"(?i:{escaped})")
    return re.compile("|".join(alternatives))


def create_traditional_features(df):
    """
    Extract traditional (non-embedding) features that might predict conductivity.

    Args:
        df: DataFrame with the columns ``temperature_celsius``,
            ``material_class``, ``processing_method``, ``full_name``,
            ``acronym``, ``material_description`` and ``source_location``.

    Returns:
        DataFrame indexed like ``df`` containing only the engineered features:
        temperature, one-hot material class, processing keyword indicators,
        material-name statistics, description statistics, chemical element
        indicators and one-hot indicators for source locations that occur at
        least 3 times (missing locations count as ``'Unknown'``). Remaining
        missing values are filled with 0.
    """

    banner = "=" * 80
    print(banner)
    print("CREATING TRADITIONAL FEATURES")
    print(banner)

    features = pd.DataFrame(index=df.index)

    # ========================================================================
    # 1. TEMPERATURE (already numeric)
    # ========================================================================
    features['temperature_celsius'] = df['temperature_celsius']
    print("\n✅ Temperature feature added")

    # ========================================================================
    # 2. MATERIAL CLASS (one-hot encoding)
    # ========================================================================
    class_dummies = pd.get_dummies(df['material_class'], prefix='class', drop_first=False)
    features = pd.concat([features, class_dummies], axis=1)
    print(f"✅ Material class features added: {list(class_dummies.columns)}")

    # ========================================================================
    # 3. PROCESSING METHOD INDICATORS (keyword detection)
    # ========================================================================
    processing_keywords = {
        'sintering': ['sinter', 'sintered', 'sintering'],
        'casting': ['cast', 'casting'],
        'pressing': ['press', 'pressed', 'pressing', 'compression'],
        'annealing': ['anneal', 'annealed', 'annealing'],
        'drying': ['dry', 'dried', 'drying'],
        'heating': ['heat', 'heated', 'heating'],
        'milling': ['mill', 'milled', 'milling', 'ball-mill'],
        'mixing': ['mix', 'mixed', 'mixing', 'blend'],
        'electrospinning': ['electrospin'],
        'calcination': ['calcin'],
        'grinding': ['grind', 'ground'],
        'coating': ['coat', 'coated', 'coating']
    }

    processing_text = df['processing_method'].fillna('').str.lower()

    for feature_name, keywords in processing_keywords.items():
        features[f'process_{feature_name}'] = processing_text.apply(
            lambda text, kws=keywords: int(any(kw in text for kw in kws))
        )

    print(f"✅ Processing features added: {len(processing_keywords)} indicators")

    # ========================================================================
    # 4. MATERIAL NAME FEATURES (complexity indicators)
    # ========================================================================
    full_name = df['full_name'].fillna('')

    # Length of material name (proxy for complexity)
    features['material_name_length'] = full_name.str.len()

    # Has acronym
    features['has_acronym'] = df['acronym'].notna().astype(int)

    # Number of words in name (composite materials often have more words)
    features['name_word_count'] = full_name.str.split().str.len()

    print("✅ Material name features added: 3 features")

    # ========================================================================
    # 5. DESCRIPTION FEATURES (information richness)
    # ========================================================================
    description_length = df['material_description'].fillna('').str.len()
    features['description_length'] = description_length
    features['has_description'] = (description_length > 0).astype(int)

    print("✅ Description features added: 2 features")

    # ========================================================================
    # 6. CHEMICAL ELEMENT INDICATORS (from material names)
    # ========================================================================

    common_elements = {
        'Li': ['Li', 'Lithium'],
        'La': ['La', 'Lanthanum'],
        'Zr': ['Zr', 'Zircon'],
        'Ti': ['Ti', 'Titan'],
        'Al': ['Al', 'Aluminum', 'Aluminium'],
        'Zn': ['Zn', 'Zinc'],
        'O': ['Oxide', 'Oxygen'],
        'F': ['Fluor', 'PVDF'],
        'Ga': ['Ga', 'Gallium'],
        'Ta': ['Ta', 'Tantalum'],
        'Nb': ['Nb', 'Niobium']
    }

    # Keep the original casing: element symbols are only meaningful when
    # capitalised. Lower-casing them turns 'Al', 'Ga', 'Ta', ... into
    # substrings of ordinary words ("metal", "garnet", "titanate").
    material_text = full_name + ' ' + df['acronym'].fillna('')

    for element, keywords in common_elements.items():
        pattern = _element_pattern(element, keywords)
        features[f'element_{element}'] = material_text.apply(
            lambda text, pat=pattern: int(pat.search(text) is not None)
        )

    print(f"✅ Chemical element features added: {len(common_elements)} indicators")

    # ========================================================================
    # 7. SOURCE LOCATION (where data was extracted from)
    # ========================================================================

    # Some sources might be more reliable than others
    source_filled = df['source_location'].fillna('Unknown')
    source_dummies = pd.get_dummies(source_filled, prefix='source', drop_first=False)

    # Only keep sources that appear at least 3 times. Counts are taken on the
    # same filled series the dummies were built from, and columns are selected
    # by exact name so "Table 1" does not also keep "source_Table 10".
    source_counts = source_filled.value_counts()
    frequent_cols = {f'source_{src}' for src in source_counts[source_counts >= 3].index}
    source_dummies = source_dummies[[col for col in source_dummies.columns
                                     if col in frequent_cols]]

    features = pd.concat([features, source_dummies], axis=1)
    print(f"✅ Source location features added: {len(source_dummies.columns)} indicators")

    # ========================================================================
    # SUMMARY
    # ========================================================================
    print("\n" + banner)
    print("FEATURE SUMMARY")
    print(banner)
    print(f"\nTotal features created: {len(features.columns)}")
    print(f"Samples: {len(features)}")
    print("\nFeature types:")
    print("  • Temperature: 1")
    print(f"  • Material class: {len(class_dummies.columns)}")
    print(f"  • Processing methods: {len(processing_keywords)}")
    print("  • Material name: 3")
    print("  • Description: 2")
    print(f"  • Chemical elements: {len(common_elements)}")
    print(f"  • Source location: {len(source_dummies.columns)}")

    # Check for missing values
    missing = features.isnull().sum()
    if missing.any():
        print("\n⚠️  Missing values found:")
        print(missing[missing > 0])
        print("\n   Filling missing values with 0...")
        features = features.fillna(0)
    else:
        print("\n✅ No missing values")

    return features


# ============================================================================
# TEST TRADITIONAL FEATURES
# ============================================================================

if __name__ == '__main__':
    # Evaluates the engineered (non-embedding) features with the same protocol
    # as the embedding test: 10 random 80/20 splits, Random Forest, R² and MSE.
    print("="*80)
    print("TESTING TRADITIONAL FEATURES")
    print("="*80)

    # Create traditional features
    traditional_features = create_traditional_features(df_with_embeddings)

    print("\n" + "="*80)
    print("TESTING: CAN TRADITIONAL FEATURES PREDICT CONDUCTIVITY?")
    print("="*80)

    # Combined frame (original columns + traditional features). It is not used
    # by the test below, only kept available under this name.
    df_with_traditional = df_with_embeddings.copy()
    df_with_traditional = pd.concat([df_with_traditional, traditional_features], axis=1)

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error, r2_score

    # Get traditional feature columns
    trad_cols = list(traditional_features.columns)

    # Preview at most five names; the remainder count can never go negative.
    preview_cols = trad_cols[:5]
    remaining_cols = len(trad_cols) - len(preview_cols)
    print(f"\n🔬 Testing Random Forest with {len(trad_cols)} traditional features...")
    print(f"   Features: {preview_cols} ... (and {remaining_cols} more)")

    X = traditional_features
    y = df_with_embeddings['log_conductivity']

    results = []
    for iteration in range(10):
        # Seeds match quick_model_test (42 + iteration), so splits are comparable.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42+iteration
        )

        rf = RandomForestRegressor(n_estimators=100, random_state=42+iteration, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        results.append({'r2': r2, 'mse': mse})

    avg_r2 = np.mean([r['r2'] for r in results])
    std_r2 = np.std([r['r2'] for r in results])
    avg_mse = np.mean([r['mse'] for r in results])

    print(f"\n📊 RESULTS:")
    print(f"   R² = {avg_r2:.4f} ± {std_r2:.4f}")
    print(f"   MSE = {avg_mse:.4f}")

    if avg_r2 > 0.3:
        print(f"\n✅ GOOD! Traditional features can predict conductivity")
    elif avg_r2 > 0.1:
        print(f"\n⚠️  WEAK: Some predictive power but limited")
    else:
        print(f"\n❌ POOR: Traditional features don't help much either")

    # Feature importance: fitted on ALL rows, so these importances describe the
    # full dataset and are not a held-out estimate.
    print(f"\n🎯 Training final model to check feature importance...")
    rf_final = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_final.fit(X, y)

    feature_importance = pd.DataFrame({
        'feature': trad_cols,
        'importance': rf_final.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"\n📊 Top 10 most important traditional features:")
    for _, row in feature_importance.head(10).iterrows():
        print(f"   {row['feature']:40s}: {row['importance']:.4f}")

In [None]:
def diagnose_dataset(df, target_column='log_conductivity'):
    """
    Comprehensive diagnostic to understand why models aren't working.

    Prints sample-size, temperature, material-class and data-quality
    statistics, draws a 2x3 panel of diagnostic plots (saved to
    ``dataset_diagnostic.png``) and prints recommendations.

    Rows whose ``material_class`` is missing are left out of the per-class
    statistics, the ANOVA and the class box plot. The variance decomposition
    uses population variances (``ddof=0``), so that within-class plus
    between-class variance equals the total variance whenever every row has a
    class label.

    Args:
        df: DataFrame with the columns ``target_column``,
            ``temperature_celsius``, ``material_class``, ``full_name``,
            ``material_description`` and ``processing_method``.
        target_column: Target variable column.

    Returns:
        None. All output is printed or plotted.
    """
    from scipy import stats  # local import: only this diagnostic needs scipy

    banner = "=" * 80
    rule = "-" * 80

    print(banner)
    print("DATASET DIAGNOSTIC ANALYSIS")
    print(banner)

    # ========================================================================
    # 1. SAMPLE SIZE AND VARIANCE
    # ========================================================================
    print("\n📊 SAMPLE SIZE AND TARGET VARIANCE")
    print(rule)

    y = df[target_column]
    print(f"Number of samples: {len(y)}")
    print(f"Target mean: {y.mean():.4f}")
    print(f"Target std: {y.std():.4f}")
    print(f"Target range: [{y.min():.4f}, {y.max():.4f}]")
    print(f"Coefficient of variation: {(y.std() / abs(y.mean())):.4f}")

    # Check for outliers (1.5 * IQR fences)
    q1, q3 = y.quantile([0.25, 0.75])
    iqr = q3 - q1
    outliers = ((y < q1 - 1.5*iqr) | (y > q3 + 1.5*iqr)).sum()
    print(f"Potential outliers (IQR method): {outliers} ({outliers/len(y)*100:.1f}%)")

    # ========================================================================
    # 2. TEMPERATURE ANALYSIS
    # ========================================================================
    print("\n🌡️  TEMPERATURE ANALYSIS")
    print(rule)

    temp = df['temperature_celsius']
    print(f"Temperature range: [{temp.min():.1f}, {temp.max():.1f}]°C")
    print(f"Temperature std: {temp.std():.4f}")
    print(f"Unique temperatures: {temp.nunique()}")

    if temp.std() < 5:
        print("⚠️  WARNING: Temperature has very low variance!")
        print("   All data is essentially at the same temperature.")
        print("   This means temperature cannot predict conductivity.")

    # ========================================================================
    # 3. MATERIAL CLASS DISTRIBUTION
    # ========================================================================
    print("\n🧪 MATERIAL CLASS DISTRIBUTION")
    print(rule)

    class_counts = df['material_class'].value_counts()
    print(class_counts)

    # (label, target values) per class in order of first appearance.
    # groupby drops rows with a missing class label.
    class_series = [
        (label, group[target_column])
        for label, group in df.groupby('material_class', sort=False)
    ]
    class_labels = [str(label) for label, _ in class_series]

    print(f"\n   Conductivity by class:")
    for label, values in zip(class_labels, (v for _, v in class_series)):
        print(f"   {label:12s}: mean={values.mean():7.3f}, std={values.std():6.3f}, n={len(values):3d}")

    # Check if classes overlap too much
    print("\n   Class separation (ANOVA F-statistic):")
    groups = [values.values for _, values in class_series]
    if len(groups) >= 2:
        f_stat, p_value = stats.f_oneway(*groups)
    else:
        # One-way ANOVA is undefined for fewer than two groups.
        f_stat = p_value = float('nan')
    print(f"   F-statistic: {f_stat:.4f}, p-value: {p_value:.4f}")

    if np.isnan(p_value):
        print("   ⚠️  ANOVA could not be computed (needs at least two classes with valid data)")
    elif p_value > 0.05:
        print("   ⚠️  Classes do NOT have significantly different conductivities")
    else:
        print("   ✅ Classes have significantly different conductivities")

    # ========================================================================
    # 4. CHECK DATA QUALITY ISSUES
    # ========================================================================
    print("\n🔍 DATA QUALITY CHECKS")
    print(rule)

    # Check for duplicate materials (every row of a duplicated name is counted)
    duplicates = df.duplicated(subset=['full_name'], keep=False).sum()
    print(f"Duplicate material names: {duplicates}")

    # Check text quality
    empty_descriptions = (df['material_description'].fillna('').str.len() == 0).sum()
    empty_processing = (df['processing_method'].fillna('').str.len() == 0).sum()

    print(f"Empty descriptions: {empty_descriptions} ({empty_descriptions/len(df)*100:.1f}%)")
    print(f"Empty processing methods: {empty_processing} ({empty_processing/len(df)*100:.1f}%)")

    # ========================================================================
    # 5. VISUALIZATION
    # ========================================================================
    print("\n📈 Creating diagnostic plots...")

    fig, axes = plt.subplots(2, 3, figsize=(18, 10))

    # Plot 1: Target distribution
    ax = axes[0, 0]
    ax.hist(y, bins=30, alpha=0.7, color='steelblue', edgecolor='black')
    ax.axvline(y.mean(), color='red', linestyle='--', linewidth=2, label='Mean')
    ax.axvline(y.median(), color='orange', linestyle='--', linewidth=2, label='Median')
    ax.set_xlabel('Log₁₀(Conductivity)', fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)
    ax.set_title('Target Variable Distribution', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Plot 2: Q-Q plot (check normality)
    ax = axes[0, 1]
    stats.probplot(y, dist="norm", plot=ax)
    ax.set_title('Q-Q Plot (Normality Check)', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)

    # Plot 3: Conductivity by material class. Tick labels are set after
    # plotting because the boxplot `labels=` keyword is deprecated in recent
    # Matplotlib releases.
    ax = axes[0, 2]
    bp = ax.boxplot(groups, patch_artist=True)
    ax.set_xticklabels(class_labels)
    for patch in bp['boxes']:
        patch.set_facecolor('lightblue')
    ax.set_ylabel('Log₁₀(Conductivity)', fontsize=11)
    ax.set_title('Conductivity by Material Class', fontsize=12, fontweight='bold')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

    # Plot 4: Sample size per class
    ax = axes[1, 0]
    class_counts.plot(kind='bar', ax=ax, color='coral', alpha=0.7, edgecolor='black')
    ax.set_ylabel('Number of Samples', fontsize=11)
    ax.set_title('Sample Size per Class', fontsize=12, fontweight='bold')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

    # Plot 5: Conductivity vs temperature
    ax = axes[1, 1]
    ax.scatter(df['temperature_celsius'], y, alpha=0.6, s=60, color='green', edgecolors='black')
    ax.set_xlabel('Temperature (°C)', fontsize=11)
    ax.set_ylabel('Log₁₀(Conductivity)', fontsize=11)
    ax.set_title('Conductivity vs Temperature', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)

    # Plot 6: Variance decomposition
    ax = axes[1, 2]

    # Law of total variance with population variances (ddof=0):
    #   total = sum_k (n_k / N) * var_k  +  between-class variance.
    # Sample variances (ddof=1) would not add up and are NaN for classes with
    # a single member.
    n_valid = y.count()
    total_var = y.var(ddof=0)
    within_class_var = sum(
        values.count() * values.var(ddof=0)
        for _, values in class_series
        if values.count() > 0
    ) / n_valid
    between_class_var = total_var - within_class_var

    # Shares of the total; undefined when the target is constant.
    if total_var > 0:
        within_share = within_class_var / total_var
        between_share = between_class_var / total_var
    else:
        within_share = between_share = float('nan')

    ax.bar(['Within\nClass', 'Between\nClass', 'Total'],
           [within_class_var, between_class_var, total_var],
           color=['lightcoral', 'lightgreen', 'lightblue'],
           alpha=0.7, edgecolor='black')
    ax.set_ylabel('Variance', fontsize=11)
    ax.set_title('Variance Decomposition', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)

    # Add percentage labels
    ax.text(0, within_class_var/2, f'{within_share*100:.1f}%',
            ha='center', fontsize=10, fontweight='bold')
    ax.text(1, between_class_var/2, f'{between_share*100:.1f}%',
            ha='center', fontsize=10, fontweight='bold')

    plt.tight_layout()
    plt.savefig('dataset_diagnostic.png', dpi=300, bbox_inches='tight')
    plt.show()

    # ========================================================================
    # 6. RECOMMENDATIONS
    # ========================================================================
    print("\n" + banner)
    print("💡 RECOMMENDATIONS")
    print(banner)

    issues = []
    recommendations = []

    if len(y) < 200:
        issues.append("Small sample size (<200)")
        recommendations.append("Collect more data if possible")

    if temp.std() < 5:
        issues.append("Temperature has no variance (all room temp)")
        recommendations.append("Filter out temperature as a feature OR collect data at different temps")

    # A NaN p-value compares False here, so an uncomputable ANOVA adds no issue.
    if p_value > 0.05:
        issues.append("Material classes don't separate conductivity well")
        recommendations.append("Look for other grouping variables (e.g., processing methods)")

    if outliers > len(y) * 0.1:
        issues.append(f"High number of outliers ({outliers/len(y)*100:.1f}%)")
        recommendations.append("Investigate and possibly remove extreme outliers")

    if within_share > 0.8:
        issues.append("Most variance is WITHIN classes, not between them")
        recommendations.append("Material class alone won't predict well - need other features")

    if issues:
        print("\n⚠️  Issues detected:")
        for i, issue in enumerate(issues, 1):
            print(f"   {i}. {issue}")

        print("\n🔧 Recommendations:")
        for i, rec in enumerate(recommendations, 1):
            print(f"   {i}. {rec}")
    else:
        print("\n✅ No major issues detected with dataset structure")

    print("\n" + banner)
    print("🎯 BOTTOM LINE")
    print(banner)

    if len(y) < 200 and within_share > 0.8:
        print("\nYour dataset has:")
        print("  • Small sample size")
        print("  • High within-class variance")
        print("  • Limited distinguishing features")
        print("\nThis makes prediction very difficult!")
        print("\nBest approach: Focus on DESCRIPTIVE analysis rather than prediction")
        print("  - What material classes perform best on average?")
        print("  - What processing methods correlate with high conductivity?")
        print("  - Can you identify patterns manually in the data?")
    else:
        print("\nYou have enough signal for prediction. Try:")
        print("  - Feature engineering (extract more from text)")
        print("  - Different embedding strategies")
        print("  - Combined traditional + embedding features")


# ============================================================================
# RUN DIAGNOSTIC
# ============================================================================

if __name__ == '__main__':
    # Run the dataset diagnostic on the embedded dataset, using the
    # log-conductivity column as the target.
    diagnose_dataset(df_with_embeddings, target_column='log_conductivity')

# Train ML model

In [None]:
!pip install xgboost

In [None]:
# Model-training imports (these could be consolidated into the Cell 1 import block)
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cross_decomposition import PLSRegression
import xgboost as xgb

In [None]:
# Define the features (X) and the target (y) from your cleaned, room-temperature DataFrame
# Features: the `embeddings` matrix; target: log-conductivity values of `rt_df`.
# NOTE(review): rows of `embeddings` are presumed to align one-to-one with
# `rt_df` (the cleaned, room-temperature DataFrame) -- confirm upstream.
X = embeddings
y = rt_df['log_conductivity'].values

# One seed shared by every estimator that accepts `random_state`.
_MODEL_SEED = 42

# Candidate regressors, keyed by display name. Insertion order is the order in
# which they are validated and plotted.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(alpha=0.1, random_state=_MODEL_SEED),
    "Ridge": Ridge(alpha=1.0, random_state=_MODEL_SEED),
    "Bayesian Ridge": BayesianRidge(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=_MODEL_SEED),
    "SVR": SVR(kernel='rbf'),
    "PLS Regression": PLSRegression(n_components=10),
    "XGBoost": xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=500,
        random_state=_MODEL_SEED,
    ),
}

In [None]:
# Set up 5-fold cross-validation
from sklearn.model_selection import cross_validate

# Set up 5-fold cross-validation (fixed seed -> identical folds for every model)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

results = []

print("Running 5-fold cross-validation for each model...")
for name, model in tqdm(models.items(), desc="Validating Models"):
    # A single cross_validate call scores both metrics on the same fitted
    # folds, instead of refitting every model once per metric.
    cv_scores = cross_validate(
        model, X, y, cv=kf,
        scoring=('r2', 'neg_mean_absolute_error'),
    )
    r2_scores = cv_scores['test_r2']
    # scikit-learn reports MAE negated (higher is better); flip the sign back.
    mae_scores = -cv_scores['test_neg_mean_absolute_error']

    results.append({
        "Model": name,
        "Mean R2": np.mean(r2_scores),
        "Std R2": np.std(r2_scores),
        "Mean MAE": np.mean(mae_scores),
        "Std MAE": np.std(mae_scores)
    })

# Display the results in a clean DataFrame, best mean R² first
results_df = pd.DataFrame(results).sort_values(by="Mean R2", ascending=False).reset_index(drop=True)
print("\n--- Cross-Validation Performance Summary ---")
display(results_df)

In [None]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Split the data one time for consistent visualization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One panel per model, four panels per row (2x4 for the eight default models).
n_cols = 4
n_rows = max(1, -(-len(models) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
axes = axes.flatten()  # Flatten the 2D array of axes for easy iteration

print("\n--- Generating Actual vs. Predicted Plots for Test Set ---")
for ax, (name, model) in zip(axes, models.items()):
    # Train on the training split. This fits the shared estimator in place.
    model.fit(X_train, y_train)

    # Flatten predictions: some estimators (e.g. PLSRegression on older
    # scikit-learn releases) return an (n_samples, 1) array.
    y_pred = np.ravel(model.predict(X_test))

    # Calculate test score
    test_r2 = r2_score(y_test, y_pred)

    # Create scatter plot
    sns.scatterplot(x=y_test, y=y_pred, ax=ax, alpha=0.7, s=50, label=f'Test Data (R²={test_r2:.2f})')

    # Plot the ideal fit line (y=x), with one unit of padding on each side
    lims = [min(y_test.min(), y_pred.min()) - 1, max(y_test.max(), y_pred.max()) + 1]
    ax.plot(lims, lims, 'r--', alpha=0.75, zorder=0, label='Ideal Fit')
    ax.set_xlim(lims)
    ax.set_ylim(lims)

    ax.set_title(f"{name} Performance", fontsize=12)
    ax.set_xlabel("Actual Log10(Conductivity)")
    ax.set_ylabel("Predicted Log10(Conductivity)")
    ax.set_aspect('equal', adjustable='box')
    ax.legend(loc='upper left')

# Hide panels left over when the model count is not a multiple of n_cols
for unused_ax in axes[len(models):]:
    unused_ax.set_visible(False)

# Adjust layout and display the plot
plt.tight_layout(pad=3.0)
plt.show()

In [None]:
# Cell: Predict on New, Hypothetical Materials (Corrected)

# --- 1. Automatically Select the Best Model ---
# Pick the model with the highest mean R-squared from the cross-validation table
# (single .loc lookup instead of chained indexing).
best_model_name = results_df.loc[results_df['Mean R2'].idxmax(), 'Model']
best_model = models[best_model_name]
print(f"Selected the best model based on cross-validation: '{best_model_name}'")

# --- 2. Retrain the Best Model on the Entire Dataset ---
# For final predictions, it's best practice to train the model on all available data
print(f"Retraining '{best_model_name}' on the full dataset...")
best_model.fit(X, y)
print("Model retraining complete.")

# --- 3. Define Hypothetical Materials and Get Embeddings ---
hypothetical_material_1 = "Material: Ga-doped LLZO. Description: Dense ceramic pellet with cubic garnet phase. Processing: High-temperature sintering at 1200°C for 5 hours."
hypothetical_material_2 = "Material: PEO with LiTFSI. Description: Amorphous polymer film. Processing: Solution casting followed by drying in vacuum at 60°C."
hypothetical_materials = [hypothetical_material_1, hypothetical_material_2]
# Short labels used in the report, in the same order as hypothetical_materials
hypothetical_labels = ["LLZO", "PEO"]

# Use the generic dispatcher function to get embeddings
new_embeddings = get_embeddings(
    provider=EMBEDDING_PROVIDER,
    model_name=EMBEDDING_MODEL,
    texts=hypothetical_materials
)

# --- 4. Make Predictions with the Best Model ---
# Flatten so that estimators returning an (n_samples, 1) array can still be
# indexed and formatted as scalars below.
predicted_log_conductivities = np.ravel(best_model.predict(new_embeddings))

# Convert log10 conductivity back to linear units for easier interpretation
predicted_conductivities = [10**log_val for log_val in predicted_log_conductivities]

# --- 5. Print the Results ---
print("\n--- Predictions for Hypothetical Materials ---")
print(f"Using model: {best_model_name}")
for position, label in enumerate(hypothetical_labels):
    print("-" * 40)
    print(f"Material {position + 1} ({label}):")
    print(f"  - Predicted Log Conductivity: {predicted_log_conductivities[position]:.2f}")
    print(f"  - Predicted Conductivity (S/cm): {predicted_conductivities[position]:.2e}")

In [None]:
# Cell: Re-evaluating Models with Scaling and PCA

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from tqdm.notebook import tqdm
import numpy as np

# --- 1. Create a new set of models wrapped in a preprocessing pipeline ---
def _with_scaling_and_pca(regressor):
    """Return a Pipeline that standardizes, applies PCA, then fits `regressor`."""
    preprocessing_steps = [
        ('scaler', StandardScaler()),
        # A float n_components asks PCA to keep enough components to explain
        # that fraction (here 95%) of the variance.
        ('pca', PCA(n_components=0.95, random_state=42)),
    ]
    return Pipeline(preprocessing_steps + [('regressor', regressor)])


models_with_pca = {}
print("Creating new models with StandardScaler and PCA preprocessing...")
# Every entry of `models` gets its own scaler and PCA step; the regressor object
# itself is the same instance that `models` holds.
for name, model in models.items():
    models_with_pca[name] = _with_scaling_and_pca(model)

# --- 2. Run 5-fold cross-validation on the new pipelines ---
# cross_validate evaluates several metrics from a single fit per fold. Calling
# cross_val_score once per metric would refit every pipeline on every fold twice
# and, for estimators without a fixed seed, score R2 and MAE on different fits.
from sklearn.model_selection import cross_validate

# Shuffled folds with a fixed seed so every pipeline is scored on the same splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pca_results = []

print("\nRunning 5-fold cross-validation on models with PCA...")
for name, pipeline in tqdm(models_with_pca.items(), desc="Validating PCA Models"):
    fold_scores = cross_validate(
        pipeline,
        X,
        y,
        cv=kf,
        scoring=('r2', 'neg_mean_absolute_error'),
    )
    r2_scores = fold_scores['test_r2']
    # scikit-learn reports MAE negated (so that higher is better); flip the sign back.
    mae_scores = -fold_scores['test_neg_mean_absolute_error']

    # Summarize each model by its mean score across the folds.
    pca_results.append({
        "Model": name,
        "Mean R2": np.mean(r2_scores),
        "Mean MAE": np.mean(mae_scores)
    })

pca_results_df = pd.DataFrame(pca_results)

# --- 3. Create a Comparison DataFrame ---
# Label each run's metric columns distinctly so both sets can sit side by side.
original_renamed = results_df.rename(
    columns={'Mean R2': 'R2 (Original)', 'Mean MAE': 'MAE (Original)'}
)
pca_renamed = pca_results_df.rename(
    columns={'Mean R2': 'R2 (With PCA)', 'Mean MAE': 'MAE (With PCA)'}
)

# Join the two result tables on the shared 'Model' column (inner join by default).
original_metrics = original_renamed[['Model', 'R2 (Original)', 'MAE (Original)']]
pca_metrics = pca_renamed[['Model', 'R2 (With PCA)', 'MAE (With PCA)']]
comparison_df = original_metrics.merge(pca_metrics, on='Model')

# Positive values mean the scaled + PCA pipeline scored a higher R2 than the original.
r2_delta = comparison_df['R2 (With PCA)'] - comparison_df['R2 (Original)']
comparison_df['R2 Change'] = r2_delta

# Rank by the PCA-pipeline R2, best first, and renumber the rows from zero.
comparison_df = comparison_df.sort_values(by='R2 (With PCA)', ascending=False)
comparison_df = comparison_df.reset_index(drop=True)

# --- 4. Display the Final Comparison ---
print("\n--- Performance Comparison: Original vs. Scaled + PCA ---")
display(comparison_df)