In [3]:
import joblib
import pandas as pd
import numpy as np
# The final pipeline object (clf) was saved as 'gene_effect_predictor_pipeline.joblib'

# 1. Load the model pipeline
try:
    predictor_pipeline = joblib.load('gene_effect_predictor_pipeline.joblib')
    print("Successfully loaded the predictive pipeline.")
except Exception as e:
    print(f"Error loading model: {e}. Ensure 'gene_effect_predictor_pipeline.joblib' is in the current directory.")

# 2. Define a function to predict a new gene modification
def predict_effect(gene_name: str, modification_type: str, species: str, pathway_description: str = "", kb_text: str = ""):
    """Uses the trained ML pipeline to predict the effect (label)."""
    
    # Create the input row based on the features used for training
    new_data = pd.DataFrame([{
        'combined_text': f"{pathway_description} {kb_text}",
        'modification_type': modification_type.lower(),
        'species': species.lower()
    }])
    
    # Note: We must clean the species to match the format used during training
    
    prediction = predictor_pipeline.predict(new_data)[0]
    return prediction

# --- DEMO EXAMPLE ---
# Let's test with a hypothetical input: a gene (e.g., MTOR) in human
gene = "MTOR"
mod = "CRISPR deletion"
spc = "Human"
desc = "Pro-survival pathway regulating metabolism, growth, and apoptosis." # Example pathway short_description
kb_info = "The MTOR gene is essential for cell growth."

ml_prediction = predict_effect(gene, mod, spc, desc, kb_info)
print(f"\n--- ML Prediction Demo ---")
print(f"Input: Gene '{gene}' ({mod}) in {spc}")
print(f"ML Predicted Outcome Label: {ml_prediction}")

Error loading model: [Errno 2] No such file or directory: 'gene_effect_predictor_pipeline.joblib'. Ensure 'gene_effect_predictor_pipeline.joblib' is in the current directory.


NameError: name 'predictor_pipeline' is not defined

In [4]:
import joblib
import pandas as pd
import numpy as np

# 1. LOAD THE TRAINED MODEL PIPELINE
# This line is crucial and fixes the NameError
try:
    predictor_pipeline = joblib.load('gene_effect_predictor_pipeline.joblib')
    print("‚úÖ Successfully loaded the predictive pipeline.")
except Exception as e:
    print(f"‚ùå Error loading model: {e}. Please ensure 'gene_effect_predictor_pipeline.joblib' is in the current directory.")
    # Exit or handle error if model cannot be loaded
    predictor_pipeline = None

# 2. DEFINE PREDICTION FUNCTIONS

def predict_effect(gene_name: str, modification_type: str, species: str, pathway_description: str = "", kb_text: str = ""):
    """Uses the trained ML pipeline to predict the effect (label)."""
    if predictor_pipeline is None:
        return "Model not loaded. Cannot predict."

    # Create the input row based on the features used for training
    new_data = pd.DataFrame([{
        'combined_text': f"{pathway_description} {kb_text}",
        'modification_type': modification_type.lower(),
        'species': species.lower()
    }])
    
    prediction = predictor_pipeline.predict(new_data)[0]
    return prediction

def map_outcome_rule(expected_outcome_text: str):
    """Rule-based classifier based purely on keyword matching in the outcome text."""
    if pd.isna(expected_outcome_text): return 'unknown'
    t = expected_outcome_text.lower()
    
    if any(w in t for w in ['reduces','loss','decrease','inhib','defect','tumorigenesis','dysfunction','neurodegeneration']):
        return 'harmful'
    if any(w in t for w in ['increased','enhanced','improved','adaptive','beneficial','restored','resistance']):
        return 'beneficial'
    if any(w in t for w in ['no change','neutral','unknown','albino phenotype']):
        return 'neutral'
        
    return 'other'


# 3. RUN THE DEMOS

# --- ML Prediction Demo ---
gene = "MTOR"
mod = "CRISPR deletion"
spc = "Human"
desc = "Pro-survival pathway regulating metabolism, growth, and apoptosis." 
kb_info = "The MTOR gene is essential for cell growth."

ml_prediction = predict_effect(gene, mod, spc, desc, kb_info)

print(f"\n--- ML Prediction Demo ---")
print(f"Input: Gene '{gene}' ({mod}) in {spc}")
print(f"ML Predicted Outcome Label: {ml_prediction}")

# --- Rule-Based Baseline Demo ---
example_outcome = "reduces DNA repair efficiency"
rule_prediction = map_outcome_rule(example_outcome)

print(f"\n--- Rule-Based Baseline Demo ---")
print(f"Input Outcome Text: '{example_outcome}'")
print(f"Rule-Based Predicted Label: {rule_prediction}")

‚ùå Error loading model: [Errno 2] No such file or directory: 'gene_effect_predictor_pipeline.joblib'. Please ensure 'gene_effect_predictor_pipeline.joblib' is in the current directory.

--- ML Prediction Demo ---
Input: Gene 'MTOR' (CRISPR deletion) in Human
ML Predicted Outcome Label: Model not loaded. Cannot predict.

--- Rule-Based Baseline Demo ---
Input Outcome Text: 'reduces DNA repair efficiency'
Rule-Based Predicted Label: harmful


In [5]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
import joblib

# --- 1. DATA LOADING AND PREPROCESSING (RECREATING THE MODEL OBJECTS) ---

print("Loading data and recreating the ML pipeline...")

# Load the data files
examples = pd.read_csv("examples.csv")
kb = pd.read_csv("kb_passages.csv")
pathways = pd.read_csv("pathways.csv")

# Data cleaning and Explosion (Steps 4 & 5)
examples['species'] = examples['species'].str.strip().str.lower()
def split_genes(s):
    if pd.isna(s): return []
    return [g.strip().upper() for g in re.split(r'[;,]', str(s)) if g.strip()]
pathways['genes_parsed'] = pathways['genes_list'].apply(split_genes)
pathway_gene = pathways.explode('genes_parsed').rename(columns={'genes_parsed':'gene_symbol'})
pathway_gene['gene_symbol'] = pathway_gene['gene_symbol'].str.upper()
pathway_gene = pathway_gene[['pathway_id','pathway_name','gene_symbol','short_description']]
pathway_gene.dropna(subset=['gene_symbol'], inplace=True)
examples['gene_symbol'] = examples['gene_name'].str.strip().str.upper()
examples_pathways = examples.merge(pathway_gene, how='left', left_on='gene_symbol', right_on='gene_symbol')

# Text Enrichment (Step 6)
examples_pathways = examples_pathways.merge(
    kb[['passage_id', 'text']].rename(columns={'text': 'kb_text_passage'}), 
    how='left', left_on='pathway_id', right_on='passage_id')
def find_passage_for_gene(g):
    mask = kb['text'].str.upper().str.contains(r'\b'+re.escape(g)+r'\b', na=False)
    hits = kb[mask]
    if not hits.empty: return hits.iloc[0]['text'] 
    return np.nan
examples_pathways['kb_text'] = examples_pathways['kb_text_passage'].fillna(
    examples_pathways.apply(lambda row: find_passage_for_gene(row['gene_symbol']) if pd.isna(row['kb_text_passage']) and not pd.isna(row['gene_symbol']) else np.nan, axis=1))
examples_pathways['kb_text'] = examples_pathways['kb_text_passage'].fillna(examples_pathways['kb_text'])

# Labeling (Step 7)
def map_outcome(text):
    if pd.isna(text): return np.nan
    t = text.lower()
    if any(w in t for w in ['reduces','loss','decrease','inhib','defect','tumorigenesis','dysfunction','neurodegeneration']): return 'harmful'
    if any(w in t for w in ['increased','enhanced','improved','adaptive','beneficial','restored','resistance']): return 'beneficial'
    if any(w in t for w in ['no change','neutral','unknown','albino phenotype']): return 'neutral'
    return 'other'
examples_pathways['label'] = examples_pathways['expected_outcome'].apply(map_outcome)

# Feature Engineering & Training (Step 8)
examples_pathways['combined_text'] = (examples_pathways['short_description'].fillna('') + ' ' + examples_pathways['kb_text'].fillna(''))
train_df = examples_pathways.dropna(subset=['label']).copy()
train_df_unique = train_df.groupby(['gene_name', 'modification_type']).first().reset_index()
train_df_unique = train_df_unique[train_df_unique['label'] != 'neutral'].copy()
X = train_df_unique[['combined_text','modification_type','species']]
y = train_df_unique['label']
text_transform = TfidfVectorizer(max_features=50, ngram_range=(1,1)) 
cat_transform = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer([
    ('txt', text_transform, 'combined_text'),
    ('cat', cat_transform, ['modification_type','species'])
], remainder='drop')
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Define the ML Pipeline and Train
predictor_pipeline = Pipeline([
    ('pre', preprocessor),
    ('lr', LogisticRegression(max_iter=1000, random_state=42)) 
])
predictor_pipeline.fit(X_train, y_train)

print("‚úÖ ML Pipeline successfully recreated and trained.")

# --- 2. DEFINE DEMO FUNCTIONS ---

def predict_effect(gene_name: str, modification_type: str, species: str, pathway_description: str = "", kb_text: str = ""):
    """Uses the newly trained ML pipeline to predict the effect (label)."""
    # Create the input row, ensuring species and modification are lowercased
    new_data = pd.DataFrame([{
        'combined_text': f"{pathway_description} {kb_text}",
        'modification_type': modification_type.lower(),
        'species': species.lower()
    }])
    prediction = predictor_pipeline.predict(new_data)[0]
    return prediction

def map_outcome_rule(expected_outcome_text: str):
    """Rule-based classifier (Baseline) for comparison."""
    if pd.isna(expected_outcome_text): return 'unknown'
    t = expected_outcome_text.lower()
    
    if any(w in t for w in ['reduces','loss','decrease','inhib','defect','tumorigenesis','dysfunction','neurodegeneration']):
        return 'harmful'
    if any(w in t for w in ['increased','enhanced','improved','adaptive','beneficial','restored','resistance']):
        return 'beneficial'
    if any(w in t for w in ['no change','neutral','unknown','albino phenotype']):
        return 'neutral'
    return 'other'

# --- 3. RUN THE FINAL DEMO ---

# ML Prediction Demo
gene = "MTOR"
mod = "CRISPR deletion"
spc = "Human"
# Use pathway info for MTOR (P0014) from your pathways.csv
desc = "Pro-survival pathway regulating metabolism, growth, and apoptosis." 
kb_info = "The MTOR gene is essential for cell growth."

ml_prediction = predict_effect(gene, mod, spc, desc, kb_info)

print(f"\n==================================================")
print(f"       ‚úÖ Prediction Demo Complete")
print(f"==================================================")
print(f"Input Gene: '{gene}' ({mod}) in {spc}")
print(f"ML Predicted Outcome Label: \033[1m{ml_prediction.upper()}\033[0m") # Use bold for emphasis

# Rule-Based Baseline Demo
example_outcome = "reduces DNA repair efficiency"
rule_prediction = map_outcome_rule(example_outcome)

print(f"\nRule-Based Baseline (Based on outcome text):")
print(f"Input Text: '{example_outcome}'")
print(f"Rule-Based Predicted Label: {rule_prediction.upper()}")
print(f"==================================================")

Loading data and recreating the ML pipeline...
‚úÖ ML Pipeline successfully recreated and trained.

       ‚úÖ Prediction Demo Complete
Input Gene: 'MTOR' (CRISPR deletion) in Human
ML Predicted Outcome Label: [1mBENEFICIAL[0m

Rule-Based Baseline (Based on outcome text):
Input Text: 'reduces DNA repair efficiency'
Rule-Based Predicted Label: HARMFUL


In [9]:
# Assuming the necessary dependencies are imported and setup is complete from prior steps.

import pandas as pd
import numpy as np

# --- REDEFINITION OF THE NARRATIVE GENERATOR (Final Syntax Fix) ---

def generate_narrative(gene: str, mod: str, species: str, pathway_context: str, ml_prediction: str):
    """
    Constructs a sophisticated prompt using all available data
    to generate a plausible biological narrative for the prediction.
    """
    
    # Define a variable for the newline character to simplify the f-string expression
    newline = '\n'
    path_name = pathway_context.split(':')[0].strip()
    
    # 1. Gather the key evidence
    evidence = [
        f"Gene: {gene}",
        f"Modification: {mod}",
        f"Species: {species}",
        f"Associated Pathway Context: {pathway_context}"
    ]
    
    # 2. Construct the structured prompt for the LLM
    # Use the 'newline' variable to join the evidence list
    prompt_template = f"""
    You are a highly experienced computational biologist. Your task is to generate a concise, scientifically plausible
    narrative (maximum 3 sentences) explaining the predicted outcome for a genetic modification.

    ---
    EVIDENCE:
    {newline.join(evidence)}

    PREDICTED OUTCOME LABEL: {ml_prediction.upper()}
    ---

    Based on the evidence and the predicted label, generate the explanatory narrative.
    The narrative should sound professional, connecting the gene's known pathway function to the modification type,
    and concluding with the predicted effect.
    """
    
    # 3. CALL TO THE GENERATIVE MODEL (Using the fixed, cleaner .format())
    # NOTE: You must replace the 'if/else' block below with your actual LLM API call
    
    if ml_prediction.upper() == "BENEFICIAL":
        # Using .format() with triple quotes to avoid f-string concatenation issues
        return """
            CRISPR deletion of {gene}, a key regulator in the {path_name} pathway, likely resulted in a targeted metabolic adjustment.
            This pathway perturbation caused a beneficial change, such as enhanced resilience or increased yield,
            consistent with genetic engineering strategies in {species}.
        """.format(gene=gene, path_name=path_name, species=species).strip()
    else:
        # Using .format() with triple quotes
        return """
            CRISPR deletion of {gene} in {species} is predicted to result in a {ml_prediction} outcome.
            Given its role in the {path_name} pathway, this modification likely introduced a loss-of-function defect,
            resulting in impaired growth or reduced efficiency.
        """.format(gene=gene, species=species, ml_prediction=ml_prediction.lower(), path_name=path_name).strip()


# --- FINAL DEMO RUN WITH NARRATIVE ---

# Reuse the MTOR example data
gene = "MTOR"
mod = "CRISPR deletion"
spc = "Human"
desc = "PI3K-AKT signaling: Pro-survival pathway regulating metabolism, growth, and apoptosis." 
ml_prediction = "BENEFICIAL" # Using the result from your last prediction

# Run the fixed narrative generator
generated_narrative = generate_narrative(gene, mod, spc, desc, ml_prediction)

print("\n==================================================")
print("     üß¨ Generative AI Prediction Demo")
print("==================================================")
print(f"Input: Gene '{gene}' ({mod}) in {spc}")
print(f"ML Predicted Outcome Label: \033[1m{ml_prediction.upper()}\033[0m")
print("\n[GENERATED NARRATIVE]")
print(generated_narrative)
print("==================================================")


     üß¨ Generative AI Prediction Demo
Input: Gene 'MTOR' (CRISPR deletion) in Human
ML Predicted Outcome Label: [1mBENEFICIAL[0m

[GENERATED NARRATIVE]
CRISPR deletion of MTOR, a key regulator in the PI3K-AKT signaling pathway, likely resulted in a targeted metabolic adjustment.
            This pathway perturbation caused a beneficial change, such as enhanced resilience or increased yield,
            consistent with genetic engineering strategies in Human.
