#### First, we prepare the dataset using the `orphanet_explorer` package: https://github.com/atinak/orphanet_explorer

The raw Orphanet XML files can be downloaded from https://www.orphadata.com/orphanet-scientific-knowledge-files/

In [None]:
from orphanet_explorer import OrphanetDataManager

In [None]:
# Orphanet XML exports to merge, keyed by the dataset name handed to the manager
xml_files = dict(
    references="data/references.xml",
    phenotype="data/en_phenotype.xml",
    consequences="data/en_funct_consequences.xml",
    natural_history="data/en_nat_hist_ages.xml",
    epidemiology="data/en_epidimiology_prev.xml",
)

# The manager is constructed with "output/" as its only argument
manager = OrphanetDataManager("output/")

# Process every XML file and write the merged dataset as a CSV
merged_data = manager.process_files(xml_files, output_file="merged_orphanet_data.csv")

###### Rare Disease Prediction System: Usage Example
##### This notebook demonstrates how to use the Rare Disease Prediction System. We'll cover:
##### 1.  **Data Loading and Processing:**  Using `OrphanetDataProcessor` to load and preprocess the raw Orphanet data.
##### 2.  **Feature Engineering:**  Applying the `FeatureEngineer` to transform the data into a suitable format for machine learning.
##### 3.  **Model Training:**  Training a machine learning model using `ModelTrainer`.
##### 4.  **Making Predictions:**  Using the `Predictor` class to make predictions on new data.
##### 5. **API Interaction (Optional):** A brief overview of how to query your prediction service.
##### Setup
###### First, we need to import the necessary classes and set up the paths.

In [None]:
import os
import sys
import pandas as pd
import warnings

# Silence every warning for the rest of the notebook session
warnings.filterwarnings("ignore")

# Make the parent of the current working directory importable so that the
# `src.*` imports below resolve when the notebook runs from its own folder
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_processor import OrphanetDataProcessor
from src.feature_engineering import FeatureEngineer
from src.model_training import ModelTrainer
from src.predictor import Predictor

#####  Data Loading and Processing

In [None]:
# Initialize the data processor, pointing it at the data directory one level up
# NOTE(review): the preparation cell builds OrphanetDataManager("output/") relative to
# the working directory, while this cell reads from '../data' — confirm that
# merged_orphanet_data.csv actually ends up in the directory used here.
data_processor = OrphanetDataProcessor(data_dir='../data')

# Load the merged Orphanet dataset produced by the preparation step
data_processor.load_data('merged_orphanet_data.csv')

# Display the first few rows of the loaded data
data_processor.data_df.head()

#### Now, let's parse the HPO associations, and then create the feature matrices.



In [None]:
# Parse HPO associations.
# The assignment must run in this cell: with it commented out, `hpo_df` only
# exists if it leaked from an earlier kernel session, and the preview below
# raises NameError after Restart & Run All.
hpo_df = data_processor.parse_hpo_associations()
hpo_df.head()

#### Now let's parse the disability associations, average age of onset, types of inheritance and prevalence data.


In [None]:
# Parse disability associations with the data processor and preview the first rows
disability_df = data_processor.parse_disability_associations()
disability_df.head()

In [None]:
# Parse the average age of onset with the data processor and preview the first rows
age_of_onset_df = data_processor.parse_average_age_of_onset()
age_of_onset_df.head()

In [None]:
# Parse the types of inheritance with the data processor and preview the first rows
inheritance_df = data_processor.parse_types_of_inheritance()
inheritance_df.head()



In [None]:
# Parse prevalence data.
# The assignment is executed here (it was previously commented out) so that
# `prevalence_df` is defined on a fresh kernel before it is previewed.
prevalence_df = data_processor.parse_prevalence_data()
prevalence_df.head()

#### We now have separate DataFrames for HPO associations, disability associations, age of onset, inheritance types, and prevalence.  The `prepare_data_for_ml` method in the `OrphanetDataProcessor` class will combine these into a single feature matrix (X) and target variable (y) suitable for machine learning. It handles the merging and any necessary filling of missing values.  It also includes an example of *prevalence weighting*.



In [None]:
# Get summary information.
# The assignment is executed here (it was previously commented out) so that
# `summary` is defined on a fresh kernel before it is previewed.
summary = data_processor.get_summary_information()
summary.head()

In [None]:
# # Create external reference features
# ext_ref_features = data_processor.create_external_ref_features()
# ext_ref_features.head()

In [None]:
# Create the HPO feature matrix.
# The assignment is executed here (it was previously commented out) so that
# `hpo_feature_matrix` is defined on a fresh kernel before it is previewed.
hpo_feature_matrix = data_processor.create_hpo_feature_matrix()
hpo_feature_matrix.head()

#### Finally, let's prepare the data for machine learning.  This combines the HPO and external reference features and creates the `X` (features) and `y` (target) variables.


In [None]:
# Prepare data for machine learning.
# `X` and `y` feed the feature-engineering and training cells below, so they
# must be created here; with the call commented out the notebook fails with
# NameError after Restart & Run All.
X, y = data_processor.prepare_data_for_ml()
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")
X.head()



#### Feature Engineering

#### The `create_feature_pipeline` method performs the following steps:
##### 1.  **Identifies Feature Types:** Determines which columns are numerical and categorical.
##### 2.  **Handles Missing Values:** Fills missing numerical values with 0 and categorical values with the most frequent value.
##### 3.  **Scales Features:** Standardizes numerical features using `StandardScaler`.
##### 4.  **Encodes Categorical Features:** Converts categorical features into numerical representations using `OneHotEncoder`.
##### 5.  **Selects Features:** Selects the top *k* features using mutual information (`mutual_info_classif`) or chi-squared (`chi2`).
##### 6.  **Reduces Dimensions (Optional):** Applies Principal Component Analysis (PCA) to reduce the dimensionality of the data.
####     Let's save the pipeline.

In [None]:
# Feature engineer constructed with '../models' as its models directory
feature_engineer = FeatureEngineer(models_dir="../models")

# Build the pipeline on (X, y): request 10 features, with PCA enabled and 5 components
X_transformed = feature_engineer.create_feature_pipeline(
    X,
    y,
    use_pca=True,
    n_features=10,
    n_components=5,
)

# Preview the transformed feature matrix
X_transformed.head()

In [None]:
# Save the pipeline created in the previous cell so it can be reused later
# (presumably under the models_dir given to FeatureEngineer — verify)
feature_engineer.save_pipeline()

####  Model Training


In [None]:
# Model trainer constructed with '../models' as its models directory
model_trainer = ModelTrainer(models_dir="../models")

# Run the training pipeline for a Random Forest, with Optuna optimization enabled
best_rf_model, best_rf_params = model_trainer.run_training_pipeline(
    X_transformed,
    y,
    model_selection="random_forest",
    use_optuna=True,
)
print(f"Best Random Forest Parameters: {best_rf_params}")

#### Let's train an XGBoost Model


In [None]:
# Run the same training pipeline for XGBoost, again with Optuna optimization enabled
best_xgb_model, best_xgb_params = model_trainer.run_training_pipeline(
    X_transformed,
    y,
    model_selection="xgboost",
    use_optuna=True,
)
print(f"Best XGBoost Parameters: {best_xgb_params}")

#### Making Predictions


In [None]:
# Initialize the predictor (loads the saved model and feature pipeline)
predictor = Predictor(model_filename='best_random_forest')  # Or best_xgboost, depending on which one you want to use

# Create one sample record (this should match the structure of your original data).
# Every column holds exactly one value. The HPO column is a single JSON string, so
# its two fragments are joined by implicit string concatenation; as two separate
# list items they made the column lengths unequal and pd.DataFrame raised ValueError.
sample_input = pd.DataFrame({
    'OrphaCode': [99999],  # Add OrphaCode. It is skipped in prepare_data_for_ml but is required here
    'HPODisorderAssociation_df2': [
        '[{"HPOId": "HP:0000256", "HPOTerm": "Macrocephaly", "HPOFrequency": "Very frequent (99-80%)"}, \n'
        ' {"HPOId": "HP:0001249", "HPOTerm": "Intellectual disability", "HPOFrequency": "Frequent (79-30%)"}]'
    ],
    'ExternalReferences_df1': ['{"ICD-10": "Q99"}'],
    'SummaryInformation_df1': ['{"Definition": "This is <i>test</i> definition 1"}'],
})

# We need to preprocess the sample input using the SAME data processor.
# The combined frame is temporarily installed as the processor's data so the
# parsers below see the sample row; previously the combined frame was built but
# never handed to the processor, so the "sample" prediction was made on the last
# real disorder. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): assumes the parse_* / prepare_data_for_ml methods read
# `data_processor.data_df` and keep row order — confirm against src/data_processor.py.
original_data_df = data_processor.data_df
try:
    data_processor.data_df = pd.concat([original_data_df, sample_input], ignore_index=True)
    hpo_df_sample = data_processor.parse_hpo_associations()
    ext_ref_df_sample = data_processor.parse_external_references()
    hpo_feature_matrix_sample = data_processor.create_hpo_feature_matrix(hpo_df=hpo_df_sample)
    ext_ref_features_sample = data_processor.create_external_ref_features(ext_ref_df=ext_ref_df_sample)
    X_sample, _ = data_processor.prepare_data_for_ml()  # We do not need the target variable
finally:
    # Restore the original data so re-running this cell does not stack sample rows
    data_processor.data_df = original_data_df
X_sample = X_sample.tail(1)  # Take only our sample (appended last)

# Make predictions
predictions = predictor.predict(X_sample)
print(f"Predictions: {predictions}")

In [None]:
# Get probabilities (if supported by the model) for the same sample row by
# passing return_probabilities=True to the predictor
probabilities = predictor.predict(X_sample, return_probabilities=True)
print(f"Probabilities: {probabilities}")

#### API Interaction (Optional)


#### To interact with the API, you would typically send a POST request to the `/predict` endpoint with the input data in JSON format.  Here's a *conceptual* example using the `requests` library (this won't run directly in the notebook without the API server running separately):


In [None]:
# %% [markdown]

# %% [markdown]
#
# ```python
# import requests
# import json
# import pandas as pd
#
# # API endpoint URL (replace with your actual URL if different)
# api_url = 'http://localhost:5000/predict'
#
# # Sample input data (same structure as in the prediction example above)
# input_data = {
#    'OrphaCode': 99999,
#     'HPODisorderAssociation_df2': [
#         '[{"HPOId": "HP:0000256", "HPOTerm": "Macrocephaly", "HPOFrequency": "Very frequent (99-80%)"}, \n'
#         ' {"HPOId": "HP:0001249", "HPOTerm": "Intellectual disability", "HPOFrequency": "Frequent (79-30%)"}]'
#     ],
#     'ExternalReferences_df1': ['{"ICD-10": "Q99"}']
# }
# input_data = pd.DataFrame(input_data)
#
# # Send the request
# response = requests.post(api_url, json=input_data.to_dict())
#
# # Check the response
# if response.status_code == 200:
#     result = response.json()
#     print(f"Prediction: {result['predictions']}")
# else:
#     print(f"Error: {response.status_code} - {response.text}")
# ```
#
# This code snippet is a *guide* for how you'd interact with the API. It shows the basic structure of sending a request and handling the response.  You'll need to adapt it to your specific needs (e.g., error handling, different input data formats).
# To run the API:
# 1. Navigate to the rare_disease_prediction directory in your terminal
# 2. Run `python api/app.py`