# In [5]:
import os
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Union, Optional
import logging
import warnings

# Configure logging
# Root logging setup: INFO level, with timestamp, logger name, level and
# message in every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the classes defined below.
logger = logging.getLogger(__name__)

class OrphanetDataProcessor:

    def __init__(self, data_dir: str = "../data"):
        self.data_dir = data_dir
        self.data_df = None
        self.hpo_features = None
        self.disability_features = None
        self.prevalence_features = None
        self.inheritance_features = None
        self.age_of_onset_features = None

        self.frequency_mapping = {
            "Very frequent (99-80%)": 0.9,
            "Frequent (79-30%)": 0.5,
            "Occasional (29-5%)": 0.15,
            "Very rare (<5%)": 0.025,
            "Excluded (0%)": 0.0,
            "": 0.0,  # Handle empty values
            "N/A": 0.0 # Handle N/A
        }

        # Create data directory if it doesn't exist
        os.makedirs(data_dir, exist_ok=True)

    def _safe_json_loads(self, json_string: str) -> Union[Dict, List, None]:
        """Safely loads potentially malformed JSON strings."""
        try:
            return json.loads(json_string)
        except Exception as e:  # Catch *any* exception during JSON parsing
            logger.warning(f"Invalid JSON encountered: {json_string}, error: {e}")
            return None

    def load_data(self, file_path: str) -> pd.DataFrame:
        """Loads data from a CSV file."""
        full_path = os.path.join(self.data_dir, file_path)
        logger.info(f"Loading data from {full_path}")
        try:
            self.data_df = pd.read_csv(full_path)
            #CRITICAL: Replace 'N/A' strings with actual NaN *before* doing anything
            self.data_df = self.data_df.replace('N/A', pd.NA)
            logger.info(f"Loaded {len(self.data_df)} records")
            return self.data_df
        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise

    def parse_hpo_associations(self) -> pd.DataFrame:
        """Parses HPO associations from the HPODisorderAssociation_df2 column."""
        if self.data_df is None:
            raise ValueError("Data not loaded. Call load_data() first.")

        logger.info("Parsing HPO associations")
        hpo_data = []

        for idx, row in self.data_df.iterrows():
            orpha_code = row['OrphaCode']
            disease_name = row['Name']
            # Use .get() with a default empty string
            hpo_list = self._safe_json_loads(row.get('HPODisorderAssociation_df2', '[]'))
            if hpo_list is None:
                continue

            for hpo_item in hpo_list:
                hpo_data.append({
                    'OrphaCode': orpha_code,
                    'DiseaseName': disease_name,
                    'HPOId': hpo_item.get('HPOId', ''),
                    'HPOTerm': hpo_item.get('HPOTerm', ''),
                    'HPOFrequency': hpo_item.get('HPOFrequency', ''),
                    'HPOFrequencyValue': self.frequency_mapping.get(hpo_item.get('HPOFrequency', ''), 0.0),
                    'DiagnosticCriteria': hpo_item.get('DiagnosticCriteria', '')
                })
        hpo_df = pd.DataFrame(hpo_data)
        logger.info(f"Extracted {len(hpo_df)} HPO associations")
        return hpo_df

    def create_hpo_feature_matrix(self, hpo_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """Creates a feature matrix where each row is a disease and columns are HPO terms."""
        if hpo_df is None:
            hpo_df = self.parse_hpo_associations()

        logger.info("Creating HPO feature matrix")

        # Ensure OrphaCode is string type BEFORE pivoting
        hpo_df['OrphaCode'] = hpo_df['OrphaCode'].astype(str)

        feature_matrix = hpo_df.pivot_table(
            index=['OrphaCode', 'DiseaseName'],
            columns='HPOId',
            values='HPOFrequencyValue',
            fill_value=0
        )
        feature_matrix = feature_matrix.reset_index()
        self.hpo_features = feature_matrix
        logger.info(f"Created feature matrix with {feature_matrix.shape[1]-2} HPO features")
        return feature_matrix

    def get_hpo_term_mapping(self, hpo_df: Optional[pd.DataFrame] = None) -> Dict[str, str]:
        """Creates a mapping from HPO IDs to HPO terms."""
        if hpo_df is None:
            hpo_df = self.parse_hpo_associations()
        hpo_mapping = hpo_df[['HPOId', 'HPOTerm']].drop_duplicates()
        hpo_dict = dict(zip(hpo_mapping['HPOId'], hpo_mapping['HPOTerm']))
        return hpo_dict

    def parse_disability_associations(self) -> pd.DataFrame:
        if self.data_df is None:
          raise ValueError("Data not loaded. Call load_data() first.")

        logger.info("Parsing disability associations")
        disability_data = []

        for idx, row in self.data_df.iterrows():
          orpha_code = row['OrphaCode']
          disease_name = row['Name']

          for col_num in range(3, 6):  # Columns df3, df4, and df5
              col_name = f'DisabilityDisorderAssociations_df{col_num}'
              # Use .get() to handle missing columns gracefully.
              disability_list = self._safe_json_loads(row.get(col_name, '[]'))
              if disability_list is None:
                  continue

              for disability_item in disability_list:
                  disability_data.append({
                      'OrphaCode': orpha_code,
                      'DiseaseName': disease_name,
                      'Disability': disability_item.get('Disability', ''),
                      'FrequencyDisability': disability_item.get('FrequencyDisability', ''),
                      'FrequencyDisabilityValue': self.frequency_mapping.get(disability_item.get('FrequencyDisability', ''), 0.0),
                      'TemporalityDisability': disability_item.get('TemporalityDisability', ''),
                      'SeverityDisability': disability_item.get('SeverityDisability', ''),
                      'LossOfAbility': disability_item.get('LossOfAbility', ''),
                      'TypeDisability': disability_item.get('TypeDisability', ''),
                      'Defined': disability_item.get('Defined', '')
                    })
        disability_df = pd.DataFrame(disability_data)
        logger.info(f"Extracted {len(disability_df)} disability associations")
        return disability_df
    def parse_average_age_of_onset(self) -> pd.DataFrame:
        if self.data_df is None:
          raise ValueError("Data not loaded.  Call load_data() first.")
        logger.info("Parsing average age of onset")
        age_of_onset_data = []

        for idx, row in self.data_df.iterrows():
            orpha_code = row['OrphaCode']
            disease_name = row['Name']
            for col_num in range(4, 6):  # Columns df4 and df5
                col_name = f'AverageAgesOfOnset_df{col_num}'
                # Use .get() to handle missing columns gracefully
                age_of_onset_list = self._safe_json_loads(row.get(col_name, '[]'))
                if age_of_onset_list is None:
                    continue

                for age_item in age_of_onset_list:
                    age_of_onset_data.append({
                        'OrphaCode': orpha_code,
                        'DiseaseName': disease_name,
                        'AverageAgeOfOnset': age_item.get('AverageAgeOfOnset', '')
                    })

        age_of_onset_df = pd.DataFrame(age_of_onset_data)
        logger.info(f"Extracted {len(age_of_onset_df)} age of onset entries")
        return age_of_onset_df

    def parse_types_of_inheritance(self) -> pd.DataFrame:
        if self.data_df is None:
          raise ValueError("Data not loaded.  Call load_data() first.")

        logger.info("Parsing types of inheritance")
        inheritance_data = []

        for idx, row in self.data_df.iterrows():
            orpha_code = row['OrphaCode']
            disease_name = row['Name']
            for col_num in range(4, 6):  # Columns df4 and df5
                col_name = f'TypesOfInheritance_df{col_num}'
                # Use .get() to handle missing columns gracefully.
                inheritance_list = self._safe_json_loads(row.get(col_name, '[]'))
                if inheritance_list is None:
                    continue

                for inheritance_item in inheritance_list:
                    inheritance_data.append({
                        'OrphaCode': orpha_code,
                        'DiseaseName': disease_name,
                        'TypeOfInheritance': inheritance_item.get('TypeOfInheritance', '')
                    })
        inheritance_df = pd.DataFrame(inheritance_data)
        logger.info(f"Extracted {len(inheritance_df)} inheritance entries")
        return inheritance_df
      
    def parse_prevalence_data(self) -> pd.DataFrame:
        """Parses prevalence data from the PrevalenceData_df5 column."""
        if self.data_df is None:
            raise ValueError("Data not loaded. Call load_data() first.")

        logger.info("Parsing prevalence data")
        prevalence_data = []

        for idx, row in self.data_df.iterrows():
            orpha_code = row['OrphaCode']
            disease_name = row['Name']
            # Use .get() with a default empty string to handle missing columns
            prevalence_list = self._safe_json_loads(row.get('PrevalenceData_df5', '[]'))
            if prevalence_list is None:
                continue

            for prev_item in prevalence_list:
                prevalence_data.append({
                    'OrphaCode': orpha_code,
                    'DiseaseName': disease_name,
                    'PrevalenceType': prev_item.get('PrevalenceType', ''),
                    'PrevalenceQualification': prev_item.get('PrevalenceQualification', ''),
                    'PrevalenceClass': prev_item.get('PrevalenceClass', ''),
                    'ValMoy': float(prev_item.get('ValMoy', 0.0)),  # Convert to float
                    'PrevalenceGeographic': prev_item.get('PrevalenceGeographic', ''),
                    'PrevalenceValidationStatus': prev_item.get('PrevalenceValidationStatus', '')
                })

        prevalence_df = pd.DataFrame(prevalence_data)
        logger.info(f"Extracted {len(prevalence_df)} prevalence entries")
        return prevalence_df

    def get_summary_information(self) -> pd.DataFrame:
      if self.data_df is None:
          raise ValueError("Data not loaded. Call load_data() first.")

      logger.info("Extracting summary information")
      summary_data = []
      for idx, row in self.data_df.iterrows():
        orpha_code = row['OrphaCode']
        disease_name = row['Name']
        #Use .get to handle potential missing
        summary_info = self._safe_json_loads(row.get('SummaryInformation_df1','{}'))
        if summary_info is None:
          continue

        definition = summary_info.get('Definition', '')
        definition = definition.replace('<i>', '').replace('</i>', '')  # Clean HTML

        summary_data.append({
            'OrphaCode': orpha_code,
            'DiseaseName': disease_name,
            'Definition': definition
        })

      summary_df = pd.DataFrame(summary_data)
      logger.info(f"Extracted {len(summary_df)} disease summaries")
      return summary_df

    def _extract_prevalence_value(self, prevalence_class: str) -> float:
      """Extracts a numerical prevalence value from the PrevalenceClass string."""
      #Handles prevalence strings, like '<1 / 1 000 000'  or '1-9 / 10 000'
      if prevalence_class == 'Unknown' or  prevalence_class == 'N/A':
        return 0.0
      parts = prevalence_class.split('/')
      if len(parts) != 2:
        logger.warning(f"Could not parse prevalence string: {prevalence_class}") #LOG THE STRING!
        return 0.0  # Handle malformed strings
      try:
        numerator_str = parts[0].strip()
        denominator_str = parts[1].strip().replace(' ', '') # Remove spaces in denominator

        # Handle ranges in numerator
        if '-' in numerator_str:
            numerator_range = [float(x) for x in numerator_str.split('-')]
            numerator = sum(numerator_range) / len(numerator_range) # Average the range
        else:
            numerator = float(numerator_str)

        denominator = float(denominator_str)
        return numerator/denominator
      except ValueError:
        logger.warning(f'Couldnt extract a value from: {prevalence_class}')
        return 0.0 # Handle parsing errors

    def prepare_data_for_ml(self) -> Tuple[pd.DataFrame, pd.Series]:
      """Prepares the final dataset for machine learning."""
      logger.info("Preparing final dataset for machine learning")

      # Get HPO features
      if self.hpo_features is None:
          self.create_hpo_feature_matrix()

      # Create disability features (pivot as needed)
      disability_df = self.parse_disability_associations()
      if not disability_df.empty:
          # Ensure OrphaCode is string type BEFORE pivoting
          disability_df['OrphaCode'] = disability_df['OrphaCode'].astype(str)
          self.disability_features = disability_df.pivot_table(
              index='OrphaCode',
              columns='Disability',
              values='FrequencyDisabilityValue',
              fill_value=0
          ).reset_index()

      # Create age of onset features (pivot as needed)
      age_of_onset_df = self.parse_average_age_of_onset()
      if not age_of_onset_df.empty:
          # Ensure OrphaCode is string type BEFORE pivoting
          age_of_onset_df['OrphaCode'] = age_of_onset_df['OrphaCode'].astype(str)
          self.age_of_onset_features = age_of_onset_df.pivot_table(
              index='OrphaCode',
              columns='AverageAgeOfOnset',
              values = 'DiseaseName', #dummy value
              aggfunc = 'first', #Presence/absence
              fill_value=0
            ).reset_index()


      # Create inheritance features
      inheritance_df = self.parse_types_of_inheritance()
      if not inheritance_df.empty:
          # Ensure OrphaCode is string type BEFORE pivoting
          inheritance_df['OrphaCode'] = inheritance_df['OrphaCode'].astype(str)
          self.inheritance_features = inheritance_df.pivot_table(
              index='OrphaCode',
              columns='TypeOfInheritance',
              values = 'DiseaseName', #dummy value
              aggfunc = 'first',
              fill_value = 0
          ).reset_index()

      # Create prevalence features - keep raw values and class.
      prevalence_df = self.parse_prevalence_data()
      if not prevalence_df.empty:
          # Ensure OrphaCode is string type BEFORE grouping
          prevalence_df['OrphaCode'] = prevalence_df['OrphaCode'].astype(str)
          # Numerical prevalence.
          prevalence_df['PrevalenceValue'] = prevalence_df['PrevalenceClass'].apply(self._extract_prevalence_value)
          self.prevalence_features = prevalence_df.groupby('OrphaCode')['PrevalenceValue'].mean().reset_index() # Use mean for now

      # --- Merge features, with explicit type handling ---
      features = self.hpo_features
      # Ensure OrphaCode is consistent type (string) BEFORE merging
      features['OrphaCode'] = features['OrphaCode'].astype(str)


      if self.disability_features is not None:
        features = features.merge(self.disability_features, on='OrphaCode', how='left')

      if self.age_of_onset_features is not None:
        features = features.merge(self.age_of_onset_features, on='OrphaCode', how='left')

      if self.inheritance_features is not None:
          features = features.merge(self.inheritance_features, on='OrphaCode', how='left')

      if self.prevalence_features is not None:
          features = features.merge(self.prevalence_features, on='OrphaCode', how='left')


      # Extract target and features
      y = features['OrphaCode']
      X = features.drop(['OrphaCode', 'DiseaseName'], axis=1, errors='ignore')

      # Fill missing values (left after merging)
      X = X.fillna(0)

      # --- TYPE ENFORCEMENT (BEFORE SCALING/ENCODING) ---
      # Identify likely categorical columns (after merging)
      likely_categorical = []
      for col in X.columns:
          if X[col].dtype == 'object':  # If it's already object, it's categorical
              likely_categorical.append(col)
          elif X[col].nunique() < 20: #If has few values, probably a category
              likely_categorical.append(col)


      # Enforce string type on these columns
      for col in likely_categorical:
            X[col] = X[col].astype(str)

      # -----------------------------------------------------

      # Apply prevalence weighting (example - adjust as needed)
      if 'PrevalenceValue' in X.columns:
          # Scale prevalence to avoid overwhelming other features
          #  Here, we're simply taking the log, but you might want something else.
          X['PrevalenceValue'] = np.log1p(X['PrevalenceValue'])
      logger.info(f"Final dataset: {X.shape[0]} samples, {X.shape[1]} features")

      return X, y

    def save_processed_data(self, output_path: str) -> None:
        """Save processed data to CSV file."""
        if self.hpo_features is None:
            raise ValueError("No processed data available.  Process data first.")

        full_path = os.path.join(self.data_dir, output_path)
        logger.info(f"Saving processed data to {full_path}")

        X, y = self.prepare_data_for_ml()
        output_df = X.copy()
        output_df['OrphaCode'] = y  # Add target back
        output_df.to_csv(full_path, index=False)
        logger.info(f"Saved processed data with {output_df.shape[1]} columns")
        
        
        
"""
Feature Engineering Module for Rare Disease Prediction System.

This module handles feature transformation, selection, and encoding for
the rare disease prediction models.
"""

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from typing import Tuple, List, Dict, Union, Optional
import logging
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib
import os

# Configure logging
# NOTE(review): logging is already configured with the same level and format
# near the top of this file; this repeated block looks like a leftover from
# joining two modules/cells and is presumably redundant — confirm before
# removing.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class FeatureEngineer:
    """
    Class for advanced feature engineering operations on Orphanet data.

    This class handles feature scaling, selection, dimensionality reduction,
    and encoding for improving model performance.
    """

    def __init__(self, models_dir: str = "../models"):
        """
        Initialize the FeatureEngineer.

        Args:
            models_dir: Directory where preprocessing models will be stored
        """
        self.models_dir = models_dir
        self.scaler = None
        self.feature_selector = None
        self.pca = None
        self.encoders = {}
        self.numerical_columns = []
        self.categorical_columns = []
        self.selected_features = []

        # Create models directory if it doesn't exist
        os.makedirs(models_dir, exist_ok=True)

    def identify_feature_types(self, X: pd.DataFrame) -> Tuple[List[str], List[str]]:
        """
        Identify numerical and categorical features in the dataset.

        Args:
            X: Feature DataFrame

        Returns:
            Tuple of (numerical column names, categorical column names)
        """
        logger.info("Identifying feature types")

        numerical_cols = []
        categorical_cols = []

        for col in X.columns:
            # Check if the column has numeric data
            if X[col].dtype in ['int64', 'float64']:
                numerical_cols.append(col)
            else:
                categorical_cols.append(col)

        self.numerical_columns = numerical_cols
        self.categorical_columns = categorical_cols

        logger.info(f"Identified {len(numerical_cols)} numerical and {len(categorical_cols)} categorical features")
        return numerical_cols, categorical_cols

    def handle_missing_values(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Handle missing values in the dataset.

        Args:
            X: Feature DataFrame

        Returns:
            DataFrame with missing values handled
        """
        logger.info("Handling missing values")

        # For HPO features (numerical), fill NaN with 0 (absence of symptom)
        # For categorical features, fill with most frequent value

        # Identify feature types if not already done
        if not self.numerical_columns and not self.categorical_columns:
            self.identify_feature_types(X)

        # Handle numerical features
        X_num = X[self.numerical_columns].fillna(0)

        # Handle categorical features
        X_cat = X[self.categorical_columns].copy()
        if not X_cat.empty:
            for col in X_cat.columns:
                X_cat[col] = X_cat[col].fillna(X_cat[col].mode()[0] if not X_cat[col].mode().empty else "UNKNOWN")

        # Combine processed features
        X_processed = pd.concat([X_num, X_cat], axis=1)

        logger.info(f"Handled missing values in {X.shape[1]} features")
        return X_processed


    def scale_features(self, X: pd.DataFrame, fit: bool = True) -> pd.DataFrame:
        """
        Scale numerical features.

        Args:
            X: Feature DataFrame
            fit: Whether to fit the scaler on this data

        Returns:
            DataFrame with scaled features
        """
        logger.info("Scaling numerical features")

        # Identify feature types if not already done
        if not self.numerical_columns and not self.categorical_columns:
            self.identify_feature_types(X)

        # --- ADDED CHECK FOR EMPTY NUMERICAL COLUMNS ---
        if not self.numerical_columns:
            logger.info("No numerical features to scale. Returning original DataFrame.")
            return X.copy()  # Return the input unchanged (but as a copy!)
        # -----------------------------------------------

        # Get numerical columns
        X_num = X[self.numerical_columns].copy()

        # Create and fit scaler if needed
        if fit or self.scaler is None:
            self.scaler = StandardScaler()
            X_num_scaled = pd.DataFrame(
                self.scaler.fit_transform(X_num),
                columns=X_num.columns,
                index=X_num.index
            )
        else:
            X_num_scaled = pd.DataFrame(
                self.scaler.transform(X_num),
                columns=X_num.columns,
                index=X_num.index
            )

        # Combine with categorical features
        X_cat = X[self.categorical_columns].copy() if self.categorical_columns else pd.DataFrame(index=X.index)
        X_scaled = pd.concat([X_num_scaled, X_cat], axis=1)

        logger.info(f"Scaled {len(self.numerical_columns)} numerical features")
        return X_scaled

    def encode_categorical_features(self, X: pd.DataFrame, fit: bool = True) -> pd.DataFrame:
        """
        Encode categorical features using one-hot encoding.

        Args:
            X: Feature DataFrame
            fit: Whether to fit the encoders on this data

        Returns:
            DataFrame with encoded features
        """
        logger.info("Encoding categorical features")

        # Identify feature types if not already done
        if not self.numerical_columns and not self.categorical_columns:
            self.identify_feature_types(X)

        # No categorical features to encode
        if not self.categorical_columns:
            logger.info("No categorical features to encode")
            return X

        # Get numerical features
        X_num = X[self.numerical_columns].copy()

        # Initialize list to store encoded DataFrames
        encoded_dfs = [X_num]

        # Encode each categorical feature
        for col in self.categorical_columns:
            # Handle potential NaN values
            X[col] = X[col].fillna('UNKNOWN')

            # Create encoder if needed
            if fit or col not in self.encoders:
                encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
                encoded_data = encoder.fit_transform(X[[col]])
                self.encoders[col] = encoder
            else:
                encoder = self.encoders[col]
                encoded_data = encoder.transform(X[[col]])

            # Create DataFrame with encoded data
            encoded_df = pd.DataFrame(
                encoded_data,
                columns=[f"{col}_{val}" for val in encoder.categories_[0]],
                index=X.index
            )

            # Add to list of encoded DataFrames
            encoded_dfs.append(encoded_df)

        # Combine all encoded features
        X_encoded = pd.concat(encoded_dfs, axis=1)

        logger.info(f"Encoded {len(self.categorical_columns)} categorical features")
        return X_encoded

    def select_features(self, X: pd.DataFrame, y: pd.Series, n_features: int = 100, method: str = 'mi', fit: bool = True) -> pd.DataFrame:
        """
        Select top features using statistical methods.

        Args:
            X: Feature DataFrame
            y: Target variable
            n_features: Number of features to select
            method: Feature selection method ('mi' for mutual info, 'chi2' for chi-squared)
            fit: Whether to fit the selector on this data

        Returns:
            DataFrame with selected features
        """
        logger.info(f"Selecting top {n_features} features using {method}")

        # Adjust n_features if it exceeds available features
        n_features = min(n_features, X.shape[1])

    def select_features(self, X: pd.DataFrame, y: pd.Series, n_features: int = 100, method: str = 'mi', fit: bool = True) -> pd.DataFrame:
        """
        Select top features using statistical methods.

        Args:
            X: Feature DataFrame
            y: Target variable
            n_features: Number of features to select
            method: Feature selection method ('mi' for mutual info, 'chi2' for chi-squared)
            fit: Whether to fit the selector on this data

        Returns:
            DataFrame with selected features
        """
        logger.info(f"Selecting top {n_features} features using {method}")

        # Adjust n_features if it exceeds available features
        n_features = min(n_features, X.shape[1])

        # --- EMPTY DATAFRAME CHECK ---
        if X.empty or X.shape[0] == 0 or X.shape[1] == 0:  # Check for empty DataFrame OR if 0 columns
            logger.warning("Empty DataFrame passed to select_features or DataFrame with 0 columns. Returning empty DataFrame with n_features columns.")
            return pd.DataFrame(columns=[f"feature_{i}" for i in range(n_features)])  # Return empty DataFrame
        # -----------------------------


        # Create feature selector if needed
        if fit or self.feature_selector is None:
            if method == 'mi':
                 # --- FORCE CONTINUOUS FEATURES ---
                self.feature_selector = SelectKBest(lambda X, y: mutual_info_classif(X, y, discrete_features=False), k=n_features)
                # ------------------------------------
            elif method == 'chi2':
                # Ensure all features are non-negative for chi2
                X_non_neg = X.copy()
                for col in X.columns:
                    if X_non_neg[col].min() < 0:
                        X_non_neg[col] = X_non_neg[col] - X_non_neg[col].min()
                self.feature_selector = SelectKBest(chi2, k=n_features)
                X = X_non_neg  # Use the non-negative version for chi2
            else:
                raise ValueError(f"Unknown feature selection method: {method}")

            # Fit and transform
            # Check if y has samples. If not, return empty DataFrame with appropriate columns.
            if len(y) == 0:
                return pd.DataFrame(columns=[f"feature_{i}" for i in range(n_features)]) # Return DF with correct number of columns
            X_selected = self.feature_selector.fit_transform(X, y)

            # Get selected feature names
            selected_mask = self.feature_selector.get_support()
            self.selected_features = X.columns[selected_mask].tolist()
        else:
            # Transform using existing selector
            X_selected = self.feature_selector.transform(X)

        # Convert to DataFrame
        X_selected_df = pd.DataFrame(
            X_selected,
            columns=self.selected_features,
            index=X.index  # Keep the original index
        )

        logger.info(f"Selected {len(self.selected_features)} features")
        return X_selected_df

    def reduce_dimensions(self, X: pd.DataFrame, n_components: int = 50, fit: bool = True) -> pd.DataFrame:
        """
        Reduce dimensions using PCA.

        Args:
            X: Feature DataFrame
            n_components: Number of PCA components
            fit: Whether to fit PCA on this data

        Returns:
            DataFrame with reduced dimensions
        """
        logger.info(f"Reducing dimensions to {n_components} components")

        # --- ADDED EMPTY/SINGLE-COLUMN CHECK ---
        if X.empty or X.shape[1] <= 1:  # Check for empty or single-column DataFrame
            logger.warning("Empty DataFrame or only one column in reduce_dimensions. Returning original DataFrame (no PCA).")
            return X.copy()  # Return a copy to avoid modifying the original
        # ---------------------------------------

        # Adjust n_components if it exceeds available features
        n_components = min(n_components, X.shape[1])

        # Create and fit PCA if needed
        if fit or self.pca is None:
            self.pca = PCA(n_components=n_components)
            X_pca = self.pca.fit_transform(X)
        else:
            X_pca = self.pca.transform(X)

        # Convert to DataFrame
        X_pca_df = pd.DataFrame(
            X_pca,
            columns=[f"PC{i+1}" for i in range(n_components)],
            index=X.index
        )

        # Log explained variance
        if fit or self.pca is None:
            explained_var = self.pca.explained_variance_ratio_.sum()
            logger.info(f"PCA explains {explained_var:.2%} of variance with {n_components} components")

        return X_pca_df

    def create_feature_pipeline(self, X: pd.DataFrame, y: pd.Series, use_pca: bool = True,
                              n_features: int = 100, n_components: int = 50) -> pd.DataFrame:
        """
        Fit the full feature engineering pipeline and apply it to the data.

        The steps run in this order: feature-type identification, missing-value
        handling, scaling, categorical encoding, feature selection and,
        optionally, PCA. Every fittable step is invoked with ``fit=True``.

        Args:
            X: Raw feature DataFrame
            y: Target variable (forwarded to the feature selection step)
            use_pca: Whether to use PCA dimensionality reduction
            n_features: Number of features to select
            n_components: Number of PCA components (if use_pca is True). A value
                larger than the selected data supports is reduced, with a warning.

        Returns:
            Fully processed feature DataFrame. An empty DataFrame is returned
            when no rows remain after encoding.
        """
        logger.info("Applying full feature engineering pipeline")

        # Identify feature types
        self.identify_feature_types(X)

        # Handle missing values, scale, then encode (all fitted on this data)
        X_processed = self.handle_missing_values(X)
        X_scaled = self.scale_features(X_processed, fit=True)
        X_encoded = self.encode_categorical_features(X_scaled, fit=True)

        # Nothing can be fitted on zero rows, so stop before selection/PCA.
        if X_encoded.shape[0] == 0:
            logger.warning("Empty DataFrame after encoding. Returning empty DataFrame.")
            return pd.DataFrame()

        # Select features
        X_selected = self.select_features(X_encoded, y, n_features=n_features, fit=True)

        if use_pca:
            # PCA cannot produce more components than min(n_samples, n_features);
            # clamp here so small datasets do not fail inside the PCA fit.
            max_components = min(X_selected.shape)
            if n_components > max_components:
                logger.warning(
                    f"Requested {n_components} PCA components but data of shape "
                    f"{X_selected.shape} supports at most {max_components}; "
                    f"using {max_components}."
                )
                n_components = max_components
            X_final = self.reduce_dimensions(X_selected, n_components=n_components, fit=True)
        else:
            # Drop any PCA left over from an earlier fit or load: transform_features()
            # applies self.pca whenever it is not None, which would not match a
            # pipeline fitted without PCA.
            self.pca = None
            X_final = X_selected

        logger.info(f"Feature pipeline complete: {X.shape} -> {X_final.shape}")
        return X_final

    def transform_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform new data using the existing, already fitted pipeline.

        Applies missing-value handling, scaling and encoding with ``fit=False``,
        restricts the result to the selected features (when a feature selector
        is present) and finally applies PCA (when one is present).

        Args:
            X: Raw feature DataFrame

        Returns:
            Processed feature DataFrame

        Raises:
            ValueError: If no scaler is available, i.e. the pipeline has not
                been fitted or loaded yet.
        """
        if self.scaler is None:
            raise ValueError("Feature pipeline not initialized. Call create_feature_pipeline() first.")

        # Handle missing values, scale and encode without refitting anything
        X_processed = self.handle_missing_values(X)
        X_scaled = self.scale_features(X_processed, fit=False)
        X_encoded = self.encode_categorical_features(X_scaled, fit=False)

        # Select features
        if self.feature_selector is not None:
            absent = [col for col in self.selected_features if col not in X_encoded.columns]
            if absent:
                logger.debug(f"Filling {len(absent)} selected feature(s) missing from input with 0")
            # reindex returns a new frame holding exactly the selected columns in
            # their stored order; columns missing from the input are created with 0.
            # Unlike column-by-column assignment this never modifies X_encoded,
            # which may be the very object the caller passed in.
            X_selected = X_encoded.reindex(columns=self.selected_features, fill_value=0)
        else:
            X_selected = X_encoded

        # Reduce dimensions if PCA was used
        if self.pca is None:
            return X_selected

        # Pass the fitted component count so the output column labels match what
        # the fitted PCA actually produces, instead of reduce_dimensions' default.
        fitted_components = getattr(self.pca, "n_components_", None)
        if fitted_components is None:
            return self.reduce_dimensions(X_selected, fit=False)
        return self.reduce_dimensions(X_selected, n_components=fitted_components, fit=False)

    def save_pipeline(self, filename: str = "feature_pipeline") -> None:
        """
        Persist the feature engineering pipeline components to ``self.models_dir``.

        Each component is dumped to its own file named
        ``{filename}_{component}.joblib``.

        Args:
            filename: Base filename for saved objects
        """
        logger.info(f"Saving feature engineering pipeline to {self.models_dir}")

        component_names = (
            'scaler',
            'feature_selector',
            'pca',
            'encoders',
            'selected_features',
            'numerical_columns',
            'categorical_columns',
        )

        # Read every attribute before writing anything, so a missing attribute
        # fails before any file has been produced.
        snapshot = [(name, getattr(self, name)) for name in component_names]

        # One joblib file per component
        for name, component in snapshot:
            target_path = os.path.join(self.models_dir, f"{filename}_{name}.joblib")
            joblib.dump(component, target_path)

        logger.info("Feature engineering pipeline saved successfully.")


    def load_pipeline(self, filename: str = "feature_pipeline") -> None:
        """
        Load the feature engineering pipeline objects from ``self.models_dir``.

        Each component is read from ``{filename}_{component}.joblib``. The
        instance is only updated once every component has been loaded, so a
        missing file never leaves it with a mix of old and new components.

        Args:
            filename: Base filename for saved objects.

        Raises:
            FileNotFoundError: If any component file is missing (logged, then
                re-raised).
        """
        logger.info(f"Loading feature engineering pipeline from {self.models_dir}")

        component_names = (
            'scaler',
            'feature_selector',
            'pca',
            'encoders',
            'selected_features',
            'numerical_columns',
            'categorical_columns',
        )

        # NOTE: joblib.load unpickles its input; only point models_dir at
        # locations whose contents are trusted.
        loaded = {}
        try:
            for name in component_names:
                source_path = os.path.join(self.models_dir, f"{filename}_{name}.joblib")
                loaded[name] = joblib.load(source_path)
        except FileNotFoundError as e:
            logger.error(f"Error loading pipeline: {e}.  Make sure the pipeline has been saved.")
            raise  # Re-raise the exception to halt execution if critical components are missing.

        # Every file was read successfully; swap the components in together.
        for name, component in loaded.items():
            setattr(self, name, component)

        logger.info("Feature engineering pipeline loaded successfully.")

            
            
  

# if __name__ == "__main__":
   

            


In [6]:
try:
    # Stage 1: point the processor at ../data and read the debug sample CSV
    data_processor = OrphanetDataProcessor(data_dir='../data')
    data_processor.load_data("sample_debug_orphanet_data.csv")

    # Stage 2: build the feature matrix X and target y (parsing happens inside)
    X, y = data_processor.prepare_data_for_ml()

    # The target is cast to str before any feature engineering runs
    y = y.astype(str)

    # Stage 3: fit the feature engineering pipeline; models go to the working dir
    feature_engineer = FeatureEngineer(models_dir="./")
    X_transformed = feature_engineer.create_feature_pipeline(
        X,
        y,
        use_pca=True,
        n_features=10,
        n_components=5,
    )

except Exception as e:
    # Debugging aid: print the stack, stop in the debugger, then echo the error
    import traceback
    traceback.print_exc()
    breakpoint() # !important
    print(e)

2025-02-26 19:45:15,640 - __main__ - INFO - Loading data from ../data/sample_debug_orphanet_data.csv
2025-02-26 19:45:15,643 - __main__ - INFO - Loaded 2 records
2025-02-26 19:45:15,643 - __main__ - INFO - Preparing final dataset for machine learning
2025-02-26 19:45:15,643 - __main__ - INFO - Parsing HPO associations
2025-02-26 19:45:15,644 - __main__ - INFO - Extracted 40 HPO associations
2025-02-26 19:45:15,644 - __main__ - INFO - Creating HPO feature matrix
2025-02-26 19:45:15,648 - __main__ - INFO - Created feature matrix with 36 HPO features
2025-02-26 19:45:15,648 - __main__ - INFO - Parsing disability associations
2025-02-26 19:45:15,650 - __main__ - INFO - Extracted 0 disability associations
2025-02-26 19:45:15,650 - __main__ - INFO - Parsing average age of onset
2025-02-26 19:45:15,651 - __main__ - INFO - Extracted 0 age of onset entries
2025-02-26 19:45:15,651 - __main__ - INFO - Parsing types of inheritance
2025-02-26 19:45:15,652 - __main__ - INFO - Extracted 0 inheritance

Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
Traceback (most recent call last):
  File "/tmp/ipykernel_2370256/3052700220.py", line 15, in <module>
    X_transformed = feature_engineer.create_feature_pipeline(X, y, use_pca=True, n_features=10, n_components=5)
  File "/tmp/ipykernel_2370256/1379365356.py", line 804, in create_feature_pipeline
    X_selected = self.select_features(X_encoded, y, n_features=n_features, fit=True)
  File "/tmp/ipykernel_2370256/1379365356.py", line 705, in select_features
    X_selected = self.feature_selector.fit_transform(X, y)
  File "/home/anat/.local/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 142, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/anat/.local/lib/python3.10/site-packages/sklearn/u

name 'OrphanetDataProcessor' is not defined


Traceback (most recent call last):
  File "/tmp/ipykernel_2353377/3052700220.py", line 3, in <module>
    data_processor = OrphanetDataProcessor(data_dir='../data')  # Use local directory
NameError: name 'OrphanetDataProcessor' is not defined
