##### This code checks the DrugBank database to see which of the 100 selected drugs are available, and then saves the available drugs as [DrugBank Data](drug_validation_results.json)

#### The main DrugBank dataset is not included due to license restrictions, but it can be obtained from https://go.drugbank.com/

In [None]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from typing import Dict, List, Tuple, Optional
import json

class DrugValidationFramework:
    def __init__(self):
        # Drug name mappings for known alternatives
        self.drug_aliases = {
            "Albuterol": ["Salbutamol"],
            "Trimethoprim-sulfamethoxazole": ["Bactrim", "Co-trimoxazole", "TMP-SMX", "SXT,", "TMP-SMZ", " TMP-Sulfa"],
            "Mycophenolate": ["Mycophenolic acid", "Mycophenolate mofetil"],
        }
        
        # Organized dictionary of 100 selected drugs for validation
        self.drug_categories = {
            "Cardiovascular": {
                "Antihypertensives": ["Lisinopril", "Amlodipine", "Metoprolol", "Valsartan", "Hydrochlorothiazide"],
                "Anticoagulants": ["Warfarin", "Apixaban", "Clopidogrel", "Rivaroxaban", "Dabigatran"],
                "Statins": ["Atorvastatin", "Simvastatin", "Rosuvastatin", "Pravastatin", "Pitavastatin"],
                "Heart_Rhythm": ["Amiodarone", "Digoxin", "Diltiazem", "Verapamil", "Spironolactone"]
            },
            "CNS_Agents": {
                "Antidepressants": ["Sertraline", "Fluoxetine", "Venlafaxine", "Duloxetine", "Bupropion", "Mirtazapine"],
                "Antipsychotics": ["Quetiapine", "Risperidone", "Olanzapine", "Haloperidol", "Clozapine"],
                "Anxiolytics": ["Alprazolam", "Diazepam", "Gabapentin", "Pregabalin"]
            },
            "Anti_Infectives": {
                "Antibiotics": ["Ciprofloxacin", "Azithromycin", "Clarithromycin", "Rifampin", "Doxycycline", 
                              "Amoxicillin", "Metronidazole", "Trimethoprim-sulfamethoxazole"],
                "Antifungals": ["Fluconazole", "Itraconazole", "Voriconazole", "Posaconazole"],
                "Antivirals": ["Acyclovir", "Valacyclovir", "Oseltamivir"]
            },
            "Diabetes_Metabolic": {
                "All": ["Metformin", "Glipizide", "Sitagliptin", "Empagliflozin", "Liraglutide",
                       "Insulin glargine", "Insulin lispro", "Levothyroxine", "Methimazole", "Pioglitazone"]
            },
            "Pain_Analgesics": {
                "All": ["Morphine", "Oxycodone", "Tramadol", "Hydromorphone", "Fentanyl",
                       "Ibuprofen", "Celecoxib", "Meloxicam", "Acetaminophen", "Pregabalin"]
            },
            "Gastrointestinal": {
                "All": ["Omeprazole", "Pantoprazole", "Esomeprazole", "Famotidine", "Ondansetron",
                       "Metoclopramide", "Dicyclomine", "Sulfasalazine", "Mesalamine", "Prucalopride"]
            },
            "Respiratory": {
                "All": ["Montelukast", "Fluticasone", "Budesonide", "Salbutamol", "Tiotropium",
                       "Salmeterol", "Ipratropium", "Zafirlukast", "Theophylline", "Roflumilast"]
            },
            "Immunosuppressants": {
                "All": ["Tacrolimus", "Cyclosporine", "Mycophenolate", "Azathioprine", "Sirolimus"]
            },
            "Oncology": {
                "All": ["Methotrexate", "Cyclophosphamide", "Tamoxifen", "Letrozole", "Capecitabine"]
            }
        }

    def load_drugbank_data(self, filepath: str) -> pd.DataFrame:
        """
        Load and preprocess DrugBank data
        
        Args:
            filepath (str): Path to DrugBank CSV file
            
        Returns:
            pd.DataFrame: Preprocessed DrugBank data with NaN values handled
        """
        try:
            df = pd.read_csv(filepath)
            # Handle NaN values to ensure clean JSON output
            df = df.fillna('')
            print(f"Successfully loaded DrugBank data with {len(df)} entries")
            return df
        except Exception as e:
            print(f"Error loading DrugBank data: {str(e)}")
            return None

    def find_drug_in_drugbank(self, drug_name: str, drugbank_df: pd.DataFrame, threshold: int = 85) -> Tuple[Optional[pd.Series], float]:
        """
        Find drug in DrugBank using exact matching, aliases, and fuzzy matching
        
        Args:
            drug_name (str): Name of drug to search for
            drugbank_df (pd.DataFrame): DrugBank database
            threshold (int): Minimum similarity score (0-100) for fuzzy matching
            
        Returns:
            Tuple containing:
            - pd.Series: Best matching drug data (or None if no match)
            - float: Match confidence score
        """
        # Step 1: Try exact match
        exact_match = drugbank_df[drugbank_df['name'].str.lower() == drug_name.lower()]
        if not exact_match.empty:
            return exact_match.iloc[0], 100
            
        # Step 2: Check aliases for known alternative names
        if drug_name in self.drug_aliases:
            for alias in self.drug_aliases[drug_name]:
                alias_match = drugbank_df[drugbank_df['name'].str.lower() == alias.lower()]
                if not alias_match.empty:
                    return alias_match.iloc[0], 100

        # Step 3: Try fuzzy matching as fallback
        best_match = None
        best_score = 0
        
        for _, row in drugbank_df.iterrows():
            name_score = fuzz.ratio(drug_name.lower(), str(row['name']).lower())
            if name_score > best_score and name_score >= threshold:
                best_score = name_score
                best_match = row

        return best_match, best_score

    def extract_drugbank_info(self, drug_data: pd.Series) -> Dict:
        """
        Extract relevant fields from DrugBank entry
        
        Args:
            drug_data (pd.Series): Single drug entry from DrugBank
            
        Returns:
            Dict: Formatted drug information
        """
        return {
            'drugbank_id': drug_data.get('drugbank_id', ''),
            'mechanism_of_action': drug_data.get('mechanism_of_action', ''),
            'pharmacodynamics': drug_data.get('pharmacodynamics', ''),
            'toxicity': drug_data.get('toxicity', ''),
            'drug_interactions': drug_data.get('drug_interactions', ''),
            'indication': drug_data.get('indication', '')
        }

    def validate_drugs(self, drugbank_df: pd.DataFrame) -> Dict:
        """
        Validate all drugs against DrugBank and prepare validation report
        
        Args:
            drugbank_df (pd.DataFrame): DrugBank database
            
        Returns:
            Dict: Validation results containing found and missing drugs
        """
        validation_results = {
            'found_drugs': [],
            'missing_drugs': []
        }

        total_drugs = 0
        for category, subcategories in self.drug_categories.items():
            for subcategory, drugs in subcategories.items():
                total_drugs += len(drugs)
                for drug in drugs:
                    match, score = self.find_drug_in_drugbank(drug, drugbank_df)
                    
                    if match is not None:
                        drug_info = self.extract_drugbank_info(match)
                        validation_results['found_drugs'].append({
                            'name': drug,
                            'category': category,
                            'subcategory': subcategory,
                            'match_score': score,
                            'drugbank_info': drug_info
                        })
                    else:
                        validation_results['missing_drugs'].append({
                            'name': drug,
                            'category': category,
                            'subcategory': subcategory
                        })

        print(f"\nValidation Summary:")
        print(f"Total drugs checked: {total_drugs}")
        print(f"Found in DrugBank: {len(validation_results['found_drugs'])}")
        print(f"Missing from DrugBank: {len(validation_results['missing_drugs'])}")
        
        return validation_results

    def save_results(self, validation_results: Dict, output_file: str):
        """
        Save validation results to JSON file with proper formatting
        
        Args:
            validation_results (Dict): Validation results to save
            output_file (str): Output file path
        """
        with open(output_file, 'w') as f:
            json.dump(validation_results, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to {output_file}")

def main():
    """Run the DrugBank validation: load the CSV, validate, save the report.

    Stops without writing anything when the DrugBank data cannot be loaded.
    """
    framework = DrugValidationFramework()

    # The loader returns None after reporting a problem.
    dataset = framework.load_drugbank_data('drugbank_data.csv')
    if dataset is not None:
        report = framework.validate_drugs(dataset)
        framework.save_results(report, 'drug_validation_results.json')


if __name__ == "__main__":
    main()

Successfully loaded DrugBank data with 16581 entries



Validation Summary:
Total drugs checked: 100
Found in DrugBank: 99
Missing from DrugBank: 1

Results saved to drug_validation_results.json


### The code below creates visualizations of the cross-validation that was carried out.

In [11]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import seaborn as sns
import matplotlib.pyplot as plt
import json
from typing import Dict, List
from scipy import stats

class ThesisVisualizer:
    def __init__(self):
        """Initialize visualizer with data and settings"""
        # Image export settings
        self.image_format = 'png'
        self.image_scale = 2
        self.image_width = 1200
        self.image_height = 800
        
        # Load data
        print("Loading data files...")
        try:
            with open('strict_comparison_results.json', 'r') as f:
                self.results = json.load(f)
            with open('drug_validation_results.json', 'r') as f:
                self.drugbank_data = json.load(f)
            print("Data loaded successfully")
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            raise
            
        self.prepare_data()

    def prepare_data(self):
        """Prepare data for visualization"""
        print("Preparing data for analysis...")
        data_list = []
        drug_categories = {}
        
        # Extract drug categories
        for drug in self.drugbank_data['found_drugs']:
            drug_categories[drug['name']] = {
                'category': drug['category'],
                'subcategory': drug['subcategory']
            }
        
        # Create detailed dataset
        for drug_name, drug_data in self.results['detailed_results'].items():
            category = drug_categories.get(drug_name, {'category': 'Unknown', 'subcategory': 'Unknown'})
            for field, scores in drug_data['detailed_scores'].items():
                data_list.append({
                    'drug_name': drug_name,
                    'category': category['category'],
                    'subcategory': category['subcategory'],
                    'comparison_type': field,
                    'semantic_similarity': scores['semantic_similarity'],
                    'technical_overlap': scores['technical_term_overlap'],
                    'content_coverage': scores['content_coverage'],
                    'weighted_score': scores['weighted_score']
                })
        
        self.df = pd.DataFrame(data_list)
        print(f"Processed {len(self.df)} data points across {len(self.df['category'].unique())} categories")

    def save_figure(self, fig, filename_base: str):
        """Save figure in both HTML and image formats"""
        try:
            # Save interactive HTML
            fig.write_html(f'visualizations/{filename_base}.html')
            
            # Save static image
            fig.write_image(
                f'visualizations/{filename_base}.{self.image_format}',
                width=self.image_width,
                height=self.image_height,
                scale=self.image_scale
            )
            print(f"Saved {filename_base} successfully")
        except Exception as e:
            print(f"Error saving {filename_base}: {str(e)}")

    def create_overall_performance(self):
        """Build and save the 2x2 'Overall Performance Analysis' figure

        Panels: mean weighted score per category, box plots of the four
        metrics, mean weighted score per comparison type, and weighted score
        plotted against each row's running index within its comparison type.
        As a side effect a 'sequence' column is added to self.df.
        """
        print("Creating overall performance visualization...")
        panel_titles = (
            'Category Performance',
            'Metric Distribution',
            'Comparison Types',
            'Time Series'
        )
        fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)

        # Top-left: average weighted score of each category
        by_category = self.df.groupby('category')['weighted_score'].mean().reset_index()
        category_bars = go.Bar(
            x=by_category['category'],
            y=by_category['weighted_score'],
            name='Category Performance'
        )
        fig.add_trace(category_bars, row=1, col=1)

        # Top-right: one box per metric
        for column in ('semantic_similarity', 'technical_overlap',
                       'content_coverage', 'weighted_score'):
            label = column.replace('_', ' ').title()
            fig.add_trace(go.Box(y=self.df[column], name=label), row=1, col=2)

        # Bottom-left: average weighted score of each comparison type
        by_comparison = self.df.groupby('comparison_type')['weighted_score'].mean().reset_index()
        comparison_bars = go.Bar(
            x=by_comparison['comparison_type'],
            y=by_comparison['weighted_score'],
            name='Comparison Types'
        )
        fig.add_trace(comparison_bars, row=2, col=1)

        # Bottom-right: running index within each comparison type on x.
        # The index is stored on self.df, so the column persists afterwards.
        self.df['sequence'] = self.df.groupby('comparison_type').cumcount()
        sequence_line = go.Scatter(
            x=self.df['sequence'],
            y=self.df['weighted_score'],
            mode='lines',
            name='Performance Over Time'
        )
        fig.add_trace(sequence_line, row=2, col=2)

        fig.update_layout(
            height=1000,
            title_text='Overall Performance Analysis',
            showlegend=True,
            font=dict(size=14)
        )

        self.save_figure(fig, 'overall_performance')

    def create_detailed_metrics(self):
        """Build and save the 2x2 'Detailed Metrics Analysis' figure

        Panels: semantic similarity vs technical overlap (coloured by
        weighted score), content coverage vs weighted score (coloured by
        semantic similarity), weighted-score box plot per category, and a
        heatmap of the correlations between the four metrics.
        """
        print("Creating detailed metrics visualization...")

        metric_columns = ['semantic_similarity', 'technical_overlap',
                          'content_coverage', 'weighted_score']
        frame = self.df

        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'Semantic vs Technical',
                'Coverage vs Weighted',
                'Category Distribution',
                'Metric Correlations'
            )
        )

        # Top row: two marker scatters that differ only in the columns used.
        # Each tuple is (x column, y column, colour column, trace name, col).
        scatter_panels = (
            ('semantic_similarity', 'technical_overlap', 'weighted_score',
             'Semantic vs Technical', 1),
            ('content_coverage', 'weighted_score', 'semantic_similarity',
             'Coverage vs Weighted', 2),
        )
        for x_col, y_col, colour_col, trace_name, col_index in scatter_panels:
            points = go.Scatter(
                x=frame[x_col],
                y=frame[y_col],
                mode='markers',
                name=trace_name,
                marker=dict(
                    color=frame[colour_col],
                    colorscale='Viridis',
                    showscale=True
                )
            )
            fig.add_trace(points, row=1, col=col_index)

        # Bottom-left: weighted-score distribution of every category
        for category_name in frame['category'].unique():
            in_category = frame['category'] == category_name
            fig.add_trace(
                go.Box(y=frame.loc[in_category, 'weighted_score'], name=category_name),
                row=2, col=1
            )

        # Bottom-right: correlation matrix of the four metrics
        x_labels = [name.replace('_', ' ').title() for name in metric_columns]
        y_labels = [name.replace('_', ' ').title() for name in metric_columns]
        corr_matrix = frame[metric_columns].corr()
        fig.add_trace(
            go.Heatmap(z=corr_matrix, x=x_labels, y=y_labels, colorscale='RdBu'),
            row=2, col=2
        )

        fig.update_layout(
            height=1000,
            title_text='Detailed Metrics Analysis',
            showlegend=True,
            template='plotly_white'
        )

        self.save_figure(fig, 'detailed_metrics')

    def create_category_analysis(self):
        """Create category-specific analysis

        For every category in self.df a 2x2 figure is built from that
        category's rows only and saved as 'category_<name>':
        metric box plots, mean weighted score per subcategory, mean weighted
        score per comparison type, and weighted score in row order with one
        line per comparison type.
        """
        print("Creating category analysis...")
        metrics = ['semantic_similarity', 'technical_overlap',
                  'content_coverage', 'weighted_score']

        for category in self.df['category'].unique():
            cat_data = self.df[self.df['category'] == category]
            
            fig = make_subplots(
                rows=2, cols=2,
                subplot_titles=(
                    'Metric Distribution',
                    'Subcategory Performance',
                    'Comparison Types',
                    'Performance Timeline'
                )
            )
            
            # 1. Metric Distribution
            for metric in metrics:
                fig.add_trace(
                    go.Box(
                        y=cat_data[metric],
                        name=metric.replace('_', ' ').title()
                    ),
                    row=1, col=1
                )

            # 2. Subcategory Performance
            sub_means = cat_data.groupby('subcategory')['weighted_score'].mean().reset_index()
            fig.add_trace(
                go.Bar(
                    x=sub_means['subcategory'],
                    y=sub_means['weighted_score'],
                    name='Subcategory Performance'
                ),
                row=1, col=2
            )

            # 3. Comparison Types
            comp_means = cat_data.groupby('comparison_type')['weighted_score'].mean().reset_index()
            fig.add_trace(
                go.Bar(
                    x=comp_means['comparison_type'],
                    y=comp_means['weighted_score'],
                    name='Comparison Types'
                ),
                row=2, col=1
            )

            # 4. Performance Timeline
            # The data carries no timestamps; x is each row's position within
            # its comparison type. A local index is used so self.df is not
            # modified.
            for comparison_type, type_rows in cat_data.groupby('comparison_type'):
                fig.add_trace(
                    go.Scatter(
                        x=list(range(len(type_rows))),
                        y=type_rows['weighted_score'],
                        mode='lines',
                        name=str(comparison_type)
                    ),
                    row=2, col=2
                )
            
            fig.update_layout(
                height=1000,
                title_text=f'Analysis for {category}',
                showlegend=True
            )
            
            self.save_figure(fig, f'category_{category.lower().replace(" ", "_")}')

    def create_statistical_summary(self):
        """Create statistical summary visualization

        Runs a one-way ANOVA on the weighted score across categories, prints
        the result, and saves a 2x2 figure as 'statistical_summary':
        category means with 95% confidence intervals, weighted-score box
        plot per category, the ANOVA F statistic and p-value, and the
        min-to-max range of each category.
        """
        print("Creating statistical summary...")
        categories = list(self.df['category'].unique())
        groups = [
            self.df[self.df['category'] == cat]['weighted_score']
            for cat in categories
        ]

        # Perform statistical tests.
        # A one-way ANOVA needs at least two groups; report NaN otherwise
        # instead of failing inside scipy.
        if len(groups) >= 2:
            f_stat, p_val = stats.f_oneway(*groups)
        else:
            f_stat, p_val = float('nan'), float('nan')
        print(f"One-way ANOVA across categories: F = {f_stat:.4f}, p = {p_val:.4g}")

        # Per-category summary statistics used by the panels below
        means = []
        ci_half_widths = []
        minima = []
        spans = []
        for scores in groups:
            count = len(scores)
            means.append(scores.mean())
            # The 95% t-interval is undefined for a single observation.
            if count > 1:
                ci_half_widths.append(stats.sem(scores) * stats.t.ppf(0.975, count - 1))
            else:
                ci_half_widths.append(0.0)
            minima.append(scores.min())
            spans.append(scores.max() - scores.min())
        
        # Create visualization
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'Category Means with CI',
                'Distribution Comparison',
                'Statistical Tests',
                'Performance Range'
            )
        )
        
        # 1. Category Means with CI
        fig.add_trace(
            go.Bar(
                x=categories,
                y=means,
                error_y=dict(type='data', array=ci_half_widths),
                name='Mean (95% CI)'
            ),
            row=1, col=1
        )

        # 2. Distribution Comparison
        for cat, scores in zip(categories, groups):
            fig.add_trace(
                go.Box(
                    y=scores,
                    name=cat
                ),
                row=1, col=2
            )

        # 3. Statistical Tests
        fig.add_trace(
            go.Bar(
                x=['F-statistic', 'p-value'],
                y=[f_stat, p_val],
                text=[f"{f_stat:.3f}", f"{p_val:.3g}"],
                textposition='auto',
                name='One-way ANOVA'
            ),
            row=2, col=1
        )

        # 4. Performance Range: each bar runs from the category minimum
        # (base) to its maximum (base + span).
        fig.add_trace(
            go.Bar(
                x=categories,
                y=spans,
                base=minima,
                name='Range (min to max)'
            ),
            row=2, col=2
        )
        
        fig.update_layout(
            height=1000,
            title_text='Statistical Analysis Summary',
            showlegend=True
        )
        
        self.save_figure(fig, 'statistical_summary')

    def create_all_visualizations(self):
        """Create all visualizations"""
        print("\nStarting visualization generation...")
        
        # Create output directory if it doesn't exist
        import os
        if not os.path.exists('visualizations'):
            os.makedirs('visualizations')
        
        # Generate all visualizations
        self.create_overall_performance()
        self.create_detailed_metrics()
        self.create_category_analysis()
        self.create_statistical_summary()
        
        print("\nAll visualizations completed successfully!")

def main():
    """Build the visualizer and generate all figures.

    Any exception raised along the way is reported on stdout instead of
    propagating.
    """
    try:
        ThesisVisualizer().create_all_visualizations()
    except Exception as e:
        print(f"Error in visualization generation: {str(e)}")


if __name__ == "__main__":
    main()


Loading data files...
Data loaded successfully
Preparing data for analysis...
Processed 294 data points across 9 categories

Starting visualization generation...
Creating overall performance visualization...
Saved overall_performance successfully
Creating detailed metrics visualization...
Saved detailed_metrics successfully
Creating category analysis...
Saved category_cardiovascular successfully
Saved category_cns_agents successfully
Saved category_pain_analgesics successfully
Saved category_anti_infectives successfully
Saved category_diabetes_metabolic successfully
Saved category_gastrointestinal successfully
Saved category_respiratory successfully
Saved category_immunosuppressants successfully
Saved category_oncology successfully
Creating statistical summary...
Saved statistical_summary successfully

All visualizations completed successfully!
