In [None]:
import streamlit as st
import pandas as pd
from transformers import pipeline
from langdetect import detect
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Fetch the NLTK 'punkt' and 'stopwords' resources. These calls run every time
# the module is executed or imported.
# NOTE(review): 'punkt' is presumably what word_tokenize needs; nothing in the
# visible code uses the stopwords corpus — confirm it is still required.
nltk.download('punkt')
nltk.download('stopwords')

def load_medquad_dataset(csv_path, txt_path):
    """Build question/answer records from a CSV file and a companion text file.

    The CSV's ``Answer`` column supplies the answers (surrounding whitespace
    in the header names is stripped before the lookup). Line *i* of the text
    file is used as the question for CSV row *i*. Pairing stops at the shorter
    of the two inputs.

    Args:
        csv_path: Path to a CSV file containing an ``Answer`` column.
        txt_path: Path to a UTF-8 text file whose lines are used as questions.

    Returns:
        A list of ``{'question': str, 'answer': str}`` dicts with surrounding
        whitespace stripped from both values. Rows whose answer cell is
        missing are skipped.

    Raises:
        KeyError: If the CSV has no ``Answer`` column.
    """
    data = pd.read_csv(csv_path)
    data.columns = data.columns.str.strip()

    # Explicit encoding so decoding does not depend on the platform locale.
    with open(txt_path, 'r', encoding='utf-8') as f:
        questions = f.readlines()

    records = []
    # zip() truncates to the shorter input, so extra CSV rows or extra text
    # lines are ignored while row i stays paired with line i.
    for question, answer in zip(questions, data['Answer']):
        # pandas yields NaN (a float) for empty cells; it has no .strip().
        if pd.isna(answer):
            continue
        records.append({
            'question': question.strip(),
            # str() guards against cells pandas parsed as numbers.
            'answer': str(answer).strip(),
        })
    return records

def clean_text(text):
    """Normalize text to lowercase alphanumeric tokens joined by single spaces.

    The text is split with ``word_tokenize``; tokens that are not purely
    alphanumeric (per ``str.isalnum``) are dropped, and the rest are
    lowercased.
    """
    kept = []
    for token in word_tokenize(text):
        if token.isalnum():
            kept.append(token.lower())
    return ' '.join(kept)

def search_medquad_data(query, data, n=5):
    """Return up to ``n`` entries whose question or answer contains the query.

    Matching is a substring test on the ``clean_text``-normalized query
    against the normalized question and answer of each entry, in the order
    the entries appear in ``data``.

    Args:
        query: Free-text search string.
        data: Iterable of dicts with ``'question'`` and ``'answer'`` keys.
        n: Maximum number of entries to return; values <= 0 yield no results.

    Returns:
        A list of at most ``n`` matching entries (the original dict objects).
        Empty when nothing matches or when the query has no alphanumeric
        tokens left after cleaning.
    """
    # Check the limit up front: testing it only after an append would still
    # hand back one result for n <= 0.
    if n <= 0:
        return []

    cleaned_query = clean_text(query)
    # An empty string is a substring of every string, so a query made only of
    # punctuation would otherwise "match" every entry.
    if not cleaned_query:
        return []

    results = []
    for entry in data:
        if (cleaned_query in clean_text(entry['question'])
                or cleaned_query in clean_text(entry['answer'])):
            results.append(entry)
            if len(results) >= n:
                break

    return results

def setup_translation_pipeline(source_lang, target_lang):
    """Create a ``transformers`` translation pipeline for a language pair.

    Supported pairs are English to Spanish/French/German and each of those
    three back to English.

    Raises:
        ValueError: If no model is registered for the requested pair.
    """
    model_map = {
        'en': {
            'es': 'Helsinki-NLP/opus-mt-en-es',
            'fr': 'Helsinki-NLP/opus-mt-en-fr',
            'de': 'Helsinki-NLP/opus-mt-en-de',
        },
        'es': {'en': 'Helsinki-NLP/opus-mt-es-en'},
        'fr': {'en': 'Helsinki-NLP/opus-mt-fr-en'},
        'de': {'en': 'Helsinki-NLP/opus-mt-de-en'},
    }

    # Reject unsupported pairs first so the happy path reads straight through.
    targets = model_map.get(source_lang)
    if targets is None or target_lang not in targets:
        raise ValueError(f"Translation model not available for {source_lang} to {target_lang}")

    model_name = targets[target_lang]
    return pipeline("translation", model=model_name)

def display_answers(results):
    """Write each result to the Streamlit page as a numbered Q/A section.

    When ``results`` is empty, a single "No relevant answers found." message
    is written instead.
    """
    if not results:
        st.write("No relevant answers found.")
        return

    # Number the questions from 1 for display.
    for number, item in enumerate(results, start=1):
        st.write(f"### Q{number}: {item['question']}")
        st.write(f"**Answer**: {item['answer']}")
        st.write("---")

# Streamlit app entry point
def main():
    """Render the Streamlit page: load the dataset, read a question, show answers.

    The question is first searched as typed. If nothing matches and the
    detected language is not English, the question is translated to English
    and searched again. If no translation model exists for the detected
    language, the error message is shown and nothing else is rendered.
    """
    # Local import keeps this function self-contained; the exception type is
    # only needed to guard the detect() call below.
    from langdetect import LangDetectException

    st.title("Multilingual Medical Q&A Chatbot")

    csv_path = "/Users/cansarma/Desktop/Ollama/Task 3/MedQuAD-master/QA-TestSet-LiveQA-Med-Qrels-2479-Answers/All-2479-Answers-retrieved-from-MedQuAD.csv"
    txt_path = "/Users/cansarma/Desktop/Ollama/Task 3/MedQuAD-master/QA-TestSet-LiveQA-Med-Qrels-2479-Answers/All-qrels_LiveQAMed2017-TestQuestions_2479_Judged-Answers.txt"

    st.write("Loading MedQuAD dataset...")
    data = load_medquad_dataset(csv_path, txt_path)
    st.write("Dataset loaded.")

    user_input = st.text_input("Ask a medical question")
    if not user_input:
        return

    # detect() raises when the text has no usable features (e.g. only digits,
    # punctuation or whitespace); fall back to searching the input as-is.
    try:
        user_lang = detect(user_input)
    except LangDetectException:
        user_lang = None
        st.write("Could not detect the language of the question.")
    else:
        st.write(f"Detected language: {user_lang}")

    st.write("### Searching for relevant answers...")
    results = search_medquad_data(user_input, data)

    # Translating only makes sense for a known, non-English language; an
    # English question with no hits should just report "no answers" rather
    # than fail looking for an en -> en model.
    if not results and user_lang not in (None, 'en'):
        st.write("No relevant answers found. Translating question to English...")

        try:
            translator = setup_translation_pipeline(user_lang, 'en')
        except ValueError as e:
            st.write(str(e))
            return

        translated_input = translator(user_input)[0]['translation_text']
        st.write(f"Translated Question: {translated_input}")

        results = search_medquad_data(translated_input, data)

    display_answers(results)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
