<div class="alert alert-block alert-success">
<b>DOCUMENTS</b>
</div>

In [109]:
import json
import os #file handling
import re #regular expressions (abbreviation expansion)
import xml.etree.ElementTree as ET #XML parsing

from bs4 import BeautifulSoup

`with open(file_path, 'r', encoding='utf-8') as f:`

- This line opens the file located at `file_path` in read mode (`'r'`).
- The `encoding='utf-8'` argument makes Python decode the file as UTF-8 instead of the platform's default encoding, which matters for text that contains non-ASCII characters.
- The `with` statement guarantees that the file is closed when the indented block finishes, even if an error occurs inside it. Within that block, the variable `f` is the open file object.

# Define functions for reading files

In [110]:
def read_json(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value produced by ``json.load``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than with the platform default
    # (e.g. cp1252 on Windows), as read_html already does.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def read_txt(file_path):
    """Read a plain-text file and return its whitespace-separated words.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A dict ``{"content": [word, ...]}`` where the list is the file's text
        split on whitespace.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Decode explicitly as UTF-8 rather than with the platform default
    # (e.g. cp1252 on Windows), as read_html already does.
    with open(file_path, 'r', encoding='utf-8') as file:
        return {"content": file.read().split()}

def read_xml(file_path):
    """Collect the non-empty text of every element in an XML file, grouped by tag.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        A dict mapping each tag name to the list of stripped, non-empty
        ``.text`` values of the elements carrying that tag, in document order.
        ``None`` if the file does not exist or is not well-formed XML (a
        message is printed in both cases).
    """
    try:
        root = ET.parse(file_path).getroot()
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except ET.ParseError:
        print(f"Error parsing XML file: {file_path}")
        return None

    texts_by_tag = {}
    # root.iter() walks the root element and all of its descendants.
    for node in root.iter():
        stripped = (node.text or "").strip()
        if stripped:
            texts_by_tag.setdefault(node.tag, []).append(stripped)
    return texts_by_tag
    
def read_html(file_path):
    """Collect the non-empty text of every tag in an HTML file, grouped by tag name.

    Args:
        file_path: Path of the HTML file (read as UTF-8).

    Returns:
        A dict mapping each tag name to the list of stripped, non-empty
        ``get_text()`` results of the tags with that name. ``None`` if the
        file is missing or any other error occurs (a message is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')

        texts_by_tag = {}
        # find_all(True) matches every tag in the parsed document.
        for node in soup.find_all(True):
            stripped = node.get_text().strip()
            if not stripped:
                continue
            texts_by_tag.setdefault(node.name, []).append(stripped)
        return texts_by_tag

    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error parsing HTML file: {file_path}, Error: {str(e)}")
        return None

In [111]:
def read_document(file_path):
    """Read a document with the reader that matches its file extension.

    Args:
        file_path: Path of the document. Supported extensions are
            ``.json``, ``.txt``, ``.xml`` and ``.html`` (case-insensitive).

    Returns:
        Whatever the matching ``read_*`` function returns.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    # Compare the lower-cased extension so "REPORT.JSON" is treated like "report.json".
    extension = os.path.splitext(file_path)[1].lower()
    if extension == '.json':
        return read_json(file_path)
    if extension == '.txt':
        return read_txt(file_path)
    if extension == '.xml':
        return read_xml(file_path)
    if extension == '.html':
        return read_html(file_path)
    raise ValueError("Unsupported file format.")

# Extract text with keys for tokenization

In [112]:
def extract_text_with_keys(data):
    """Flatten nested dict/list/str data into one space-separated string.

    Args:
        data: A string, list or dict (arbitrarily nested); any other type
            contributes nothing.

    Returns:
        - for a str: the string itself;
        - for a list: the flattened text of its str/list/dict items joined by spaces;
        - for a dict: ``"key: text"`` for each str/list/dict value, joined by spaces;
        - for anything else: ``''``.
    """
    text_like = (str, list, dict)

    if isinstance(data, str):
        return data

    if isinstance(data, list):
        pieces = [extract_text_with_keys(entry) for entry in data if isinstance(entry, text_like)]
        return ' '.join(pieces)

    if isinstance(data, dict):
        labelled = []
        for field, payload in data.items():
            # Values of other types (numbers, None, ...) are skipped entirely.
            if isinstance(payload, text_like):
                labelled.append(f"{field}: {extract_text_with_keys(payload)}")
        return ' '.join(labelled)

    return ''

# Expand punctuated abbreviations

In [113]:
# Punctuated anatomical / prescription abbreviations and their expansions.
_PUNCTUATED_ABBREVIATIONS = {
    "aa.": "arteriae",
    "vv.": "venae",
    "v.": "vena",
    "a.": "arteria",
    "nn.": "nervi",
    "n.": "nervus",
    "mm.": "musculi",
    "m.": "musculus",
    "ligg.": "ligamenta",
    "lig.": "ligamentum",
    "procc.": "processus",
    "proc.": "processus",
    "art.": "articulatio",
    "ggll.": "ganglia",
    "ggl.": "ganglion",
    "q.d.": "once a day",
    "b.i.d.": "twice a day",
    "t.i.d.": "three times a day",
    "q.i.d.": "four times a day",
    "q.o.d.": "every other day",
    "a.c.": "before meals",
    "p.c.": "after meals",
    "p.r.n.": "as needed",
    "p.o.": "by mouth",
    "i.v.": "intravenous",
    "i.m.": "intramuscular",
    "s.c.": "subcutaneous",
    "n.p.o.": "nothing by mouth",
    "c.c.": "with food",
    "a1.": "alpha 1",
    "A.B.C.": "airway, breathing, circulation",
    "etc.": "et cetera"
}

# Single alternation over all abbreviations, longest first so that "a.c." is
# tried before "a." at the same position.
# - The lookbehind rejects a match that starts inside a word or right after a
#   period ("circulation." must not match "n.", "i.v." must not match "v.").
# - The lookahead rejects a match glued to a following word character
#   ("a.m." must not match "a.").
_PUNCTUATED_ABBREVIATION_RE = re.compile(
    r"(?<![\w.])(?:"
    + "|".join(
        re.escape(abbrev)
        for abbrev in sorted(_PUNCTUATED_ABBREVIATIONS, key=len, reverse=True)
    )
    + r")(?!\w)"
)


def expand_punctuated_abbreviations(text):
    """Replace punctuated abbreviations (e.g. "aa.", "i.v.") with their full forms.

    Matching is case-sensitive and happens in a single pass, so an expansion
    is never re-scanned and a short abbreviation can no longer fire inside a
    longer one or at the end of an ordinary word.

    Args:
        text: The string to expand.

    Returns:
        ``text`` with every stand-alone abbreviation replaced by its expansion.
    """
    return _PUNCTUATED_ABBREVIATION_RE.sub(
        lambda match: _PUNCTUATED_ABBREVIATIONS[match.group(0)], text
    )

# Tokenizing sentences and words using NLTK

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
def tokenize(text):
    """Lower-case the input and split it into a flat list of word tokens.

    Args:
        text: A string, a list of strings (joined with spaces), or a dict
            (rendered as ``"key: value"`` pairs, list values joined with spaces).

    Returns:
        A flat list of tokens: the text is split into sentences with
        ``sent_tokenize`` and each sentence into words with ``word_tokenize``.
    """
    # Build a single string first: .lower() exists only on str, so dict and
    # list inputs have to be joined before the text is lower-cased.
    if isinstance(text, dict):
        text = ' '.join(f"{k}: {' '.join(v) if isinstance(v, list) else v}" for k, v in text.items())
    elif isinstance(text, list):
        text = ' '.join(text)
    text = text.lower()

    return [word for sentence in sent_tokenize(text) for word in word_tokenize(sentence)]
print(tokenize("Surgical ab- aaron's clipping or endovascular-coiling is often guided by preoperative assessment of the aa. communicans and aa. basilaris to ensure collateral circulation. The a. cerebri posterior, though less frequently affected, can also be a site of aneurysm formation in patients with hypertension or connective tissue disorders."))

In [None]:
# Demo: expand the punctuated abbreviations in a sample paragraph, then
# tokenize the expanded text and print the resulting token list.
text = "Understanding vascular variability is critical in managing cerebrovascular disorders. Variations in aa. communicans, aa. cerebri, and vv. cerebrales significantly impact clinical outcomes and treatment approaches. Advances in imaging technologies are enhancing the precision of diagnostic capabilities, aiding clinicians in the management of complex cerebrovascular conditions. Ongoing research on anatomical variations promises to further refine diagnostic protocols and therapeutic interventions in cerebrovascular medicine."
expanded_text = expand_punctuated_abbreviations(text)
tokens = tokenize(expanded_text)
print(tokens)

# Calculate frequencies of each term

In [117]:
from collections import Counter

def calculate_word_frequency(tokens_per_doc):
    """Count how often each token occurs.

    Args:
        tokens_per_doc: Either a flat list of tokens or a list of token lists
            (one per sentence/document). The two shapes may be mixed.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    word_frequencies = Counter()
    for entry in tokens_per_doc:
        if isinstance(entry, str):
            # A bare string is a single token; Counter.update(str) would
            # count its individual characters instead.
            word_frequencies[entry] += 1
        else:
            word_frequencies.update(entry)
    return word_frequencies

# Text correction

In [118]:
# Medical vocabulary mapped to integer frequency values. The entries are added
# to the SymSpell dictionary by initialize_symspell_with_dictionaries and are
# passed to update_medical_terms_with_real_frequencies in the main loop.
medical_terms_dict = {
    "flu": 1000,
    "cyst": 950,
    "tumor": 900,
    "fever": 890,
    "cancer": 880,
    "pain": 870,
    "stroke": 850,
    "rash": 840,
    "asthma": 830,
    "diabetes": 820,
    "anemia": 800,
    "infection": 780,
    "sepsis": 760,
    "biopsy": 750,
    "fracture": 740,
    "allergy": 730,
    "nausea": 720,
    "fatigue": 700,
    "migraine": 680,
    "neoplasm": 670,
    "arthritis": 650,
    "chronic": 640,
    "diagnosis": 630,
    "therapy": 620,
    "cardiology": 600,
    "neurosis": 590,
    "immunity": 580,
    "concussion": 570,
    "respiratory": 560,
    "immunology": 550,
    "hematology": 540,
    "pathology": 530,
    "urology": 520,
    "radiology": 510,
    "chemotherapy": 500,
    "pharmacology": 490,
    "epidemiology": 480,
    "cardiogram": 470,
    "endocrinology": 460,
    "psychology": 450,
    "hematoma": 440,
    "intubation": 430,
    "anaphylaxis": 420,
    "psychiatry": 410,
    "oncology": 400,
    "fibromyalgia": 390,
    "hypertension": 380,
    "nephrology": 370,
    "pediatrics": 360,
    "ophthalmology": 350,
    "osteoporosis": 340,
    "gastroenteritis": 330,
    "rheumatology": 320,
    "thrombosis": 310,
    "anesthesiology": 300,
    "pulmonology": 290,
    "electrocardiogram": 280,
    "electroencephalogram": 270,
    "histocompatibility": 260,
    "otorhinolaryngology": 250,
    "gastroenterology": 240,
    "angiocardiography": 230,
    "biopsychosocial": 220,
    "immunodeficiency": 210,
    "neuroendocrinology": 200,
}

In [119]:
def update_medical_terms_with_real_frequencies(medical_dict, word_frequencies):
    """Overwrite medical-term frequencies with the counts observed in a document.

    Args:
        medical_dict: ``{term: frequency}`` mapping; updated in place.
        word_frequencies: ``{word: count}`` mapping of observed frequencies.

    Returns:
        The same ``medical_dict`` object, where every term that also occurs in
        ``word_frequencies`` now carries the observed count. Words that are
        not already medical terms are ignored.
    """
    # Read the mapping passed in by the caller (not a notebook global).
    for word, frequency in word_frequencies.items():
        if word in medical_dict:
            medical_dict[word] = frequency
    return medical_dict

In [120]:
# Function to load both general and medical dictionaries
def initialize_symspell_with_dictionaries(general_dict_path, medical_dict):
    """Build a SymSpell instance from a general dictionary plus medical terms.

    Args:
        general_dict_path: Despite its name, the notebook passes the already
            loaded ``{term: frequency}`` mapping here, and that is the primary
            supported input. A string is treated as the path of a
            "term count" frequency file and loaded with
            ``SymSpell.load_dictionary``.
        medical_dict: ``{term: frequency}`` mapping of medical terms.

    Returns:
        The populated ``SymSpell`` instance.

    Raises:
        FileNotFoundError: If a path is given and SymSpell cannot load it.
    """
    # max_dictionary_edit_distance must be at least the max_edit_distance used
    # in correct_tokens_with_symspell; change both together.
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

    # Load the general dictionary from the argument (not from a notebook global).
    if isinstance(general_dict_path, str):
        if not sym_spell.load_dictionary(general_dict_path, term_index=0, count_index=1):
            raise FileNotFoundError(f"Dictionary file not found: {general_dict_path}")
    else:
        for term, freq in general_dict_path.items():
            sym_spell.create_dictionary_entry(term, freq)

    # Add medical terms to the SymSpell dictionary
    for term, freq in medical_dict.items():
        sym_spell.create_dictionary_entry(term, freq)

    return sym_spell

In [121]:
def load_general_dictionary(file_path):
    """Load a "term count" frequency dictionary from a text file.

    Each useful line holds a term followed by whitespace and an integer count.
    Blank lines and lines without a count are skipped.

    Args:
        file_path: Path of the frequency file.

    Returns:
        A dict mapping each term to its integer frequency.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If a count is not an integer.
    """
    dictionary = {}
    # utf-8-sig decodes plain UTF-8 and also drops a leading BOM if present.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            # Split once from the right: the last field is the count and
            # everything before it is the term.
            fields = line.strip().rsplit(None, 1)
            if len(fields) != 2:
                continue
            term, freq = fields
            dictionary[term] = int(freq)
    return dictionary

In [122]:
from symspellpy import SymSpell, Verbosity
def correct_tokens_with_symspell(sym_spell, tokens):
    """Spell-correct each token with SymSpell.

    Args:
        sym_spell: A populated ``SymSpell`` instance.
        tokens: Iterable of token strings.

    Returns:
        A list with one entry per input token: the first SymSpell suggestion
        when there is one, otherwise the token unchanged. Tokens that contain
        no letter are never looked up and are returned as they are.
    """
    corrected_tokens = []
    for word in tokens:
        # A token without any letter (punctuation, a number) is not a
        # misspelled word; a lookup could only turn it into an unrelated
        # short dictionary entry.
        if not any(char.isalpha() for char in word):
            corrected_tokens.append(word)
            continue

        # max_edit_distance must not exceed the max_dictionary_edit_distance
        # set in initialize_symspell_with_dictionaries; change both together.
        suggestions = sym_spell.lookup(word, Verbosity.ALL, max_edit_distance=2)
        # Keep the original word if no suggestion is found.
        corrected_tokens.append(suggestions[0].term if suggestions else word)

    return corrected_tokens

# Process documents

In [None]:
def read_document(file_path):
    """Read a document with the reader selected by its (lower-cased) extension.

    Args:
        file_path: Path of a ``.json``, ``.txt``, ``.xml`` or ``.html`` file.

    Returns:
        The result of the matching ``read_*`` function.

    Raises:
        ValueError: If the extension is none of the four supported ones.
    """
    _, suffix = os.path.splitext(file_path)
    file_extension = suffix.lower()

    if file_extension not in ('.json', '.txt', '.xml', '.html'):
        raise ValueError(f"Unsupported file type: {file_extension}")

    if file_extension == '.json':
        return read_json(file_path)
    if file_extension == '.txt':
        return read_txt(file_path)
    if file_extension == '.xml':
        return read_xml(file_path)
    return read_html(file_path)

# NOTE(review): absolute, machine-specific path; adjust it to the local docs folder.
folder_path = "C:/Users/yassm/OneDrive/Desktop/Github/Indexing and Search Engine for Medical Research Articles Project/docs"
documents = {}      # filename -> parsed content
file_name_map = {}  # filename -> integer document id
index = 0

# sorted() makes the id assignment reproducible; os.listdir returns entries in
# arbitrary order.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue  # sub-directories are not documents

    try:
        # Use read_document to read based on file type
        content = read_document(file_path)
    except ValueError as e:
        print(f"Error processing {filename}: {e}")
        continue  # unsupported or undecodable files get no document id

    if content is None:
        continue  # the reader already reported the failure

    # Only documents that were actually loaded receive an id, so the map and
    # `documents` always describe the same set of files.
    documents[filename] = content
    file_name_map[filename] = index
    index += 1

print("File Name Map:")
print(file_name_map)
print("\nDocuments Content:")
for doc, content in documents.items():
    print(doc, ":", content, "\n")

# Replace abbreviations in the list of tokens with their full forms

In [124]:
# Lower-case medical abbreviations mapped to their expansions. Looked up token
# by token in replace_abbreviations, so a key only matches a whole token.
abbreviation_mapping = {
    "abg": "arterial blood gases",
    "ace": "angiotensin converting enzyme",
    "acl": "anterior cruciate ligament",
    "adhd": "attention deficit hyperactivity disorder",
    "afib": "atrial fibrillation",
    "aids": "acquired immunodeficiency syndrome",
    "alp": "alkaline phosphatase",
    "als": "amyotrophic lateral sclerosis",
    "alt": "alanine aminotransferase",
    "amd": "age related macular degeneration",
    "ami": "acute myocardial infarction",
    "aodm": "adult onset diabetes mellitus",
    "ast": "aspartate aminotransferase",
    "avm": "arteriovenous malformation",
    "bid": "twice a day",
    "bmi": "body mass index",
    "bp": "blood pressure",
    "bph": "benign prostatic hypertrophy",
    "brca": "breast cancer gene",
    "bun": "blood urea nitrogen",
    "ca": "cancer or calcium",
    "ca-125": "cancer antigen 125",
    "cabg": "coronary artery bypass graft",
    "cad": "coronary artery disease",
    "cat": "computerized axial tomography",
    "cbc": "complete blood count",
    "chd": "congenital heart disease",
    "chf": "congestive heart failure",
    "cmv": "cytomegalovirus",
    "cns": "central nervous system",
    "copd": "chronic obstructive pulmonary disease",
    "cpk": "creatine phosphokinase",
    "cpr": "cardiopulmonary resuscitation",
    "crf": "chronic renal failure",
    "crp": "c reactive protein",
    "csf": "cerebrospinal fluid",
    "cva": "cerebrovascular accident",
    "cxr": "chest x ray",
    "d&c": "dilatation and curettage",
    "djd": "degenerative joint disease",
    "dm": "diabetes mellitus",
    "dtp": "diphtheria, tetanus, pertussis",
    "dvt": "deep vein thrombosis",
    "dx": "diagnosis",
    "ecg": "electrocardiogram",
    "echo": "echocardiogram",
    "eeg": "electroencephalogram",
    "emg": "electromyography",
    "ent": "ear, nose and throat",
    "ercp": "endoscopic retrograde cholangiopancreatography",
    "esr": "erythrocyte sedimentation rate",
    "esrd": "end stage renal (kidney) disease",
    "fsh": "follicle stimulating hormone",
    "gerd": "gastroesophageal reflux disease",
    "gi": "gastrointestinal",
    "gfr": "glomerular filtration rate",
    "gu": "genitourinary",
    "hav": "hepatitis a virus",
    "hbv": "hepatitis b virus",
    "hct": "hematocrit",
    "hcv": "hepatitis c virus",
    "hdl": "high density lipoprotein",
    "hgb": "hemoglobin",
    "hiv": "human immunodeficiency virus",
    "hpv": "human papilloma virus",
    "hrt": "hormone replacement therapy",
    "htn": "hypertension",
    "ibd": "inflammatory bowel disease",
    "ibs": "irritable bowel syndrome",
    "icd": "implantable cardioverter defibrillator",
    "icu": "intensive care unit",
    "iddm": "insulin dependent diabetes mellitus",
    "im": "intramuscular",
    "iud": "intrauterine device",
    "iv": "intravenous",
    "ivp": "intravenous pyelogram",
    "ldl": "low density lipoprotein",
    "lft": "liver function tests",
    "mi": "myocardial infarction",
    "mmr": "measles, mumps, and rubella",
    "mri": "magnetic resonance imaging",
    "mrsa": "methicillin resistant staphylococcus aureus",
    "ms": "multiple sclerosis",
    "ng": "nasogastric",
    "niddm": "non insulin dependent diabetes mellitus",
    "nkda": "no known drug allergies",
    "nsaid": "non steroidal anti inflammatory drug",
    "ocd": "obsessive compulsive disorder",
    "pad": "peripheral arterial disease",
    "pap": "papanicolau",
    "pat": "paroxysmal atrial tachycardia",
    "pet": "positron emission tomography",
    "pft": "pulmonary function test",
    "pid": "pelvic inflammatory disease",
    "pms": "premenstrual syndrome",
    "ppd": "purified protein derivative",
    "prn": "as needed",
    "psa": "prostate specific antigen",
    "pt": "prothrombin time",
    "pth": "parathyroid hormone",
    "ptsd": "post traumatic stress syndrome",
    "ptt": "partial thromboplastin time",
    "pud": "peptic ulcer disease",
    "pvc": "premature ventricular contraction",
    "qid": "four times a day",
    "ra": "rheumatoid arthritis",
    "rbc": "red blood cell",
    "rsv": "respiratory syncytial virus",
    "rx": "prescription",
    "sad": "seasonal affective disorder",
    "sids": "sudden infant death syndrome",
    "sle": "systemic lupus erythematosus",
    "sob": "shortness of breath",
    "std": "sexually transmitted disease",
    "t3": "triiodothyronine",
    "t4": "thyroxine",
    "tb": "tuberculosis",
    "tah": "total abdominal hysterectomy",
    "tia": "transient ischemic attack",
    "tibc": "total iron binding capacity",
    "tid": "three times a day",
    "tmj": "temporomandibular joint",
    "torch": "infections that may cause birth defects",
    "tsh": "thyroid stimulating hormone",
    "turp": "transurethral resection of prostate gland",
    "uri": "upper respiratory infection",
    "uti": "urinary tract infection",
    "xrt": "radiotherapy",
    "wbc": "white blood cell",
}


In [125]:
def replace_abbreviations(tokens):
    """Swap every token that is a known abbreviation for its full form.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list of the same length in which each token found in
        ``abbreviation_mapping`` is replaced by its expansion (one string,
        possibly containing spaces); all other tokens are kept unchanged.
    """
    return [abbreviation_mapping.get(token, token) for token in tokens]

# Remove stopwords and punctuation

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

In [None]:
def remove_stopwords_punctuation(tokens):
    """Drop stop words, punctuation tokens and pure-digit tokens.

    Args:
        tokens: A flat list of token strings, or a list of token lists (which
            is flattened first).

    Returns:
        A flat list of the remaining tokens in their original order and
        original casing. Stop-word matching ignores case, so "The" is removed
        just like "the".
    """
    stop_words = set(stopwords.words('english'))
    #add stopwords to stop_words here
    stop_words.update({
        "et", "cetera", "xml", "json", "schema", "attribute", "element", 
        "document", "data", "version", "type", "id", "name", "value", 
        "namespace", "lang", "date", "ref", "content", "tag", "api", "schema", 
        "url", "http", "https", "base", "file", "result", "request", "response", 
        "title", "cell", "text", "body", "value", "image", "reference", "record", 
        "metadata", "definition", "information", "description", "details", "model",
        "category", "item", "reference", "data", "type", "category", "link", "method", "yes", "no", 
        "the", "and", "or", "not", "this", "that", "it", "a", "an", "of", "in", "for", 
        "to", "on", "at", "by", "with", "as", "etc", "etcetera", "thus", "here", "there",
        "wiki", "author", "field", "main", "section", "article", "h1", "h2", "h3", "h4", 
        "h5", "h6", "p", "br", "a", "img", "ul", "li", "ol", "strong", "em", "meta", "link", 
        "script", "style", "head", "footer", "html", "svg", "abstract", "conclusion"
    })
    punctuation = set(string.punctuation)
    punctuation.update({'\n', '\t', '\r', '', '-', '--', '---'})
    if all(isinstance(i, list) for i in tokens):  # Check if it's a list of lists
        tokens = [word for sublist in tokens for word in sublist]  # Flatten the list
    # Every stop word above is lower-case, so compare the lower-cased token;
    # the token itself is returned unchanged.
    return [
        word for word in tokens
        if word.lower() not in stop_words
        and word not in punctuation
        and not word.isdigit()
    ]


# Example: run the cleaner on a small hand-written token list that mixes
# ordinary words, stop words, punctuation tokens and a number, then print
# the tokens that survive.
tokens_example = [
    "This", "is", "an", "example", "sentence", "-", "with", "punctuation", "and", "stopwords", ".", 
    "Also", "contains", "html", "tags", "like", "p", "and", "h1", "as", "well", "as", "a", "date", "2024", 
    "schema", "type", "and", "attribute", "--", "example-field"
]
cleaned_tokens_example = remove_stopwords_punctuation(tokens_example)
print(cleaned_tokens_example)


# Lemmatization using spaCy for better performance and accuracy

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

In [129]:
import spacy

nlp = spacy.load("en_core_web_sm")

def lemmatize_tokens(tokens):
    """Lemmatize tokens with the loaded spaCy pipeline ``nlp``.

    The tokens are joined with single spaces and re-processed by spaCy, so the
    output holds one lemma per spaCy token.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list of lemma strings.
    """
    return [piece.lemma_ for piece in nlp(" ".join(tokens))]

In [None]:
# Example: lemmatize a handful of inflected words (verb forms, a comparative,
# regular and irregular plurals).
tokens_example = ["running", "jumps", "better", "children", "dogs", "geese"]

# Call the lemmatize_tokens function
lemmatized_tokens_example = lemmatize_tokens(tokens_example)

# Print the original tokens next to their lemmatized forms for comparison
print("Original tokens: ", tokens_example)
print("Lemmatized tokens: ", lemmatized_tokens_example)

# MAIN.0

In [None]:
# Main preprocessing pipeline: turn every loaded document into its final list
# of cleaned, lemmatized tokens and keep a few intermediate results for inspection.
general_dict = load_general_dictionary("frequency_dictionary_en_30_000.txt")
# Initial spell-checker; it is rebuilt inside the loop for every document.
sym_spell = initialize_symspell_with_dictionaries(general_dict, medical_terms_dict)
tokens_per_doc = {}  # doc id -> final tokens (input of the index builders below)
tokens_before_removal = {}  # never filled here; only named in a commented-out print
word_frequencies_per_doc = {}  # filename -> frequency dict from calculate_word_frequency
tokens_after_tokenization = {}  # filename -> raw tokens
tokens_after_lemmatization_and_cleaning = {}  # filename -> final tokens
token_count_after_tokenization = 0  # corpus-wide raw token total
token_count_after_lemmatization_and_cleaning = 0  # corpus-wide final token total
for filename, content in documents.items():
    doc_id = file_name_map.get(filename)
    # Flatten the parsed document to text, expand punctuated abbreviations, tokenize.
    extracted_content = extract_text_with_keys(content)
    expanded_content = expand_punctuated_abbreviations(extracted_content)
    tokens = tokenize(expanded_content)
    tokens_after_tokenization[filename] = tokens
    token_count_after_tokenization += len(tokens)
    word_frequencies_per_doc[filename] = dict(calculate_word_frequency(tokens))
    # Feed this document's frequencies into the medical dictionary, then rebuild
    # SymSpell so the correction step sees the updated values.
    # NOTE(review): rebuilding the whole SymSpell dictionary per document looks
    # expensive - confirm it is intended.
    medical_terms_dict = update_medical_terms_with_real_frequencies(medical_terms_dict, word_frequencies_per_doc[filename])
    sym_spell = initialize_symspell_with_dictionaries(general_dict, medical_terms_dict)
    corrected_tokens = correct_tokens_with_symspell(sym_spell, tokens)
    replaced_tokens = replace_abbreviations(corrected_tokens)
    cleaned_tokens = remove_stopwords_punctuation(replaced_tokens)
    lemmatized_tokens = lemmatize_tokens(cleaned_tokens)
    # Clean a second time: lemmatize_tokens re-tokenizes the joined text, so the
    # lemma list is filtered again for stop words and punctuation.
    lemmatized_cleaned_tokens = remove_stopwords_punctuation(lemmatized_tokens)
    tokens_per_doc[doc_id] = lemmatized_cleaned_tokens
    tokens_after_lemmatization_and_cleaning[filename] = lemmatized_cleaned_tokens
    token_count_after_lemmatization_and_cleaning += len(lemmatized_cleaned_tokens)
    #print(f"cleaned tokens for '{filename}':", lemmatized_tokens)
for doc_id, content in tokens_per_doc.items():
    print(f"Tokens for document {doc_id}:", content)
#print(tokens_before_removal)
#print(word_frequencies_per_doc)

# Length of tokens before and after text preprocessing 

In [None]:
# Corpus-wide totals accumulated by the preprocessing loop: number of raw
# tokens versus number of tokens left after cleaning and lemmatization.
print(f"  Tokens after tokenization: {token_count_after_tokenization}")
print(f"  Tokens after lemmatization and cleaning: {token_count_after_lemmatization_and_cleaning}")

In [None]:
# Example of correct_tokens_with_symspell: deliberately misspelled tokens are
# run through the SymSpell instance left over from the preprocessing loop
# (the notebook-global `sym_spell`).
tokens = ["hypertensiom", "diagnosus", "hrpertension", "thys", "teh", "hyprtension", "mai", "cmments", "csmment", "Ths", "exmple", "txt", "sme", "spelng", "errrs"]
corrected_tokens = correct_tokens_with_symspell(sym_spell, tokens)

print("Original tokens:", tokens)
print("Corrected tokens:", corrected_tokens)

# Inverted index with TF

In [134]:
def create_inverted_index_with_tf(tokens_per_doc):
    """Build an inverted index holding term frequencies.

    Args:
        tokens_per_doc: Mapping ``doc_id -> list of terms``.

    Returns:
        ``{term: {doc_id: count}}`` where ``count`` is the number of times the
        term occurs in that document.
    """
    inverted_index = {}
    for doc_id, tokens in tokens_per_doc.items():
        for term in tokens:
            postings = inverted_index.setdefault(term, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1
    return inverted_index

# Inverted index with positions

In [135]:
def create_inverted_index_with_positions(tokens_per_doc):
    """Build an inverted index holding the positions of every term.

    Args:
        tokens_per_doc: Mapping ``doc_id -> list of terms``.

    Returns:
        ``{term: {doc_id: [position, ...]}}`` with 0-based positions in the
        document's token list, in increasing order.
    """
    inverted_index = {}
    for doc_id, tokens in tokens_per_doc.items():
        for position, term in enumerate(tokens):
            inverted_index.setdefault(term, {}).setdefault(doc_id, []).append(position)
    return inverted_index

# Inverted index with DF

In [136]:
def create_inverted_index_with_df(tokens_per_doc):
    """Build an inverted index with document frequency and posting list.

    Args:
        tokens_per_doc: Mapping ``doc_id -> list of terms`` (terms may be any
            hashable, e.g. strings or n-gram tuples).

    Returns:
        ``{term: {"DF": n_docs, "PL": {doc_id: count}}}`` where ``DF`` is the
        number of documents containing the term and ``PL`` maps each of those
        documents to the term's occurrence count.
    """
    inverted_index = {}
    for doc_id, tokens in tokens_per_doc.items():
        for word in tokens:
            entry = inverted_index.setdefault(word, {"DF": 0, "PL": {}})
            postings = entry["PL"]
            if doc_id in postings:
                postings[doc_id] += 1
            else:
                # First occurrence in this document: it now counts towards DF.
                postings[doc_id] = 1
                entry["DF"] += 1
    return inverted_index

# Call the inverted index functions

In [None]:
# Build the three inverted-index variants from the preprocessed tokens
# (tokens_per_doc: doc id -> final token list) and print each one in full.
inverted_index_tf = create_inverted_index_with_tf(tokens_per_doc)
print("Inverted Index with Term Frequency:", inverted_index_tf)

inverted_index_positions = create_inverted_index_with_positions(tokens_per_doc)
print("Inverted Index with Term Positions:", inverted_index_positions)

# This variant (DF + posting list) is the input of create_tf_idf_index.
inverted_index_df = create_inverted_index_with_df(tokens_per_doc)
print("Inverted Index with Document Frequency and Posting List:", inverted_index_df)

# Term Frequency-Inverse Document Frequency (TF-IDF) Index

# Words that cannot be indexed

In [None]:
import math

def create_tf_idf_index(inverted_index_df, total_docs):
    """Compute TF-IDF weights from a DF/posting-list inverted index.

    The weight of a term in a document is ``tf * log(total_docs / DF)``.

    Args:
        inverted_index_df: ``{term: {"DF": n_docs, "PL": {doc_id: tf}}}``.
        total_docs: Number of documents in the collection.

    Returns:
        A tuple ``(tf_idf_index, zero_weight_terms)``:
        ``tf_idf_index`` is ``{term: {doc_id: weight}}`` and
        ``zero_weight_terms`` is the set of terms that have a weight of 0 in
        at least one document.
    """
    tf_idf_index = {}
    zero_weight_terms = set()
    for term, data in inverted_index_df.items():
        idf = math.log(total_docs / data["DF"])
        weights = {doc_id: term_frequency * idf for doc_id, term_frequency in data["PL"].items()}
        tf_idf_index[term] = weights
        if any(weight == 0 for weight in weights.values()):
            zero_weight_terms.add(term)

    return tf_idf_index, zero_weight_terms
# N for the IDF is the number of documents that were actually tokenized and
# indexed. file_name_map is not used here because it may also contain entries
# for directory items that were skipped or could not be read.
total_docs = len(tokens_per_doc)
tf_idf_index, zero_weight_terms = create_tf_idf_index(inverted_index_df, total_docs)
print("TF-IDF Index:")
print(tf_idf_index)
print("\nWords with weight = 0 (Cannot be indexed):")
print(zero_weight_terms)

# Structure terms by document

In [None]:
def structure_terms_by_document(tf_idf_index):
    """Regroup a term-major TF-IDF index by document.

    Args:
        tf_idf_index: ``{term: {doc_id: weight}}``.

    Returns:
        ``{doc_id: [(term, weight), ...]}`` with the pairs in the iteration
        order of ``tf_idf_index``.
    """
    terms_by_doc = {}
    for term, doc_weights in tf_idf_index.items():
        for doc_id, weight in doc_weights.items():
            terms_by_doc.setdefault(doc_id, []).append((term, weight))
    return terms_by_doc

# Regroup the TF-IDF index per document and print every document's
# (term, weight) pairs.
terms_by_doc = structure_terms_by_document(tf_idf_index)
for doc_id, content in terms_by_doc.items():
    print(doc_id, ": ", content, "\n") 

# Retrieve top terms for each document

In [None]:
def get_top_weighted_terms(terms_by_doc, top_n=5):
    """Select the highest-weighted terms of every document.

    Args:
        terms_by_doc: ``{doc_id: [(term, weight), ...]}``.
        top_n: Maximum number of pairs to keep per document.

    Returns:
        ``{doc_id: [(term, weight), ...]}`` holding at most ``top_n`` pairs per
        document, sorted by weight in descending order (pairs with equal
        weight keep their input order).
    """
    def weight_of(pair):
        return pair[1]

    return {
        doc_id: sorted(terms, key=weight_of, reverse=True)[:top_n]
        for doc_id, terms in terms_by_doc.items()
    }

# Keep the five highest-weighted (term, weight) pairs of each document and print them.
top_weighted_terms_per_doc = get_top_weighted_terms(terms_by_doc, top_n=5)
for doc_id, terms in top_weighted_terms_per_doc.items():
    print(f"Top terms for document {doc_id}: {terms}")

<div class="alert alert-block alert-success">
<b>QUERY</b>
</div>

In [None]:
def user_query_interface():
    """Prompt for a search query and preprocess it like the indexed documents.

    The query goes through the same steps as the document pipeline:
    punctuated-abbreviation expansion, tokenization, spell correction
    (using the notebook-global ``sym_spell``), abbreviation replacement,
    stop-word/punctuation removal, lemmatization and a second cleaning pass.

    Returns:
        The list of cleaned, lemmatized query terms (also printed).
    """
    user_query = input("Enter your search query: ")

    # Expand punctuated abbreviations as the indexing pipeline does, so a query
    # such as "a. cerebri" produces the same terms that were indexed.
    expanded_query = expand_punctuated_abbreviations(user_query)

    tokens = tokenize(expanded_query)

    corrected_tokens = correct_tokens_with_symspell(sym_spell, tokens)

    replaced_tokens = replace_abbreviations(corrected_tokens)

    cleaned_tokens = remove_stopwords_punctuation(replaced_tokens)

    lemmatized_tokens = lemmatize_tokens(cleaned_tokens)

    # Second cleaning pass, mirroring the document pipeline.
    lemmatized_cleaned_tokens = remove_stopwords_punctuation(lemmatized_tokens)

    print("Cleaned tokens for query:", lemmatized_cleaned_tokens)
    return lemmatized_cleaned_tokens
query_terms = user_query_interface()

In [142]:
def calculate_relevance(tf_idf_index, query_terms, doc_id):
    """Score one document against a query by summing TF-IDF weights.

    Args:
        tf_idf_index: ``{term: {doc_id: weight}}``.
        query_terms: Iterable of query terms (a repeated term counts each time).
        doc_id: Identifier of the document to score.

    Returns:
        The sum of the weights of the query terms that occur in the document;
        ``0`` when none of them does.
    """
    return sum(
        tf_idf_index[term][doc_id]
        for term in query_terms
        if term in tf_idf_index and doc_id in tf_idf_index[term]
    )

In [143]:
def rank_documents_by_relevance(tf_idf_index, query_terms, documents, top_n=3):
    """Rank documents by their relevance to a query.

    Args:
        tf_idf_index: ``{term: {doc_id: weight}}``.
        query_terms: Iterable of query terms.
        documents: Mapping whose keys are the document ids to score.
        top_n: Number of results to return.

    Returns:
        A list of at most ``top_n`` ``(doc_id, score)`` pairs, sorted from the
        highest to the lowest score.
    """
    relevance_scores = {
        doc_id: calculate_relevance(tf_idf_index, query_terms, doc_id)
        for doc_id in documents.keys()
    }
    # Highest score first; the slice keeps only the best top_n pairs.
    return sorted(relevance_scores.items(), key=lambda pair: pair[1], reverse=True)[:top_n]

In [None]:
# Rank the indexed documents against the query terms and show the best three.
top_3_relevant_documents = rank_documents_by_relevance(tf_idf_index, query_terms, tokens_per_doc, top_n=3)
print("Top 3 most relevant documents:")
# Invert file_name_map (filename -> id) to look up file names by document id.
reversed_file_name_map = {v: k for k, v in file_name_map.items()}
for doc_id, score in top_3_relevant_documents:
    document_name = reversed_file_name_map.get(doc_id)
    print(f"Document: {doc_id} (Name: {document_name}), Relevance Score: {score}")

# Bigrams

In [145]:
from nltk import ngrams
def generate_two_grams_per_document(tokens_per_doc):
    """Generate the 2-grams of every document.

    Args:
        tokens_per_doc: Mapping ``doc_id -> list of tokens``.

    Returns:
        ``{doc_id: [2-gram, ...]}`` holding, for each document, the list
        produced by ``ngrams(tokens, 2)``.
    """
    return {doc_id: list(ngrams(tokens, 2)) for doc_id, tokens in tokens_per_doc.items()}

In [None]:
# Generate the 2-grams of every preprocessed document and print them.
two_grams_per_doc = generate_two_grams_per_document(tokens_per_doc)
print(two_grams_per_doc)

In [None]:
# Create inverted index for 2-grams: the DF/posting-list builder is reused
# with the 2-gram tuples taking the place of single terms.
inverted_index_two_grams = create_inverted_index_with_df(two_grams_per_doc)
print("Inverted Index for 2-grams:")
print(inverted_index_two_grams)

In [None]:
# TF-IDF weights for the 2-grams, computed with the same total_docs value as
# the single-term index.
tf_idf_index_two_grams, zero_weight_terms_2grams = create_tf_idf_index(inverted_index_two_grams, total_docs)
print("TF-IDF Index for 2-grams:")
print(tf_idf_index_two_grams)

In [149]:
def generate_bigrams(tokens):
    """Return the consecutive token pairs of a sequence.

    Args:
        tokens: A sequence of tokens.

    Returns:
        A list of 2-tuples ``(tokens[i], tokens[i + 1])``; an empty list when
        there are fewer than two tokens.
    """
    if len(tokens) >= 2:
        # Pair every token with its successor.
        return list(zip(tokens, tokens[1:]))
    return []

In [None]:
# Turn the preprocessed query terms into consecutive pairs for 2-gram matching.
query_terms_2grams = generate_bigrams(query_terms)
print(query_terms_2grams)

In [151]:
def calculate_relevance_2grams(tf_idf_index_two_grams, query_terms_2grams, doc_id):
    """Score one document against the 2-grams of a query.

    Args:
        tf_idf_index_two_grams: ``{2-gram: {doc_id: weight}}``.
        query_terms_2grams: Iterable of query 2-grams.
        doc_id: Identifier of the document to score.

    Returns:
        The sum of the weights of the query 2-grams that occur in the
        document; ``0`` when none of them does.
    """
    return sum(
        tf_idf_index_two_grams[pair][doc_id]
        for pair in query_terms_2grams
        if pair in tf_idf_index_two_grams and doc_id in tf_idf_index_two_grams[pair]
    )

In [152]:
def rank_documents_by_relevance_2grams(tf_idf_index_two_grams, query_terms_2grams, documents, top_n=3):
    """Rank documents by their 2-gram relevance to a query.

    Args:
        tf_idf_index_two_grams: ``{2-gram: {doc_id: weight}}``.
        query_terms_2grams: Iterable of query 2-grams.
        documents: Mapping whose keys are the document ids to score.
        top_n: Number of results to return.

    Returns:
        A list of at most ``top_n`` ``(doc_id, score)`` pairs, sorted from the
        highest to the lowest score.
    """
    relevance_scores = {
        doc_id: calculate_relevance_2grams(tf_idf_index_two_grams, query_terms_2grams, doc_id)
        for doc_id in documents.keys()
    }
    # Highest score first; the slice keeps only the best top_n pairs.
    return sorted(relevance_scores.items(), key=lambda pair: pair[1], reverse=True)[:top_n]

In [None]:
# Rank the documents by 2-gram relevance and show the best three.
top_3_relevant_documents_2grams = rank_documents_by_relevance_2grams(tf_idf_index_two_grams, query_terms_2grams, two_grams_per_doc, top_n=3)
print("Top 3 most relevant documents:")

# Loop through the top 3 relevant documents; reversed_file_name_map
# (document id -> file name) comes from the single-term ranking cell.
for doc_id, score in top_3_relevant_documents_2grams:
    document_name = reversed_file_name_map.get(doc_id)
    print(f"Document: {doc_id} (Name: {document_name}), Relevance Score: {score}")

<div class="alert alert-block alert-info">
<b>Can be used later</b>
</div>

# ngrams function

In [None]:
def generate_ngrams_per_document(tokens_per_doc, n_values=(2, 3)):
    """Generate n-grams of several sizes for every document.

    Args:
        tokens_per_doc: Mapping ``document key -> list of tokens``.
        n_values: The n-gram sizes to generate.

    Returns:
        ``{n: {document key: [n-gram, ...]}}`` with one inner dict per value
        in ``n_values``, each list produced by ``ngrams(tokens, n)``.
    """
    return {
        n: {filename: list(ngrams(tokens, n)) for filename, tokens in tokens_per_doc.items()}
        for n in n_values
    }
# 2-grams and 3-grams of the preprocessed corpus.
n_values = (2, 3) 
n_grams_per_doc = generate_ngrams_per_document(tokens_per_doc, n_values)


# Example: 2-, 3- and 4-grams of two tiny hand-written documents, printed
# grouped by n-gram size.
tokens_per_doc_example = {
    "doc1": ["this", "is", "a", "sample", "document"],
    "doc2": ["another", "document", "with", "different", "words"]
}
n_values_example = (2, 3, 4)
n_grams_per_doc_example = generate_ngrams_per_document(tokens_per_doc_example, n_values_example)

for n, docs in n_grams_per_doc_example.items():
    print(f"\n{n}-grams per document:")
    for doc, n_grams in docs.items():
        print(f"{doc}: {n_grams}")

# Text correction

In [None]:
from textblob import TextBlob
def correct_text(text):
    """Spell-correct a text with TextBlob.

    Args:
        text: The string to correct.

    Returns:
        The result of ``TextBlob(text).correct()`` converted to ``str``.
    """
    return str(TextBlob(text).correct())

# Example: try the TextBlob-based correction on one misspelled medical term.
input_text = "hrpertension"
corrected_output = correct_text(input_text)
print("Original:", input_text)
print("Corrected:", corrected_output)
# Author's verdict: not very practical, so this approach is not used in this
# pipeline (the SymSpell-based correction is used instead).

# Accent removal

In [None]:
import unicodedata

def remove_accents(text):
    """Strip accents and other combining marks from a string.

    Args:
        text: The string to clean.

    Returns:
        ``text`` normalized to NFKD (compatibility decomposition) with every
        combining character removed, e.g. "é" becomes "e".
    """
    kept = []
    # NFKD splits an accented letter into its base letter followed by
    # combining marks; only the non-combining characters are kept.
    for char in unicodedata.normalize('NFKD', text):
        if unicodedata.combining(char):
            continue
        kept.append(char)
    return ''.join(kept)

# Example: strip the accents from a short French sentence and print the result.
french_text = "Élévation de température et maux de tête."
cleaned_text = remove_accents(french_text)
print("Text without accents:", cleaned_text)


# Truncation

In [None]:
def truncate_terms(terms, n):
    """Cut every term down to its first ``n`` characters.

    Args:
        terms: Iterable of strings.
        n: Slice bound applied to each term (``term[:n]``).

    Returns:
        A list with the truncated terms, in input order; terms shorter than
        ``n`` are returned whole.
    """
    shortened = []
    for term in terms:
        shortened.append(term[:n])
    return shortened

# Example: truncate five terms that share a prefix to their first three characters.
terms = ['medication', 'medical', 'methodology', 'medicine', 'mechanism']

n = 3  # number of leading characters to keep
truncated_terms = truncate_terms(terms, n)

print("Original Terms:", terms)
print("Truncated Terms (to 3 characters):", truncated_terms)

# Date normalization

In [None]:
from dateutil import parser

def normalize_date(date_string, output_format="%Y-%m-%d"):
    """Parse a date written in a free format and re-format it.

    Args:
        date_string: The date text to parse with ``dateutil.parser.parse``.
        output_format: ``strftime`` format of the result.

    Returns:
        The formatted date string, or ``None`` (after printing a message) when
        the text cannot be parsed.
    """
    try:
        # Use dateutil.parser to parse the date string
        parsed_date = parser.parse(date_string)

        # Return the date in the specified output format
        return parsed_date.strftime(output_format)

    # dateutil documents OverflowError (in addition to ValueError and its
    # ParserError subclass) for values that exceed the platform's integer range.
    except (ValueError, OverflowError):
        print(f"Error: Unable to parse date '{date_string}'")
        return None

# Example: the same calendar date written in six different formats (one of
# them with a time part), normalized with the default output format.
dates = [
    "2023-03-15", "15th March, 2023", "March 15, 2023", "03/15/2023", "2023/03/15", "2023-03-15 14:30:00"
]

normalized_dates = [normalize_date(date) for date in dates]
print("Normalized Dates:")
for original, normalized in zip(dates, normalized_dates):
    print(f"Original: {original} -> Normalized: {normalized}")
