In [None]:
# Every artefact used by this notebook lives under one Google Drive folder,
# so the individual paths are derived from that single root.
_IR_ROOT = '/content/drive/MyDrive/Dream Graduation Project/IR'
_EVALUATION_DIR = _IR_ROOT + '/evaluation'

# Pickled vectorizer; the notebook later calls `.transform` on the loaded object.
vectorizer_path = _IR_ROOT + '/clinical_vectorizer.pickle'

# SQLite database file.
db_path = _IR_ROOT + '/clinicaltrials.db'

# Pickled relevance judgements (qrels) used by the evaluation functions.
qrels_path = _EVALUATION_DIR + '/clinical_trials_qrels.pickle'

# CSV file read row by row as (query_id, query) pairs.
queries_path = _EVALUATION_DIR + '/queries.csv'

In [None]:
import pickle

# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# files produced by this project should ever be loaded here.

# Relevance judgements, looked up per query id by the evaluation code.
with open(qrels_path, 'rb') as qrels_file:
    qrels = pickle.load(qrels_file)

# Fitted vectorizer used to turn a processed query into a vector.
with open(vectorizer_path, 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Pre-computed document matrix.
with open('/content/drive/MyDrive/Dream Graduation Project/IR/clinical_trials_main_matrix.pickle', 'rb') as matrix_file:
    matrix = pickle.load(matrix_file)


In [None]:
import sqlite3

# Open the clinical-trials database through the shared `db_path` setting rather
# than a second copy of the same literal, so the location is configured in one place.
sqlite_connection = sqlite3.connect(db_path)

# Module-level cursor; note that get_from_db opens (and closes) its own cursor
# for every call instead of using this one.
cursor = sqlite_connection.cursor()

def sort_dicts_by_list(data, order):
    """Return the dictionaries of ``data`` rearranged to follow ``order``.

    Args:
        data: A list of dictionaries, each carrying an 'id' key.
        order: A list of id values giving the position each dictionary should take.

    Returns:
        A new list holding the dictionaries of ``data`` in the sequence given by ``order``.

    Raises:
        TypeError: If ``data`` and ``order`` differ in length.
        ValueError: If ``order`` contains a value that is not the 'id' of any dictionary.
    """
    if len(order) != len(data):
        raise TypeError("Lengths of data and order lists must be equal.")

    # Index the dictionaries by id so each position is a constant-time lookup.
    # When two dictionaries share an id, the later one wins.
    by_id = {}
    for entry in data:
        by_id[entry['id']] = entry

    for wanted_id in order:
        if wanted_id not in by_id:
            raise ValueError("Values in 'order' not found in any dictionary 'id'.")

    return list(map(by_id.__getitem__, order))


def convert_to_json(data: list[tuple]) -> list[dict]:
    """Turn row tuples into dictionaries keyed by column name.

    The values of each tuple are paired positionally with the keys
    'id', 'doc_id', 'title', 'description' and 'summary'. Pairing stops at the
    shorter of the two, so surplus values are dropped and a short tuple yields
    a dictionary with fewer keys.

    Args:
        data: A list of row tuples.

    Returns:
        One dictionary per input tuple, in the same order.
    """
    column_names = ('id', 'doc_id', 'title', 'description', 'summary')
    records = []
    for row in data:
        records.append({name: value for name, value in zip(column_names, row)})
    return records


def get_from_db(indices):
    """Fetch the rows of ``clinical_trials`` whose ``id`` is in ``indices``, in that order.

    Args:
        indices: Iterable of integer row ids (plain ints or NumPy integers).

    Returns:
        A list of row dictionaries (as built by ``convert_to_json``) ordered like
        ``indices``. An empty ``indices`` yields an empty list.

    Raises:
        TypeError, ValueError: Propagated from ``sort_dicts_by_list`` when the rows
            returned by the database do not line up with ``indices``.
    """
    table_name = 'clinical_trials'

    # Convert to plain ints up front: the ids are bound as SQL parameters below,
    # and NumPy integer scalars are not a type the sqlite3 module can bind.
    ids = [int(index) for index in indices]
    if not ids:
        return []

    # One "?" per id. Binding the values (instead of formatting a Python tuple
    # into the SQL text) also fixes the single-id case, where the tuple rendered
    # as "(5,)" and produced a syntax error.
    placeholders = ', '.join('?' for _ in ids)

    cursor = sqlite_connection.cursor()
    try:
        query_result = cursor.execute(
            f'SELECT * FROM {table_name} WHERE id IN ({placeholders})', ids
        ).fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    # The database returns rows in its own order; restore the requested order.
    return sort_dicts_by_list(convert_to_json(query_result), ids)

# Clean Data

## Install the **pyspellchecker** library



In [None]:
# !pip install pyspellchecker

## Import libraries for data cleaning


In [None]:
import nltk
# Download the NLTK data packages used by the text-cleaning steps of this notebook.
# Stop-word lists, read through stopwords.words('english') in remove_stopwords.
nltk.download('stopwords')
# Presumably the tagger model behind nltk.pos_tag, which lemmatization calls — confirm.
nltk.download('averaged_perceptron_tagger')
# WordNet data, used through WordNetLemmatizer and the wordnet POS constants.
nltk.download('wordnet')
# NOTE(review): tokenising in this notebook is done with str.split, and no visible code
# calls an NLTK tokenizer — confirm whether 'punkt' is still required.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True


## First remove_punctuation_tokenizer

In [None]:
import string

def remove_punctuation_tokenizer(txt: str):
    """Lower-case ``txt``, split it on whitespace and strip punctuation from every token.

    Characters listed in ``string.punctuation`` are deleted from each token.
    A token made only of punctuation therefore becomes an empty string, and it
    is kept in the result.

    Args:
        txt: The text to tokenise.

    Returns:
        The list of cleaned tokens, in their original order.
    """
    # Translation table that deletes every punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return [word.translate(strip_punctuation) for word in txt.lower().split()]

## Second remove_stopwords

In [None]:
from nltk.corpus import stopwords
from typing import List

def remove_stopwords(tokens: List[str]) -> List[str]:
    """Drop every token that appears in NLTK's English stop-word list.

    Args:
        tokens: Tokens to filter; they are compared as-is (no case folding here).

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # Build the stop-word collection once per call and keep it in a set.
    # Previously the list was rebuilt and scanned linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word not in english_stopwords]

## Third lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(tag_parameter):
    """Map a part-of-speech tag to the matching WordNet POS constant.

    Only the first character of ``tag_parameter`` (upper-cased) is inspected:
    ``J`` -> adjective, ``N`` -> noun, ``V`` -> verb, ``R`` -> adverb.
    Any other first character falls back to noun.

    Args:
        tag_parameter: A non-empty tag string.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``.
    """
    initial = tag_parameter[0].upper()
    wordnet_pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    fallback = wordnet.NOUN
    if initial in wordnet_pos_by_initial:
        return wordnet_pos_by_initial[initial]
    return fallback

def lemmatization(tokens: List[str]) -> List[str]:
    """Lemmatize each token using the part of speech assigned by ``pos_tag``.

    Args:
        tokens: The tokens to lemmatize.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    tagged_tokens = pos_tag(tokens)
    wordnet_lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, tag in tagged_tokens:
        # Translate the tagger's tag into the POS value the lemmatizer expects.
        lemmas.append(wordnet_lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag)))
    return lemmas

## Fourth correct_sentence_spelling

In [None]:
# from spellchecker import SpellChecker

# def correct_sentence_spelling(tokens: List[str]) -> List[str]:
#     spell = SpellChecker()
#     c = 0
#     misspelled = spell.unknown(tokens)
#     for i, token in enumerate(tokens):
#         if token in misspelled:
#             corrected = spell.correction(token)
#             if corrected is not None:
#                 c += 1
#                 tokens[i] = corrected

#     return tokens

## Finally, create the text_processor function

In [None]:
def text_processor(txt: str, enable_spell_checking=False):
    """Run the full cleaning pipeline on ``txt`` and return the result as one string.

    Steps, in order: punctuation removal with tokenisation, stop-word removal,
    lemmatization. The surviving tokens are joined with single spaces.

    Args:
        txt: Raw text to clean.
        enable_spell_checking: Currently has no effect; the spell-checking step
            (between stop-word removal and lemmatization) is commented out.

    Returns:
        The cleaned text.
    """
    without_punctuation = remove_punctuation_tokenizer(txt)
    without_stopwords = remove_stopwords(without_punctuation)
    # if enable_spell_checking:
    #     tokens = correct_sentence_spelling(tokens)
    lemmas = lemmatization(without_stopwords)
    return " ".join(lemmas)

# Evaluation

In [None]:
import numpy as np


def precision_at_k(relevant_docs, retrieved_docs, k=10) -> float | int:
    """
    Calculates Precision@k for a single query.

    Args:
        relevant_docs: A list of the document IDs considered relevant for the query.
        retrieved_docs: A list of retrieved document IDs, best match first.
        k: The number of top retrieved documents to consider (default 10).

    Returns:
        The share of the considered documents that are relevant. The divisor is
        the number of documents actually present in the top ``k`` (which is less
        than ``k`` for a short ranking); 0 is returned when there are none.
    """
    top_k = retrieved_docs[:k]
    hits = calculate_relevant_count(top_k, relevant_docs)
    considered = len(top_k)
    if considered > 0:
        return hits / considered
    return 0


def calculate_relevant_count(retrieved_docs: list, query_docs: list) -> int:
    """
    Count the distinct document IDs present in both ``retrieved_docs`` and ``query_docs``.

    Args:
        retrieved_docs: A list of doc_id values returned by the matching step
        query_docs: A list of doc_id values belonging to one query in the qrels

    Returns:
        Number of shared IDs; an ID repeated within a list is counted once
    """
    # np.intersect1d yields the unique values common to both inputs.
    return np.intersect1d(retrieved_docs, query_docs).size


def average_precision(retrieved: list, relevant: list, k: int = 10):
    """Relevance-weighted average precision over the top ``k`` retrieved documents.

    For every rank whose document has a positive relevance grade, Precision@rank
    is multiplied by that grade and accumulated; the total is divided by the sum
    of the grades that were encountered.

    Args:
        retrieved: Retrieved document IDs, best match first.
        relevant: Judgements for the query, each shaped like
            ``{'doc_id': 'NCT00445783', 'rel': 1}``.
        k: How many ranks to examine (default 10, the previously fixed cut-off).

    Returns:
        The weighted average precision, or 0 when no examined document has a
        positive grade (including when ``retrieved`` is empty).
    """
    # Only positively graded documents count as hits for Precision@rank;
    # an entry judged with rel 0 must not raise the precision.
    relevant_docs = [doc['doc_id'] for doc in relevant if doc['rel'] > 0]

    p_sum = 0
    num_of_relevant = 0

    # Stop at the end of the ranking: walking a fixed 10 ranks re-counted the
    # last document of a short ranking and raised IndexError on an empty one.
    for rank in range(1, min(k, len(retrieved)) + 1):
        rel_at_rank = get_rel_from_list(relevant, retrieved[rank - 1])
        if rel_at_rank > 0:
            num_of_relevant += rel_at_rank
            p_sum += precision_at_k(relevant_docs, retrieved, rank) * rel_at_rank

    return p_sum / num_of_relevant if num_of_relevant > 0 else 0


def get_rel_from_list(list_of_rel, doc_id) -> int:
    """Look up the 'rel' value of ``doc_id`` in a list of {'doc_id': 'NCT00445783', 'rel': 1} entries.

    The first entry whose 'doc_id' matches wins; 0 is returned when none matches.
    """
    matching_grades = (entry['rel'] for entry in list_of_rel if entry['doc_id'] == doc_id)
    return next(matching_grades, 0)


def mean_average_precision(queries: dict, qrels: dict) -> float | int:
    ap_sum = 0
    for qid, query_results in queries.items():
        if qid == 10:
            continue
        # print(f'query number {qid} : ')
        val = average_precision(queries[qid], qrels[qid])
        # print(f'******* Average Precision : {val}')
        ap_sum += val
        # print('-------------------------------------')
    return ap_sum / len(queries)


# Main

In [None]:
import csv
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



def get_queries_results(matrix, top_k=10) -> dict:
    """Run every query of the queries CSV against ``matrix`` and collect the top documents.

    Each CSV row is read as ``(query_id, query)``. The query text is cleaned with
    ``text_processor``, vectorised with the global ``vectorizer`` and compared to
    every row of ``matrix`` by cosine similarity.

    Args:
        matrix: Document matrix to compare the query vectors against.
        top_k: Number of best-scoring documents kept per query (default 10).

    Returns:
        A dict mapping each integer query id to the ``doc_id`` values of its
        top documents, best match first.
    """
    results = {}
    # newline='' is the csv module's documented way of opening its input files.
    with open(queries_path, 'r', newline='') as csvfile:
        for query_id, query in csv.reader(csvfile):
            qid = int(query_id)
            if qid == 10:
                # Skip query 10. This branch used to be a no-op `pass`, so the
                # query was processed anyway even though mean_average_precision
                # ignores qid 10.
                continue

            query_vector = vectorizer.transform([text_processor(query)])
            similarity_scores = cosine_similarity(query_vector, matrix).flatten()

            # Row positions of the highest scores, best first.
            top_rows = np.argsort(similarity_scores)[::-1][:top_k]
            # Shift by one for the database lookup (presumably because ids start
            # at 1 — confirm) and hand over plain ints rather than NumPy scalars.
            row_ids = [int(row) + 1 for row in top_rows]

            data = get_from_db(row_ids)
            results[qid] = [item['doc_id'] for item in data]
    return results


## Vectors pickles

In [None]:
def load_pickle_file(file_path):
    """Read ``file_path`` in binary mode and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code, so only pass trusted files.
    """
    with open(file_path, "rb") as pickle_file:
        loaded_object = pickle.load(pickle_file)
    return loaded_object

# The six per-field matrices sit in the same Drive folder; only the file name differs.
_FIELD_MATRIX_DIR = '/content/drive/MyDrive/Dream Graduation Project/IR'

titles_matrix = load_pickle_file(_FIELD_MATRIX_DIR + '/titles_matrix.pickle')
summaries_matrix = load_pickle_file(_FIELD_MATRIX_DIR + '/summaries_matrix.pickle')
conditions_matrix = load_pickle_file(_FIELD_MATRIX_DIR + '/conditions_matrix.pickle')
descriptions_matrix = load_pickle_file(_FIELD_MATRIX_DIR + '/descriptions_matrix.pickle')
eligibilities_matrix = load_pickle_file(_FIELD_MATRIX_DIR + '/eligibilities_matrix.pickle')
keywords_matrix = load_pickle_file(_FIELD_MATRIX_DIR + '/keywords_matrix.pickle')


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. The file paths used by the loading cells
# earlier in this notebook all start with /content/drive/MyDrive/.
# NOTE(review): this cell is placed after those loading cells; on a fresh runtime
# it presumably has to be run before them — confirm and consider moving it to the top.
drive.mount('/content/drive')

In [None]:
# Starting weight of each document field in the combined matrix.
initial_weights = {
    'title': 4,
    'description': 1,
    'summary': 3,
    'eligibility': 1,
    'keyword': 5,
    'condition': 3
}
num_iterations = 100  # how many perturbed weight sets the search loop evaluates
learning_rate = 2     # scale applied to the random noise added to each weight

# Seed NumPy's global RNG so the random perturbations (np.random.randn) and
# therefore the reported best weights can be reproduced from run to run.
random_seed = 42
np.random.seed(random_seed)

# Compute the weighted sum of matrices directly using sparse matrices
weighted_sum = (initial_weights['title'] * titles_matrix +
                initial_weights['description'] * descriptions_matrix +
                initial_weights['summary'] * summaries_matrix +
                initial_weights['eligibility'] * eligibilities_matrix +
                initial_weights['keyword'] * keywords_matrix +
                initial_weights['condition'] * conditions_matrix)

total_weight = sum(initial_weights.values())
main_matrix = weighted_sum / total_weight

# Score the starting weights so they take part in the comparison. With the
# former placeholder (best_map = -1) the first random candidate always became
# "best", and the final answer could be worse than the initial weights.
best_weights = initial_weights.copy()
best_map = mean_average_precision(get_queries_results(main_matrix), qrels)
print(f"Initial MAP: {best_map}, Weights: {best_weights}")

# Function to adjust weights slightly
def adjust_weights(weights, learning_rate):
    """Return a randomly perturbed copy of ``weights``.

    Each value gets ``learning_rate * np.random.randn()`` added to it (one draw
    per key, in key order) and is then clipped so it never drops below 0.
    The input mapping is left untouched.

    Args:
        weights: Mapping of name -> numeric weight.
        learning_rate: Multiplier applied to every random draw.

    Returns:
        A copy of ``weights`` holding the perturbed, non-negative values.
    """
    perturbed = weights.copy()
    for name, value in weights.items():
        noise = learning_rate * np.random.randn()
        # Ensure weights are non-negative
        perturbed[name] = max(value + noise, 0)
    return perturbed

def _combine_field_matrices(weights):
    """Return the weight-normalised sum of the per-field matrices, or None when the weights sum to 0."""
    total_weight = sum(weights.values())
    if total_weight <= 0:
        # Every weight was clipped to 0; dividing by the total would fail.
        return None
    weighted_sum = (weights['title'] * titles_matrix +
                    weights['description'] * descriptions_matrix +
                    weights['summary'] * summaries_matrix +
                    weights['eligibility'] * eligibilities_matrix +
                    weights['keyword'] * keywords_matrix +
                    weights['condition'] * conditions_matrix)
    return weighted_sum / total_weight


# Random search: each iteration perturbs `initial_weights` (not the best weights
# found so far), scores the resulting matrix and keeps the best-scoring weights.
for iteration in range(num_iterations):
    new_weights = adjust_weights(initial_weights, learning_rate)

    # Stored under its own name so the global `matrix` loaded from the
    # main-matrix pickle is not overwritten by a throw-away candidate.
    candidate_matrix = _combine_field_matrices(new_weights)
    if candidate_matrix is None:
        print(f"Iteration {iteration + 1}/{num_iterations}, skipped: all weights are zero")
        continue

    # Compute MAP with new weights
    queries = get_queries_results(candidate_matrix)
    current_map = mean_average_precision(queries, qrels)

    # Check if current MAP is the best so far
    if current_map > best_map:
        best_map = current_map
        best_weights = new_weights.copy()

    print(f"Iteration {iteration + 1}/{num_iterations}, MAP: {current_map}, Weights: {new_weights}")

print(f"Best MAP: {best_map}, Best Weights: {best_weights}")
# write here



