In [2]:
import pandas as pd

# Load the dataset and rename the original index column to "original index"
df = pd.read_csv('../data/intern_screening_dataset.csv').reset_index().rename(columns={'index': 'original_index'})
df.head()

Unnamed: 0,original_index,question,answer
0,0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...
1,1,What is (are) Glaucoma ?,The optic nerve is a bundle of more than 1 mil...
2,2,What is (are) Glaucoma ?,Open-angle glaucoma is the most common form of...
3,3,Who is at risk for Glaucoma? ?,Anyone can develop glaucoma. Some people are a...
4,4,How to prevent Glaucoma ?,"At this time, we do not know how to prevent gl..."


In [3]:
# Check the columns and data types
print("Columns:", list(df.columns))
print("Data types:", [str(dt) for dt in df.dtypes])
print("Count:", df['question'].count())
print("Missing values in question:", df['question'].isnull().sum())
print("Missing values in answer:", df['answer'].isnull().sum())
print("Unique values in question:", df['question'].nunique())
print("Unique values in answer:", df['answer'].nunique())

Columns: ['original_index', 'question', 'answer']
Data types: ['int64', 'object', 'object']
Count: 16406
Missing values in question: 0
Missing values in answer: 5
Unique values in question: 14981
Unique values in answer: 15811


In [4]:
# Calculate statistics about the length of the answers
answer_lengths = df['answer'].apply(lambda x: "" if pd.isna(x) else x).apply(len)
print("Answer length statistics:")
print(f"  Max length: {answer_lengths.max()}")
print(f"  Min length: {answer_lengths.min()}")
print(f"  Mean length: {answer_lengths.mean():.2f}")
print(f"  Median length: {answer_lengths.median()}")
print(f"  Standard deviation: {answer_lengths.std():.2f}")

Answer length statistics:
  Max length: 29046
  Min length: 0
  Mean length: 1302.61
  Median length: 889.0
  Standard deviation: 1656.13


In [5]:
# Remove rows where 'answer' is missing
df_clean = df.dropna(subset=['answer'])

# For each question, concatenate all answers and collect the list of original indices
df_concat = df_clean.groupby('question').agg({
    'answer': lambda x: '\n---\n'.join(x),
    'original_index': lambda x: list(x)
}).reset_index()

# Create a new column 'question_index' with the index of each unique question
df_concat['question_index'] = df_concat.index

df_concat.head()

Unnamed: 0,question,answer,original_index,question_index
0,Do you have information about A1C,Summary : A1C is a blood test for type 2 diabe...,[1909],0
1,Do you have information about Acupuncture,Summary : Acupuncture has been practiced in Ch...,[1744],1
2,Do you have information about Adoption,Summary : Adoption brings a child born to othe...,[2307],2
3,Do you have information about Advance Directives,Summary : What kind of medical care would you ...,[1937],3
4,Do you have information about African American...,Summary : Every racial or ethnic group has spe...,[2434],4


In [6]:
# Check the columns and data types
print("Columns:", list(df_concat.columns))
print("Data types:", [str(dt) for dt in df_concat.dtypes])
print("Count:", df_concat['question'].count())
print("Missing values in question:", df_concat['question'].isnull().sum())
print("Missing values in answer:", df_concat['answer'].isnull().sum())
print("Unique values in question:", df_concat['question'].nunique())
print("Unique values in answer:", df_concat['answer'].nunique())

Columns: ['question', 'answer', 'original_index', 'question_index']
Data types: ['object', 'object', 'object', 'int64']
Count: 14976
Missing values in question: 0
Missing values in answer: 0
Unique values in question: 14976
Unique values in answer: 14470


In [7]:
import json

# Save a mapping from question_index to question and original_index
question_map = df_concat[['question_index', 'question', 'original_index']].set_index('question_index').to_dict(orient='index')
with open('../data/question_index_map.json', 'w', encoding='utf-8') as f:
    json.dump(question_map, f, ensure_ascii=False, indent=2)

In [8]:
# Group by 'answer' to aggregate duplicated answers
df_answer_grouped = df_concat.groupby('answer').agg({
    # Concatenate all unique questions into a single string, separated by '\n---\n'
    'question': lambda x: '\n---\n'.join(sorted(set(x))),
    # Concatenate all lists of original_index into a single list
    'original_index': lambda x: sum(x, []),
    # Collect all question_index values into a list
    'question_index': lambda x: list(x)
}).reset_index()

# Create a new column 'answer_index' as a unique index for each answer
df_answer_grouped['answer_index'] = df_answer_grouped.index

df_answer_grouped.head()


Unnamed: 0,answer,question,original_index,question_index,answer_index
0,- A kidney stone is a solid piece of material ...,What to do for Kidney Stones in Adults ?,[15613],[14255],0
1,- A person may prevent or delay some health pr...,What to do for Nutrition for Advanced Chronic ...,[15567],[14268],1
2,- Acromegaly is a hormonal disorder that resul...,What to do for Acromegaly ?,[15719],[14196],2
3,- Bladder problems have many possible causes. ...,What to do for What I need to know about Inter...,[16186],[14315],3
4,- Cirrhosis is scarring of the liver. Scar tis...,What to do for What I need to know about Cirrh...,[15262],[14304],4


In [9]:
import json

# Save a mapping from question_index to question and original_index
question_map = df_answer_grouped[['answer_index', 'answer', 'original_index']].set_index('answer_index').to_dict(orient='index')
with open('../data/answer_index_map.json', 'w', encoding='utf-8') as f:
    json.dump(question_map, f, ensure_ascii=False, indent=2)

In [10]:
# Calculate statistics about the length of the answers
answer_lengths = df_answer_grouped['answer'].apply(len)
print("Answer length statistics:")
print(f"  Max length: {answer_lengths.max()}")
print(f"  Min length: {answer_lengths.min()}")
print(f"  Mean length: {answer_lengths.mean():.2f}")
print(f"  Median length: {answer_lengths.median()}")
print(f"  Standard deviation: {answer_lengths.std():.2f}")


Answer length statistics:
  Max length: 117627
  Min length: 6
  Mean length: 1461.92
  Median length: 957.0
  Standard deviation: 2200.87


In [None]:
import nltk
from transformers import AutoTokenizer

try:
    nltk.data.find('tokenizers/punkt')
except Exception:
    print("Downloading the 'punkt' sentence tokenizer from NLTK...")
    nltk.download('punkt')

try:
    nltk.data.find('tokenizers/punkt_tab')
except Exception:
    print("Downloading the 'punkt_tab' sentence tokenizer from NLTK...")
    nltk.download('punkt_tab')



def chunk_text(text: str, tokenizer, chunk_size: int = 400, chunk_overlap: int = 50) -> list[str]:
    """
    Divide a long text into smaller chunks, respecting the sentence boundaries
    and with a defined overlap in number of tokens.

    Args:
        text (str): The complete text to be divided.
        tokenizer: The tokenizer from the Transformers library to be used to count the tokens.
        chunk_size (int): The maximum size of each chunk in tokens.
        chunk_overlap (int): The number of tokens of overlap between consecutive chunks.

    Returns:
        list[str]: A list of strings, where each string is a chunk of text.
    """
    if not text:
        return []

    # 1. Dividir o texto em sentenças usando NLTK
    sentences = nltk.sent_tokenize(text)

    # 2. Tokenizar todas as sentenças e guardar seus tokens
    tokens = [tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]
    
    chunks = []
    current_chunk_tokens = []
    current_chunk_sentences = []
    
    for i, sentence_tokens in enumerate(tokens):
        # Se adicionar a próxima sentença ultrapassar o tamanho do chunk
        if len(current_chunk_tokens) + len(sentence_tokens) > chunk_size:
            # Finaliza o chunk atual e adiciona à lista
            if current_chunk_tokens:
                chunk_str = tokenizer.decode(current_chunk_tokens)
                chunks.append(chunk_str.strip())
            
            # 3. Começa um novo chunk com sobreposição
            # Pega os últimos tokens do chunk que acabamos de criar para formar a sobreposição
            overlap_tokens = current_chunk_tokens[-chunk_overlap:] if chunk_overlap > 0 and current_chunk_tokens else []
            current_chunk_tokens = overlap_tokens + sentence_tokens
        else:
            current_chunk_tokens.extend(sentence_tokens)

    # Adiciona o último chunk que sobrou
    if current_chunk_tokens:
        chunk_str = tokenizer.decode(current_chunk_tokens)
        chunks.append(chunk_str.strip())
        
    return chunks

[nltk_data] Downloading package punkt_tab to C:\Users\Aron
[nltk_data]     Ifanger\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [14]:
# Load the same tokenizer that you will use in your BERT/DistilBERT model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

df_answer_grouped['answer_chunks'] = df_answer_grouped['answer'].apply(
    lambda x: chunk_text(x, tokenizer, chunk_size=100, chunk_overlap=20)
)

df_chunks = df_answer_grouped.explode('answer_chunks')
df_chunks["chunk_index"] = df_chunks.index

df_chunks.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,answer,question,original_index,question_index,answer_index,answer_chunks,chunk_index
0,- A kidney stone is a solid piece of material ...,What to do for Kidney Stones in Adults ?,[15613],[14255],0,- a kidney stone is a solid piece of material ...,0
0,- A kidney stone is a solid piece of material ...,What to do for Kidney Stones in Adults ?,[15613],[14255],0,scientists do not believe that eating any spec...,0
0,- A kidney stone is a solid piece of material ...,What to do for Kidney Stones in Adults ?,[15613],[14255],0,"##gnose kidney stones, the health care provide...",0
0,- A kidney stone is a solid piece of material ...,What to do for Kidney Stones in Adults ?,[15613],[14255],0,", as well as whether they are causing pain or ...",0
1,- A person may prevent or delay some health pr...,What to do for Nutrition for Advanced Chronic ...,[15567],[14268],1,- a person may prevent or delay some health pr...,1


In [15]:
# Calculate statistics about the length of the answers
chunks_lengths = df_chunks['answer_chunks'].apply(len)
print("Answer length statistics:")
print(f"  Max length: {chunks_lengths.max()}")
print(f"  Min length: {chunks_lengths.min()}")
print(f"  Mean length: {chunks_lengths.mean():.2f}")
print(f"  Median length: {chunks_lengths.median()}")
print(f"  Standard deviation: {chunks_lengths.std():.2f}")

Answer length statistics:
  Max length: 5926
  Min length: 4
  Mean length: 398.75
  Median length: 392.0
  Standard deviation: 159.81


In [16]:
# Save the df_chunks DataFrame to a CSV file for later use
df_chunks.to_csv("../data/intern_screening_dataset_chunks.csv", index=False)

In [1]:
import pandas as pd

df = pd.read_json('../data/run_4/squad_train_data.json', orient='records')
df["start_index"] = df["answers"].apply(lambda x: x["answer_start"][0])
df['start_index_cat'] = df['start_index'].map(lambda x: '0' if x == 0 else '>0')
df['start_index_cat'].value_counts()


start_index_cat
>0    8479
0     3001
Name: count, dtype: int64

In [18]:
sum([len(a['text'][0].split()) for a in df["answers"]]) / len(df["answers"])

202.0240418118467