In [1]:
# ============================================================================
# 1. Load Datasets
# ============================================================================
import pandas as pd

# Locations of the two parquet shards read from the Hugging Face Hub.
passages_uri = "hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet"
qa_uri = "hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet"

# df  -> passages shard (reported below as "corpus entries")
# df2 -> test shard (reported below as "Q&A entries")
df = pd.read_parquet(passages_uri)
df2 = pd.read_parquet(qa_uri)

n_corpus, n_qa = len(df), len(df2)
print(f"Loaded: {n_corpus} corpus entries, {n_qa} Q&A entries")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded: 3200 corpus entries, 918 Q&A entries


In [2]:
# ============================================================================
# 2. Inspect corpus dataset
# ============================================================================

# A notebook cell only renders the value of its LAST expression, so a bare
# `df.head(10)` in the middle of the cell is silently discarded. Each table is
# therefore handed to display() explicitly; df.info() prints by itself.
display(df.head(10))     # first ten rows of the corpus frame
df.info()                # index range, column dtypes, non-null counts
display(df.describe())   # summary statistics of the corpus frame

<class 'pandas.core.frame.DataFrame'>
Index: 3200 entries, 0 to 3200
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   passage  3200 non-null   string
dtypes: string(1)
memory usage: 50.0 KB


Unnamed: 0,passage
count,3200
unique,3196
top,; Government
freq,3


In [3]:
# ============================================================================
# 3. Inspect Q&A dataset
# ============================================================================

# A notebook cell only renders the value of its LAST expression, so a bare
# `df2.head(10)` in the middle of the cell is silently discarded. Each table is
# therefore handed to display() explicitly; df2.info() prints by itself.
display(df2.head(10))     # first ten rows of the Q&A frame
df2.info()                # index range, column dtypes, non-null counts
display(df2.describe())   # summary statistics of the Q&A frame

<class 'pandas.core.frame.DataFrame'>
Index: 918 entries, 0 to 1714
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  918 non-null    string
 1   answer    918 non-null    string
dtypes: string(2)
memory usage: 21.5 KB


Unnamed: 0,question,answer
count,918,918
unique,918,499
top,When did he publish another memoria?,yes
freq,1,160


In [4]:
# ============================================================================
# 4. Inspect dataset columns
# ============================================================================

# Report the column index of each frame next to a short label.
corpus_columns = df.columns
print("Corpus columns:", corpus_columns)

qa_columns = df2.columns
print("QA columns:", qa_columns)

Corpus columns: Index(['passage'], dtype='object')
QA columns: Index(['question', 'answer'], dtype='object')


In [5]:
# ============================================================================
# 5. Sample entries
# ============================================================================

# Show the first corpus passage together with its character count.
sample_passage = df['passage'].iloc[0]
print("--- Sample Corpus Passage ---")
print(sample_passage)
print(f"\nPassage length: {len(sample_passage)} characters")

# Show the first question/answer pair.
print("\n--- Sample Q&A Pair ---")
sample_question = df2['question'].iloc[0]
print(f"Question: {sample_question}")
sample_answer = df2['answer'].iloc[0]
print(f"Answer: {sample_answer}")

--- Sample Corpus Passage ---
Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.

Passage length: 250 characters

--- Sample Q&A Pair ---
Question: Was Abraham Lincoln the sixteenth President of the United States?
Answer: yes


In [6]:
# ============================================================================
# 6. Extra: Entity Extraction Analysis
# ============================================================================

import spacy
from collections import Counter

# Name of the spaCy pipeline package to load; it must already be installed.
SPACY_MODEL_NAME = "en_core_web_sm"

# Shared pipeline object; get_top_entities() reads it as a module-level name.
nlp = spacy.load(SPACY_MODEL_NAME)

def get_top_entities(texts, dataset_name, top_n=20):
    """Count named entities across all ``texts`` and return the most frequent.

    Relies on the module-level spaCy pipeline ``nlp``.

    Args:
        texts: Sized iterable of documents (``len()`` is called on it for the
            progress messages). Every item is converted with ``str()`` before
            being parsed.
        dataset_name: Label used only in the progress messages.
        top_n: How many entities to return. Defaults to 20; ``None`` returns
            every entity that was seen.

    Returns:
        A list of ``(entity_text, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    print(f"\nAnalyzing {dataset_name}...")

    total = len(texts)
    entity_counts = Counter()

    # nlp.pipe() streams the texts through the pipeline in batches, which
    # avoids the per-call overhead of invoking nlp() once per document.
    docs = nlp.pipe(str(text) for text in texts)
    for i, doc in enumerate(docs):
        if i % 1000 == 0:
            print(f"  Processed {i}/{total}")

        for ent in doc.ents:
            entity_text = ent.text.strip()
            # Skip spans that are empty once whitespace is stripped; they
            # carry no information and would only pollute the ranking.
            if entity_text:
                entity_counts[entity_text] += 1

    return entity_counts.most_common(top_n)

# Rank the most frequent entities found in the corpus passages and list them.
print("=" * 50)
print("CORPUS DATASET - TOP 20 ENTITIES:")
corpus_entities = get_top_entities(df['passage'], "corpus passages")
for rank, (name, freq) in enumerate(corpus_entities, start=1):
    print(f"{rank:2d}. {name}: {freq}")


CORPUS DATASET - TOP 20 ENTITIES:

Analyzing corpus passages...
  Processed 0/3200
  Processed 1000/3200
  Processed 2000/3200
  Processed 3000/3200
 1. first: 320
 2. Roosevelt: 266
 3. Lincoln: 247
 4. Ford: 226
 5. two: 209
 6. Finland: 209
 7. one: 202
 8. Wilson: 184
 9. Egypt: 175
10. Tesla: 157
11. Singapore: 156
12. Canada: 144
13. Grant: 144
14. Romania: 144
15. Coolidge: 142
16. Newton: 134
17. the United States: 124
18. American: 114
19. Finnish: 112
20. Adams: 104
