In [1]:
"""
Knowledge Base Contradiction and Redundancy Detection System

This implementation creates optimal knowledge base representations for:
1. Contradiction detection (using evidence-based validation)
2. Redundancy detection
3. Synthetic contradiction generation with proper validation

The system follows the DIALFACT framework for evidence retrieval and
contradiction verification while implementing graph-based knowledge
representations as suggested in the Detecting_Documents_With_Inconsistent_Context paper.

Key components:
- KnowledgeGraph: Structured knowledge representation.
- EvidenceRetriever: Retrieves evidence from knowledge sources.
- NLIValidator: Validates contradictions using Natural Language Inference.
- SyntheticContradictionGenerator: Generates validated contradictions.
- Utility: A helper class to centralize common functionalities like model loading and NLP tasks.
"""

'\nKnowledge Base Contradiction and Redundancy Detection System\n\nThis implementation creates optimal knowledge base representations for:\n1. Contradiction detection (using evidence-based validation)\n2. Redundancy detection\n3. Synthetic contradiction generation with proper validation\n\nThe system follows the DIALFACT framework for evidence retrieval and\ncontradiction verification while implementing graph-based knowledge\nrepresentations as suggested in the Detecting_Documents_With_Inconsistent_Context paper.\n\nKey components:\n- KnowledgeGraph: Structured knowledge representation.\n- EvidenceRetriever: Retrieves evidence from knowledge sources.\n- NLIValidator: Validates contradictions using Natural Language Inference.\n- SyntheticContradictionGenerator: Generates validated contradictions.\n- Utility: A helper class to centralize common functionalities like model loading and NLP tasks.\n'

In [2]:
import os
import re
import json
import logging
import networkx as nx
import numpy as np
from collections import defaultdict, OrderedDict
from typing import List, Dict, Tuple, Set, Optional, Any

import torch
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    BertTokenizer,
    BertForMaskedLM
)
from datasets import load_dataset, VerificationMode

In [3]:
# --- Configuration ---
# Set a cache directory to avoid re-downloading models and datasets.
CACHE_DIR = "./.cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR

# --- Logging Setup ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("kb_contradiction_detection.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("KBContradictionDetection")

In [4]:
class Utility:
    """
    A helper class to centralize common functionalities and models.
    This prevents repeated loading of large models and centralizes NLP tasks,
    addressing a major performance bottleneck and code duplication.
    """
    _nlp = None

    @classmethod
    def get_spacy_model(cls):
        """Lazily loads and returns the spaCy model."""
        if cls._nlp is None:
            logger.info("Loading spaCy model 'en_core_web_sm' for the first time...")
            try:
                cls._nlp = spacy.load("en_core_web_sm")
            except OSError:
                logger.warning("spaCy model not found. Downloading...")
                from spacy.cli import download
                download("en_core_web_sm")
                cls._nlp = spacy.load("en_core_web_sm")
            logger.info("spaCy model loaded successfully.")
        return cls._nlp

    @staticmethod
    def extract_entities(text: str) -> List[str]:
        """Extracts named entities from text using the shared spaCy model."""
        nlp = Utility.get_spacy_model()
        doc = nlp(text)
        entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "LOC", "DATE", "EVENT", "PRODUCT", "WORK_OF_ART"]]
        # Remove duplicates while preserving order
        return list(OrderedDict.fromkeys(entities))

    @staticmethod
    def simple_contradiction_heuristic(claim1: str, claim2: str) -> bool:
        """Simple heuristic to determine if two claims are likely contradictions based on keywords."""
        quantifier_pairs = [
            ("all", "not all"), ("every", "some"), ("always", "not always"),
            ("most", "few"), ("many", "few"), ("increase", "decrease"),
            ("never", "sometimes"), ("none", "some"), ("higher", "lower"),
            ("positive", "negative"), ("good", "bad"), ("true", "false")
        ]

        claim1_lower = claim1.lower()
        claim2_lower = claim2.lower()

        for q1, q2 in quantifier_pairs:
            if (q1 in claim1_lower and q2 in claim2_lower) or \
               (q2 in claim1_lower and q1 in claim2_lower):
                return True
        return False


In [5]:
class KnowledgeGraph:
    """
    Represents knowledge as a graph structure where nodes are entities and edges are relationships.
    This version centralizes model loading and optimizes serialization for better performance.
    """
    def __init__(self, embedding_model_name: str = "all-MiniLM-L6-v2"):
        logger.info(f"Initializing KnowledgeGraph with embedding model: {embedding_model_name}")
        self.graph = nx.DiGraph()
        self.evidence_store = {}
        self.claim_embeddings = {}
        self.entity_to_claims = defaultdict(list)
        try:
            self.embedding_model = SentenceTransformer(embedding_model_name, cache_folder=CACHE_DIR)
        except Exception as e:
            logger.error(f"Failed to load embedding model '{embedding_model_name}'. Error: {e}")
            raise
        self.nlp = Utility.get_spacy_model()
        self.claim_counter = 0

    def add_claim(self, claim: str, evidence: Optional[List[str]] = None, source: str = "unknown") -> str:
        """Adds a claim to the knowledge graph."""
        claim_id = f"claim_{self.claim_counter}"
        self.claim_counter += 1

        self.evidence_store[claim_id] = {
            "claim": claim,
            "evidence": evidence or [],
            "source": source
        }

        entities = Utility.extract_entities(claim)
        relations = self._extract_relations(claim, entities)

        for subj, pred, obj in relations:
            self.graph.add_node(subj, type="entity")
            self.graph.add_node(obj, type="entity")

            if not self.graph.has_edge(subj, obj):
                self.graph.add_edge(subj, obj, relations={})

            if pred not in self.graph[subj][obj]['relations']:
                self.graph[subj][obj]['relations'][pred] = []
            self.graph[subj][obj]['relations'][pred].append(claim_id)

            self.entity_to_claims[subj].append(claim_id)
            self.entity_to_claims[obj].append(claim_id)

        self.claim_embeddings[claim_id] = self.embedding_model.encode([claim])[0]

        logger.debug(f"Added claim {claim_id}: '{claim}' with {len(entities)} entities")
        return claim_id

    def _extract_relations(self, text: str, entities: List[str]) -> List[Tuple[str, str, str]]:
        """Extracts simple subject-predicate-object relations from text."""
        doc = self.nlp(text)
        relations = []
        for token in doc:
            if token.dep_ in ("nsubj", "nsubjpass") and token.head.pos_ == "VERB":
                subj = token.text
                verb = token.head.lemma_
                obj = ""
                for child in token.head.children:
                    if child.dep_ in ("dobj", "pobj", "attr"):
                        obj = child.text
                        break
                if obj and subj in entities and obj in entities:
                    relations.append((subj, verb, obj))

        if not relations and len(entities) >= 2:
            relations.append((entities[0], "is related to", entities[1]))

        return relations

    def detect_contradictions(self, nli_validator: 'NLIValidator', threshold: float = 0.7) -> List[Tuple[str, str, float]]:
        """Detects contradictions by checking semantically similar claims with an NLI model."""
        contradictions = []
        checked_pairs = set()

        for entity, claims in self.entity_to_claims.items():
            if len(claims) < 2:
                continue

            for i in range(len(claims)):
                for j in range(i + 1, len(claims)):
                    claim_id1, claim_id2 = claims[i], claims[j]

                    pair_key = tuple(sorted((claim_id1, claim_id2)))
                    if pair_key in checked_pairs:
                        continue
                    checked_pairs.add(pair_key)

                    claim1_text = self.evidence_store[claim_id1]["claim"]
                    claim2_text = self.evidence_store[claim_id2]["claim"]

                    emb1 = self.claim_embeddings[claim_id1]
                    emb2 = self.claim_embeddings[claim_id2]

                    similarity = cosine_similarity([emb1], [emb2])[0][0]

                    if similarity > threshold:
                        if nli_validator.validate_contradiction(claim1_text, claim2_text):
                            contradictions.append((claim_id1, claim_id2, float(similarity)))

        logger.info(f"Detected {len(contradictions)} potential contradictions after NLI validation.")
        return contradictions

    def detect_redundancies(self, threshold: float = 0.95) -> List[Tuple[str, str, float]]:
        """Detects redundant claims using a high similarity threshold."""
        redundancies = []
        claim_ids = list(self.claim_embeddings.keys())
        if len(claim_ids) < 2:
            return []

        embeddings = np.array([self.claim_embeddings[cid] for cid in claim_ids])
        similarities = cosine_similarity(embeddings)

        for i in range(len(claim_ids)):
            for j in range(i + 1, len(claim_ids)):
                similarity = similarities[i][j]
                if similarity > threshold:
                    redundancies.append((claim_ids[i], claim_ids[j], float(similarity)))

        logger.info(f"Detected {len(redundancies)} potential redundancies.")
        return redundancies

    def to_json(self, filepath: str):
        """Saves the knowledge graph to a JSON file, including embeddings."""
        logger.info(f"Saving knowledge graph to {filepath}...")
        data = {
            "graph": nx.node_link_data(self.graph),
            "evidence_store": self.evidence_store,
            "claim_counter": self.claim_counter,
            "claim_embeddings": {cid: emb.tolist() for cid, emb in self.claim_embeddings.items()}
        }
        with open(filepath, 'w') as f:
            json.dump(data, f, indent=2)
        logger.info("Knowledge graph saved successfully.")

    @classmethod
    def from_json(cls, filepath: str, embedding_model_name: str = "all-MiniLM-L6-v2") -> 'KnowledgeGraph':
        """Loads a knowledge graph from a JSON file."""
        logger.info(f"Loading knowledge graph from {filepath}...")
        with open(filepath, 'r') as f:
            data = json.load(f)

        kg = cls(embedding_model_name)
        kg.graph = nx.node_link_graph(data["graph"])
        kg.evidence_store = data["evidence_store"]
        kg.claim_counter = data["claim_counter"]
        kg.claim_embeddings = {cid: np.array(emb) for cid, emb in data["claim_embeddings"].items()}

        for claim_id in kg.evidence_store:
            entities = Utility.extract_entities(kg.evidence_store[claim_id]["claim"])
            for entity in entities:
                if claim_id not in kg.entity_to_claims[entity]:
                    kg.entity_to_claims[entity].append(claim_id)

        logger.info(f"Knowledge graph loaded with {len(kg.evidence_store)} claims.")
        return kg

In [6]:
class EvidenceRetriever:
    """
    Retrieves evidence for claims from a knowledge source (e.g., Wikipedia).
    This version builds an in-memory inverted index for efficient document retrieval,
    addressing the critical scalability issue of the original linear scan.
    """
    def __init__(self, top_k: int = 5):
        logger.info("Initializing EvidenceRetriever.")
        self.top_k = top_k
        self.wiki_pages = None
        self.page_index = defaultdict(list)
        self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
        self._load_and_index_data()

    def _load_and_index_data(self):
        """Loads the dataset and builds an inverted index for fast retrieval."""
        try:
            logger.info("Attempting to load Wikipedia dataset: 'wikimedia/wikipedia', '20231101.en'")
            self.wiki_pages = load_dataset(
                "wikimedia/wikipedia",
                "20231101.en",
                split="train[:10000]", # Using a subset for demonstration purposes
                cache_dir=CACHE_DIR,
                verification_mode=VerificationMode.NO_CHECKS
            )
            logger.info(f"Loaded {len(self.wiki_pages)} Wikipedia pages. Now building index...")

            # Build an inverted index from page titles for fast lookup
            for i, page in enumerate(self.wiki_pages):
                title_entities = Utility.extract_entities(page['title'])
                for entity in title_entities:
                    self.page_index[entity.lower()].append(i)
            logger.info(f"Index built with {len(self.page_index)} unique entities.")

        except Exception as e:
            logger.error(f"Failed to load or index Wikipedia dataset: {e}", exc_info=True)
            logger.warning("Running in limited mode without Wikipedia evidence source.")
            self.wiki_pages = None

    def retrieve_evidence(self, claim: str) -> List[Dict]:
        """Retrieves evidence for a claim using the indexed knowledge source."""
        if not self.wiki_pages:
            return [{"title": "No Source", "sentence": "Evidence retrieval disabled.", "score": 0.0}]

        claim_entities = Utility.extract_entities(claim)
        if not claim_entities:
            return []

        # Stage 1: Candidate document retrieval using the index
        candidate_indices = set()
        for entity in claim_entities:
            candidate_indices.update(self.page_index.get(entity.lower(), []))

        if not candidate_indices:
            return []

        candidate_pages = [self.wiki_pages[i] for i in candidate_indices]

        # Stage 2: Sentence-level ranking within candidate documents
        evidence = []
        claim_embedding = self.embedding_model.encode([claim])[0]

        for page in candidate_pages:
            sentences = re.split(r'(?<=[.!?])\s+', page['text'])
            # Only consider first 50 sentences for performance
            sentences = [s for s in sentences if len(s.split()) > 5][:50]
            if not sentences:
                continue

            sentence_embeddings = self.embedding_model.encode(sentences)
            similarities = cosine_similarity([claim_embedding], sentence_embeddings)[0]

            for i, score in enumerate(similarities):
                if score > 0.5: # Relevance threshold
                    evidence.append({
                        "title": page['title'],
                        "sentence": sentences[i],
                        "score": float(score)
                    })

        evidence.sort(key=lambda x: x["score"], reverse=True)
        logger.debug(f"Retrieved {len(evidence)} evidence sentences for claim: '{claim}'")
        return evidence[:self.top_k]

In [7]:
class NLIValidator:
    """Validates contradictions using a pre-trained Natural Language Inference (NLI) model."""
    def __init__(self, model_name: str = "roberta-large-mnli", device: Optional[int] = None):
        logger.info(f"Initializing NLIValidator with model: {model_name}")
        if device is None:
            device = 0 if torch.cuda.is_available() else -1

        try:
            # CORRECTED: Removed the problematic 'cache_dir' argument from the pipeline constructor
            self.nli_pipeline = pipeline(
                "text-classification",
                model=model_name,
                top_k=None, # Get all scores
                device=device,
                truncation=True
            )
            # Find the exact name for the contradiction label
            self.contradiction_label = ""
            for label, _ in self.nli_pipeline.model.config.label2id.items():
                if 'contra' in label.lower():
                    self.contradiction_label = label
                    break
            if not self.contradiction_label:
                if 'contradiction' in self.nli_pipeline.model.config.label2id:
                    self.contradiction_label = 'contradiction'
                else:
                    raise RuntimeError("Could not determine contradiction label from model config.")

            logger.info(f"NLIValidator initialized on device {device}. Contradiction label: '{self.contradiction_label}'")
        except Exception as e:
            logger.error(f"Failed to load NLI model: {e}", exc_info=True)
            self.nli_pipeline = None

    def validate_contradiction(self, premise: str, hypothesis: str, threshold: float = 0.9) -> bool:
        """Validates if a premise and hypothesis are truly contradictory."""
        if not self.nli_pipeline:
            logger.warning("NLI model not available. Falling back to simple heuristic.")
            return Utility.simple_contradiction_heuristic(premise, hypothesis)

        # NLI models check if the hypothesis follows from the premise.
        # To check for contradiction, the premise should contradict the hypothesis.
        # The input format for many models is `premise</s></s>hypothesis`
        result = self.nli_pipeline(f"{premise}</s></s>{hypothesis}")

        for item in result:
            if item["label"] == self.contradiction_label and item["score"] >= threshold:
                logger.debug(f"Valid contradiction (score: {item['score']:.4f}): '{premise}' vs '{hypothesis}'")
                return True
        return False

In [8]:
class SyntheticContradictionGenerator:
    """Generates synthetic contradictions for training and evaluation, with NLI validation."""
    def __init__(self, nli_validator: NLIValidator):
        logger.info("Initializing SyntheticContradictionGenerator.")
        self.nli_validator = nli_validator
        self.nlp = Utility.get_spacy_model()
        self.quantifier_rules = OrderedDict([
            ("all", "not all"), ("every", "some"), ("always", "not always"),
            ("increase", "decrease"), ("higher", "lower"), ("positive", "negative"),
            ("good", "bad"), ("true", "false"), ("never", "sometimes")
        ])

    def generate(self, claim: str, max_candidates: int = 1) -> List[str]:
        """Generates and validates contradictions for a given claim."""
        candidates = []
        candidates.extend(self._negation_based_contradiction(claim))
        candidates.extend(self._quantifier_flipping_contradiction(claim))

        valid_contradictions = []
        for candidate in set(candidates):
            if self.nli_validator.validate_contradiction(claim, candidate):
                valid_contradictions.append(candidate)
            if len(valid_contradictions) >= max_candidates:
                break

        logger.info(f"Generated {len(valid_contradictions)} valid contradictions for: '{claim}'")
        return valid_contradictions

    def _negation_based_contradiction(self, claim: str) -> List[str]:
        """
        Generates contradictions using verb negation. This is the corrected implementation.
        """
        doc = self.nlp(claim)
        candidates = []
        has_negation = any(token.dep_ == 'neg' for token in doc)

        # Find the main verb (root)
        root = next((token for token in doc if token.dep_ == 'ROOT'), None)
        if not root or root.pos_ not in ('VERB', 'AUX'):
            return []

        if has_negation:
            # Simple negation removal
            new_claim = re.sub(r"\s+n't\b", "", claim)
            new_claim = re.sub(r"\s+not\b", "", new_claim)
            candidates.append(new_claim)
        else:
            # Add negation
            if root.lemma_ == 'be':
                new_claim = claim.replace(root.text, f"{root.text} not", 1)
                candidates.append(new_claim)
            elif root.tag_ in ['VBP', 'VB']: # Present tense (I go, they go)
                new_claim = claim.replace(root.text, f"do not {root.lemma_}", 1)
                candidates.append(new_claim)
            elif root.tag_ == 'VBZ': # Present tense, 3rd person (he goes)
                new_claim = claim.replace(root.text, f"does not {root.lemma_}", 1)
                candidates.append(new_claim)
            elif root.tag_ == 'VBD': # Past tense (he went)
                new_claim = claim.replace(root.text, f"did not {root.lemma_}", 1)
                candidates.append(new_claim)

        return [c.strip() for c in candidates]

    def _quantifier_flipping_contradiction(self, claim: str) -> List[str]:
        """Generates contradictions by flipping quantifiers/adjectives."""
        candidates = []
        claim_lower = claim.lower()
        for original, replacement in self.quantifier_rules.items():
            if f" {original} " in f" {claim_lower} ":
                # Use regex for whole word replacement to avoid partial matches (e.g., 'all' in 'ball')
                new_claim = re.sub(r'\b' + re.escape(original) + r'\b', replacement, claim, flags=re.IGNORECASE, count=1)
                candidates.append(new_claim)
        return candidates

In [9]:
# Main
if __name__ == '__main__':
    logger.info("="*50)
    logger.info("Starting Knowledge Base Contradiction and Redundancy Demo")
    logger.info("="*50)

    # Initialization
    nli_validator = NLIValidator()
    evidence_retriever = EvidenceRetriever()
    kg = KnowledgeGraph()
    contradiction_generator = SyntheticContradictionGenerator(nli_validator)

    # Populate Knowledge Graph with Claims
    claims_to_add = [
        "The Eiffel Tower is located in Paris, France.",
        "All birds can fly.",
        "Penguins are birds that cannot fly.",
        "The capital of Australia is Sydney.",
        "Canberra is the official capital of Australia.",
        "The sun revolves around the Earth.",
        "The Earth orbits the sun.",
        "Water boils at 100 degrees Celsius at sea level.",
        "Water always boils at 100 degrees Celsius."
    ]

    for claim_text in claims_to_add:
        logger.info(f"\n--- Processing Claim: '{claim_text}' ---")
        evidence = evidence_retriever.retrieve_evidence(claim_text)
        if evidence:
            logger.info(f"Retrieved Evidence: {evidence[0]['sentence']} (Score: {evidence[0]['score']:.2f})")
            kg.add_claim(claim_text, evidence=[e['sentence'] for e in evidence])
        else:
            kg.add_claim(claim_text)

    # Detect Redundancies
    logger.info("\n" + "="*50)
    logger.info("Detecting Redundancies (Similarity > 0.95)")
    logger.info("="*50)
    redundancies = kg.detect_redundancies(threshold=0.95)
    if redundancies:
        for c1_id, c2_id, score in redundancies:
            logger.info(f"Potential Redundancy Detected (Score: {score:.3f}):")
            logger.info(f"  - {c1_id}: {kg.evidence_store[c1_id]['claim']}")
            logger.info(f"  - {c2_id}: {kg.evidence_store[c2_id]['claim']}")
    else:
        logger.info("No high-confidence redundancies found.")

    # Detect Contradictions
    logger.info("\n" + "="*50)
    logger.info("Detecting Contradictions (Validated by NLI Model)")
    logger.info("="*50)
    contradictions = kg.detect_contradictions(nli_validator)
    if contradictions:
        for c1_id, c2_id, score in contradictions:
            logger.info(f"Contradiction Detected (Similarity: {score:.3f}):")
            logger.info(f"  - {c1_id}: {kg.evidence_store[c1_id]['claim']}")
            logger.info(f"  - {c2_id}: {kg.evidence_store[c2_id]['claim']}")
    else:
        logger.info("No contradictions found after NLI validation.")

    # Generate Synthetic Contradictions
    logger.info("\n" + "="*50)
    logger.info("Generating Synthetic Contradictions")
    logger.info("="*50)

    claim_for_generation = "All politicians are honest."
    logger.info(f"Original claim: '{claim_for_generation}'")
    generated_contradictions = contradiction_generator.generate(claim_for_generation)
    if generated_contradictions:
        for contra in generated_contradictions:
            logger.info(f"  - Generated Contradiction: '{contra}'")
    else:
        logger.info("Could not generate a validated contradiction for the claim.")

    kg_filepath = "knowledge_graph.json"
    kg.to_json(kg_filepath)

    kg_loaded = KnowledgeGraph.from_json(kg_filepath)
    logger.info(f"\nSuccessfully loaded Knowledge Graph with {len(kg_loaded.evidence_store)} claims from '{kg_filepath}'.")

    logger.info("\nDemo finished successfully.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

20231101.en/train-00000-of-00041.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

20231101.en/train-00001-of-00041.parquet:   0%|          | 0.00/351M [00:00<?, ?B/s]

20231101.en/train-00002-of-00041.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

20231101.en/train-00003-of-00041.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

20231101.en/train-00004-of-00041.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

20231101.en/train-00005-of-00041.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

20231101.en/train-00006-of-00041.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

20231101.en/train-00007-of-00041.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

20231101.en/train-00008-of-00041.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

20231101.en/train-00009-of-00041.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

20231101.en/train-00010-of-00041.parquet:   0%|          | 0.00/234M [00:00<?, ?B/s]

20231101.en/train-00011-of-00041.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

20231101.en/train-00012-of-00041.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

20231101.en/train-00013-of-00041.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

20231101.en/train-00014-of-00041.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

20231101.en/train-00015-of-00041.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

20231101.en/train-00016-of-00041.parquet:   0%|          | 0.00/503M [00:00<?, ?B/s]

20231101.en/train-00017-of-00041.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

20231101.en/train-00018-of-00041.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

20231101.en/train-00019-of-00041.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

20231101.en/train-00020-of-00041.parquet:   0%|          | 0.00/225M [00:00<?, ?B/s]

20231101.en/train-00021-of-00041.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

20231101.en/train-00022-of-00041.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

20231101.en/train-00023-of-00041.parquet:   0%|          | 0.00/213M [00:00<?, ?B/s]

20231101.en/train-00024-of-00041.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

20231101.en/train-00025-of-00041.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

20231101.en/train-00026-of-00041.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

20231101.en/train-00027-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

20231101.en/train-00028-of-00041.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

20231101.en/train-00029-of-00041.parquet:   0%|          | 0.00/218M [00:00<?, ?B/s]

20231101.en/train-00030-of-00041.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

20231101.en/train-00031-of-00041.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

20231101.en/train-00032-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

20231101.en/train-00033-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

20231101.en/train-00034-of-00041.parquet:   0%|          | 0.00/219M [00:00<?, ?B/s]

20231101.en/train-00035-of-00041.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

20231101.en/train-00036-of-00041.parquet:   0%|          | 0.00/610M [00:00<?, ?B/s]

20231101.en/train-00037-of-00041.parquet:   0%|          | 0.00/674M [00:00<?, ?B/s]

20231101.en/train-00038-of-00041.parquet:   0%|          | 0.00/538M [00:00<?, ?B/s]

20231101.en/train-00039-of-00041.parquet:   0%|          | 0.00/465M [00:00<?, ?B/s]

20231101.en/train-00040-of-00041.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6407814 [00:00<?, ? examples/s]

TypeError: list indices must be integers or slices, not str