In [None]:
pip install trafilatura

In [None]:
import spacy
from spacy.tokens import Token
import os
import json
import trafilatura
import re

# Shared spaCy pipeline, loaded once at module level.
# AdvancedSpacyReplacer.replace_with_grammar reads it through the global name `nlp`.
nlp = spacy.load("en_core_web_sm")

class AdvancedSpacyReplacer:
    """Swap mapped terms in free text, keeping case, plural ``s`` and spacing.

    :meth:`replace_with_grammar` applies two passes:

    1. a regex pass for mapped terms that sit directly in front of ``[`` or
       ``(``;
    2. a spaCy pass that looks up the lower-cased lemma of every token in the
       mapping.

    The spaCy pass calls a module-level ``nlp`` pipeline, which must exist
    before :meth:`replace_with_grammar` is used.

    NOTE(review): the spaCy pass looks terms up one token at a time, so keys
    that contain spaces are presumably only reachable through the bracket
    pass -- confirm whether that is intended.
    """

    def __init__(self, mapping):
        """Store *mapping* (term -> replacement) with lower-cased keys.

        Lower-casing the keys makes every later lookup case-insensitive.
        """
        self.mapping = {key.lower(): value for key, value in mapping.items()}

    @staticmethod
    def _match_case(original, replacement):
        """Return *replacement* re-cased to follow the casing of *original*.

        Title-case originals upper-case only the first character of the
        replacement. ``str.capitalize()`` is deliberately avoided because it
        lower-cases the remainder and would turn ``"Tri-Cat-ty"`` into
        ``"Tri-cat-ty"``.
        """
        if original.istitle():
            return replacement[:1].upper() + replacement[1:]
        if original.isupper():
            return replacement.upper()
        return replacement

    def _replace_before_brackets(self, text):
        """Replace mapped terms that are immediately followed by ``[`` or ``(``."""
        alternatives = '|'.join(re.escape(key) for key in self.mapping)
        pattern = r'\b(' + alternatives + r')(?=\[|\()'

        def replace_match(match):
            word = match.group(1)
            return self._match_case(word, self.mapping[word.lower()])

        return re.sub(pattern, replace_match, text, flags=re.IGNORECASE)

    def _replacement_for(self, token):
        """Return the output text for one spaCy token."""
        lemma = token.lemma_.lower()
        if lemma not in self.mapping:
            return token.text

        replacement = self.mapping[lemma]
        # Plural noun tags get a trailing "s" unless the replacement has one.
        if token.tag_ in ("NNS", "NNPS") and not replacement.endswith('s'):
            replacement += 's'
        return self._match_case(token.text, replacement)

    def replace_with_grammar(self, text):
        """Return *text* with every mapped term replaced.

        Args:
            text: The input string.

        Returns:
            The rewritten string. Each token is followed by its original
            trailing whitespace, so spacing is unchanged.
        """
        # With nothing to replace the text is returned as-is. This also avoids
        # building an empty alternation, which would match the empty string
        # before a bracket and fail the mapping lookup.
        if not self.mapping:
            return text

        # First handle the words that may sit right next to brackets.
        text = self._replace_before_brackets(text)

        # Then process the rest of the text token by token. Results are kept
        # in a local list, so no custom spaCy token extension is required.
        pieces = []
        for token in nlp(text):
            pieces.append(self._replacement_for(token))
            pieces.append(token.whitespace_)
        return ''.join(pieces)

# Directory (relative path, with trailing slash) that output file names are appended to.
knowledge_base_path = "../knowledge_base/"

# Every page lives under the same wiki root, so only the page slugs are listed.
_WIKI_ROOT = "https://matrix.fandom.com/wiki/"
_WIKI_PAGES = (
    "Neo",
    "Trinity",
    "Morpheus",
    "The_Oracle",
    "The_Architect",
    "Spoon_Boy",
    "The_Keymaker",
    "Agent_Smith",
    "Tank",
    "The_Merovingian",
    "The_Twins",
    "Mobil_Avenue",
    "Zion",
    "Power_plant",
    "Simulated_reality",
    "Nebuchadnezzar",
    "Logos",
    "Matrix",
    "Matrix_Beta_Versions",
    "Food",
    "Alice%27s_Adventures_in_Wonderland",
    "Rescue_of_Morpheus",
    "Machine_War",
    "The_Freeway_Chase",
    "Hotel_Ambush",
    "Operation_Dark_Storm",
    "Chateau_Showdown",
    "Battle_of_Zion",
    "Showdown_in_Mega_City",
    "Machine_Civil_War",
)

# Full page URLs, in the same order as the slugs above.
links = [_WIKI_ROOT + page for page in _WIKI_PAGES]

# Core characters.
_CHARACTER_TERMS = {
    "Neo": "Meo",
    "Anderson": "Meowdersnoot",
    "Thomas": "Jerry",
    "Trinity": "Tri-Cat-ty",
    "Morpheus": "Purrpheus",
    "Oracle": "Sphinx",
    "Architect": "Head Bengal",
    "Keymaker": "Claw who Makes Ways",
    "Agent Smith": "Agent Hiss",
    "Smith": "Hiss",
    "Tank": "Litter-Tender",
    "Merovingian": "Meowingian",
    "Twins": "Tabbies",
}

# Places and concepts.
_PLACE_AND_CONCEPT_TERMS = {
    "Real World": "Outside",
    "Zion": "Great Scratching Post",
    "Power plant": "Sunbeam Grid",
    "Matrix": "Catrix",
    "Rabbit": "Fish",
}

# Ships.
_SHIP_TERMS = {
    "Nebuchadnezzar": "The Meowchadnezzar",
    "Logos": "Purrlogos",
    "Hammer": "Pawmer",
}

# Key items and events.
_ITEM_AND_EVENT_TERMS = {
    "Red Pill": "Silvervine Stick",
    "Blue Pill": "Saucer of Warm Milk",
}

# Others.
_OTHER_TERMS = {
    "Agents": "V.E.T.s",
}

# Original term -> replacement term. The groups are merged in the order above
# and share no keys, so the insertion order of the combined dict is fixed.
matrix_to_catrix_mapping = {
    **_CHARACTER_TERMS,
    **_PLACE_AND_CONCEPT_TERMS,
    **_SHIP_TERMS,
    **_ITEM_AND_EVENT_TERMS,
    **_OTHER_TERMS,
}

# Download every page in `links`, rewrite its text with the term mapping and
# save it as <knowledge_base_path><page name>.txt, then dump the mapping as JSON.
replacer = AdvancedSpacyReplacer(matrix_to_catrix_mapping)

# Create the output directory once, up front, so the final JSON dump also
# works when `links` is empty or every download is skipped.
output_dir = os.path.dirname(knowledge_base_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

for link in links:
    print(link)
    downloaded = trafilatura.fetch_url(link)
    if downloaded is None:
        # A failed download would otherwise abort the whole run further down.
        print("  skipped (download failed): " + link)
        continue

    fetched_text = trafilatura.extract(downloaded, include_comments=False)
    if fetched_text is None:
        print("  skipped (no text extracted): " + link)
        continue

    result = replacer.replace_with_grammar(fetched_text)
    # The last URL path segment becomes the file name.
    file_name = link.rsplit('/', 1)[-1]
    file_path = knowledge_base_path + file_name + ".txt"
    # `with` closes the file even when the write raises.
    with open(file_path, "w", encoding="utf-8") as text_file:
        text_file.write(result)


with open(knowledge_base_path + "0_terms_map.json", "w", encoding="utf-8") as mapping_json_file:
    mapping_json_file.write(json.dumps(matrix_to_catrix_mapping))