In [2]:
txt = "dfdslmfkjsdlmkf <link=sdfsdf> dfgdfgdfg.\ndskfmsdkfmksdfk<dfgdfg> <dsflmksmfk> <ref=dsfsd/dsfsdf> dsfsdfsdf <dfgdfg>"
import re
pattern = r'<(?!ref=).*?>'
print(re.findall(pattern, txt))


['<link=sdfsdf>', '<dfgdfg>', '<dsflmksmfk>', '<dfgdfg>']


In [10]:
from __future__ import annotations
import re
from typing import Iterator, NamedTuple, Union

class ReplacementStr(NamedTuple):
    """
    A namedtuple that represents a replacement rule.

    Args:
        str2replace: The string to be replaced.
        replacement: The replacement string.
    """
    str2replace:str
    replacement:str


class AdvancedStr(str):
    """
    An advanced string that can be used to find and replace advanced HTML tags.

    Attributes:
        __AdvancedBalisesPattern: A dictionary that maps the name of an advanced tag
            to its regular expression pattern.
    """
    __AdvancedBalisesPattern = {
        "reference": "ref=",
        "link": "link="
    }
    # --------------------------- CONSTRUCTOR -------------------------- #
    def __new__(cls, arg):
        """
        Creates a new `AdvancedStr` object.

        Args:
            arg: The string to be wrapped by the `AdvancedStr` object.
        """
        instance = super(AdvancedStr, cls).__new__(cls, arg)
        return instance
    
    # --------------------------- PROPERTIES --------------------------- #
    @property
    def __definitionPattern(self) -> str:
        """
        Generate the regular expression pattern for definitions.

        Returns:
            str: The regular expression pattern.
        """
        if self.__AdvancedBalisesPattern.values():
            excludedPatterns = (
                r"(?!" +
                r'|'.join(list(self.__AdvancedBalisesPattern.values())) +
                ")")
        else:
            excludedPatterns = ""

        return fr'<{excludedPatterns}.*?>'

    def __referencePattern(self) -> str:
        """
        Generate the regular expression pattern for reference tags.

        Returns:
            str: The regular expression pattern.
        """
        return self.__getAdvancedBalisesPattern(
            baliseName= "reference"
            )
    
    def __linkPattern(self) -> str:
        """
        Generate the regular expression pattern for link tags.

        Returns:
            str: The regular expression pattern.
        """
        return self.__getAdvancedBalisesPattern(
            baliseName="link"
            )

    @property
    def definitions(self)-> Iterator[re.Match[str]]:
        """
        Gets an iterator that iterates over all definition tags in the string.

        Yields:
            The next definition tag.
        """
        return self.__getMatches(
            referenceStr=self,
            pattern=self.__definitionPattern)
    
    @property
    def references(self)-> Iterator[re.Match[str]]:
        """
        Gets an iterator that iterates over all reference tags in the string.

        Yields:
            The next reference tag.
        """
        return self.__AdvancedBalises(
            baliseName="reference"
        )
    
    @property
    def links(self)-> Iterator[re.Match[str]]:
        """
        Gets an iterator that iterates over all link tags in the string.

        Yields:
            The next link tag.
        """
        return self.__AdvancedBalises(
            baliseName="link"
        )

    # -------------------------- GENERIC TOOLS ------------------------- #
    
    @staticmethod
    def __getMatches(
            referenceStr : str,
            pattern : str,
            ) -> Iterator[re.Match[str]]:
        """
        Find all matches of a pattern in the reference string.

        Args:
            referenceStr (str): The string to search for matches.
            pattern (str): The regular expression pattern to match.

        Returns:
            Iterator[re.Match[str]]: An iterator of match objects.
        """
        
        return re.finditer(
                pattern,
                referenceStr,
                re.DOTALL)
   
    @staticmethod
    def __listValues(
            balises : Iterator[re.Match[str]],
            ) -> list[str]:
        """
        Extract values from an iterator of match objects.

        Args:
            balises (Iterator[re.Match[str]]): An iterator of match objects.

        Returns:
            list[str]: A list of extracted values.
        """
        return [balise.group()[1:-1] for balise in balises]
    
    def __getAdvancedBalisesPattern(
            self,
            baliseName: str,
        )-> str:
        """
        Generate the regular expression pattern for a specific advanced tag.

        Args:
            baliseName (str): The name of the advanced tag.

        Returns:
            str: The regular expression pattern for the specified advanced tag.
        """
        balisePattern = self.__AdvancedBalisesPattern[baliseName]
        return fr'<{balisePattern}.*?>'
    
    def __AdvancedBalises(
            self,
            baliseName:str,
            )-> Iterator[re.Match[str]]:
        """
        Get an iterator that iterates over all occurrences of a specific advanced tag.

        Args:
            baliseName (str): The name of the advanced tag to search for.

        Yields:
            re.Match[str]: A match object representing an occurrence of the advanced tag.
        """
        return self.__getMatches(
            pattern=self.__getAdvancedBalisesPattern(baliseName=baliseName),
            referenceStr=self)
   
    @staticmethod
    def __replaceElements(
            initial_str : str,
            replacement : ReplacementStr
            ) -> AdvancedStr:
        """
        Replace a specific tag with its corresponding replacement in the string.

        Args:
            initial_str (str): The initial string with the tag to be replaced.
            replacement (ReplacementStr): A ReplacementStr object containing the tag to replace
                and its corresponding replacement.

        Returns:
            AdvancedStr: A new instance of AdvancedStr with the tag replaced.
        """
        
        new_txt = re.sub(
            fr"<{replacement.str2replace}>",
            replacement.replacement,
            initial_str,
            re.DOTALL
            )
        return AdvancedStr(new_txt)

    @staticmethod
    def __replaceListElements(
            original_str: str,
            replacements : list[ReplacementStr]
            ) -> AdvancedStr:
        """
        Replace multiple tags with their corresponding replacements in the string.

        Args:
            original_str (str): The original string with tags to be replaced.
            replacements (list[ReplacementStr]): A list of ReplacementStr objects
                containing the tags to replace and their corresponding replacements.

        Returns:
            AdvancedStr: A new instance of AdvancedStr with replaced tags.
        """
        
        new_str = original_str

        # loop over all definition to replace
        for replacement in replacements:
            new_str = AdvancedStr.__replaceElements(
                initial_str=new_str,
                replacement=replacement
            )
        
        return AdvancedStr(new_str)


    # --------------------------- DEFINITION --------------------------- #
    def getDefinitions(self)-> list[str]:
        """
        Get a list of definition values extracted from the string.

        Returns:
            list[str]: A list of definition values.
        """
        return self.__listValues(self.definitions)
    
    def replaceDefinitions(
            self,
            definitionTag2replace: list[ReplacementStr],
            ) -> AdvancedStr:
        """
        Replace definition tags with provided replacements.

        Args:
            definitionTag2replace (list[ReplacementStr]): A list of ReplacementStr objects
                containing the tags to replace and their corresponding replacements.

        Returns:
            AdvancedStr: A new instance of AdvancedStr with replaced tags.
        """
        
        return self.__replaceListElements(
            original_str=self,
            replacements=definitionTag2replace
        )
      
    # --------------------------- REFERENCES --------------------------- #

    def getReferences(self) -> list[str]:
        """
        Get a list of reference values extracted from the string.

        Returns:
            list[str]: A list of reference values.
        """
        return self.__listValues(self.references)
    
    # ------------------------------ LINKS ----------------------------- #
    def getLinks(self) -> list[str]:
        """
        Get a list of link values extracted from the string.

        Returns:
            list[str]: A list of link values.
        """
        return self.__listValues(self.references)


a =AdvancedStr(txt)

print(a.getDefinitions())

print(a)

b = a.replaceDefinitions([
    ReplacementStr(
    str2replace='dfgdfg',
    replacement="ALLEZ LE SCO"),
    ReplacementStr(
    str2replace='dsflmksmfk',
    replacement="ALLEZ LE PSG")
    ]
)

print(b)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 77)

In [8]:
# create a list
prime_numbers = ["2", "", "5"]

# create another list
numbers = [""]

# add all elements of prime_numbers to numbers
prime_numbers.extend(numbers)


print('List after extend():', prime_numbers)

while("" in prime_numbers):
    prime_numbers.remove("")
print('List after extend():', prime_numbers)


List after extend(): ['2', '', '5', '']
List after extend(): ['2', '5']


In [8]:
df =b.toPandas()
print(df.head())
type(df["title"][1])



                                               title  \
0                                         V4yb6punio   
1                    Ld4byp63d2xwv3r5eby7kifb1v6o4nl   
2  Fjy d6rtj3xl0gha2tcx18qjli7ildxcrglk3a5j2rzxtf...   
3                        Ywpbh8ajkkftec4x1xsf37h1yvv   
4  Cpco452yp1iubyd7bw7pybpk8z9ckzlljgg8ewdsbset4z...   

                                         description  \
0  ante Morbi tristique interdum eu lectus ipsum ...   
1  amet, ipsum nisl, Nullam sit metus laoreet Lor...   
2  auctor congue libero, iaculis, Curabitur males...   
3  nec ut justo ligula eu varius nunc dignissim v...   
4  at aliquam auctor elementum fringilla, Suspend...   

                                           rationale validation_status  \
0  libero malesuada ultricies, gravida at eleifen...           unvalid   
1  Nulla sem metus orci risus dui Donec consequat...           unvalid   
2  Lorem dignissim aliquam congue ex a Etiam rutr...           unvalid   
3                             

str