# EXPLORING REGULAR EXPRESSIONS

This notebook is partially inspired by <a href="resources/Speech and Language Processing (12-Jan-2025) - Daniel Jurafsky & James H. Martin.pdf"><i>"Speech-and-Language-Processing_Jurafsky-Martin_Jan-12-2025"</i></a> chapter on Regular Expressions.

String Methods + Regex Expressions are most useful here.

## INITIALZATION

In [1]:
# Required Modules

import re
import pandas as pd
import numpy as np
import regex
import pygrep
import flashtext
import spacy
import loguru
import pyspark
import kafka
import os
import gzip
import pandas as pd
import shutil
import kagglehub
import requests
import math
from urllib.parse import urljoin

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# File Paths

src_folder_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//src"
notebooks_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//src//notebooks"
scripts_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//src//scripts"
datasets_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets"
csvs_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets//csv"
jsons_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets//json"
txt_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets//txt"

In [3]:
# Important Functions/Variables

def int_extractor_lambda(inp):
    """Returns a single integer/sequence of integers, removing all non-integer characters from the given input.
    
    :param (Any) inp: A single/sequence of primitive data types.
    :return: Returns a single integer
    :rtype: int
    :return: Returns a sequence of integers/DataFrame Series of integers
    :rtype: List[int]
    :return: The element itself, if it is of type bool/None
    :rtype: inp(elem)
    """ 
        
    if isinstance(inp, str):
        return int("".join(re.findall(r"[0-9]+", inp)))
    
    elif isinstance(inp, list) or isinstance(inp, pd.Series):
        inp = [str(elem) for elem in inp] # To avoid 'TypeError: expected string or bytes-like object'
        new_li = []
        for ind, elem in enumerate(inp):
                if elem != 'nan':
                    # print(f"Before Regex attempt: {elem}")
                    new_str = int("".join(re.findall(r"[0-9]+", elem)))
                    # print(f"After Regex attempt: {new_str}")
                    new_li.append(new_str)
                else:
                    new_str = 0
                    new_li.append(new_str)
        return new_li
    
    elif isinstance(inp, float) and math.isnan(inp):
        return float('nan')
    
    else:
    
        return int(inp)
    
# def dtype_updater(col1, col2): [Construct function to update corresponding dtypes from subdataframes to original dataframes]
    

In [4]:
# Downloading dataset from Kaggle and loading it into notebook

## Getting the latest version
path = kagglehub.dataset_download("bilalyussef/google-books-dataset")

# Display file path for downloaded file
print("Path to dataset files:", path)

Path to dataset files: C:\Users\mquay\.cache\kagglehub\datasets\bilalyussef\google-books-dataset\versions\3


The downloads are to be moved to the <code>\csv</code> directory for ease of access.

In [5]:
# Accessing File~

raw_doc_master = pd.read_csv("..//..//datasets//csv//kaggle//bilalyussef//google_books_1299.csv", encoding='utf-8').drop(columns=["Unnamed: 0"]).rename(columns={"generes":'genres'})
"""Master Document File (raw)"""

raw_doc_master.columns

Index(['title', 'author', 'rating', 'voters', 'price', 'currency',
       'description', 'publisher', 'page_count', 'genres', 'ISBN', 'language',
       'published_date'],
      dtype='object')

In [6]:
raw_doc_p1 = raw_doc_master.loc[:, ['title',"author", "rating", "voters", "price"]]
"""Stores the first partitioned DataFrame of the document

:return: 'title', 'author', 'rating', 'voters', 'price'
:rtype: Columns
"""

raw_doc_p1[:3]

Unnamed: 0,title,author,rating,voters,price
0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428,43.28
1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23,26.15
2,The Art of Super Mario Odyssey,Nintendo,3.9,9,133.85


In [75]:
raw_doc_p2 = raw_doc_master.loc[:, ["currency", "description", "publisher", "page_count"]]
"""Stores the second partitioned DataFrame of the document

:return: 'currency', 'description', 'publisher', 'page_count'
:rtype: Columns
"""

raw_doc_p2[:3]

Unnamed: 0,currency,description,publisher,page_count
0,SAR,NO SAFE PLACE LEFT At great cost to the Garris...,Kodansha Comics,192
1,SAR,Determined to make a new start in her quaint h...,Kensington Publishing Corp.,288
2,SAR,Take a globetrotting journey all over the worl...,Dark Horse Comics,368


In [65]:
raw_doc_p3 = raw_doc_master.loc[:, ["genres", 'ISBN', 'language', 'published_date']]
"""Stores the third partitioned DataFrame of the document

:return: 'genres', 'ISBN', 'language', 'published_date'
:rtype: Columns
"""

raw_doc_p3[:3]

Unnamed: 0,genres,ISBN,language,published_date
0,none,9781612626864,English,"Jul 31, 2014"
1,"Fiction , Mystery &amp, Detective , Cozy , Gen...",9780758272799,English,"Jul 1, 2007"
2,"Games &amp, Activities , Video &amp, Electronic",9781506713816,English,"Nov 5, 2019"


In [9]:
joint_partitioned_raw_doc = [raw_doc_p1, raw_doc_p2, raw_doc_p3]

"""Stores partitioned DataFrames from master document

:return: 3 partitioned files from the main document
:rtype: List[DataFrame]
"""

'Stores partitioned DataFrames from master document\n\n:return: 3 partitioned files from the main document\n:rtype: List[DataFrame]\n'

## PRE-PROCESSING

### Exploration

In [10]:
# How many books exist in this file?
len(raw_doc_master)

1299

In [11]:
# File Description

raw_doc_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           1299 non-null   object 
 1   author          1299 non-null   object 
 2   rating          1224 non-null   float64
 3   voters          1224 non-null   object 
 4   price           1299 non-null   float64
 5   currency        1299 non-null   object 
 6   description     1296 non-null   object 
 7   publisher       1299 non-null   object 
 8   page_count      1299 non-null   int64  
 9   genres          1299 non-null   object 
 10  ISBN            1299 non-null   object 
 11  language        1299 non-null   object 
 12  published_date  1299 non-null   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 132.1+ KB


#### raw_doc_p1

In [12]:
raw_doc_p1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   1299 non-null   object 
 1   author  1299 non-null   object 
 2   rating  1224 non-null   float64
 3   voters  1224 non-null   object 
 4   price   1299 non-null   float64
dtypes: float64(2), object(3)
memory usage: 50.9+ KB


In [13]:
raw_doc_p1[:3]

Unnamed: 0,title,author,rating,voters,price
0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428,43.28
1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23,26.15
2,The Art of Super Mario Odyssey,Nintendo,3.9,9,133.85


In [14]:
# Extracting only integer characters for each instantiation
raw_doc_p1.loc[:,'voters'] = int_extractor_lambda(raw_doc_p1['voters'])

# Converting column to int64
raw_doc_p1['voters'] = raw_doc_p1['voters'].astype("int64")
raw_doc_p1['voters'].dtype

dtype('int64')

In [15]:
raw_doc_p1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   1299 non-null   object 
 1   author  1299 non-null   object 
 2   rating  1224 non-null   float64
 3   voters  1299 non-null   int64  
 4   price   1299 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 50.9+ KB


In [16]:
raw_doc_p1[:3]

Unnamed: 0,title,author,rating,voters,price
0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428,43.28
1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23,26.15
2,The Art of Super Mario Odyssey,Nintendo,3.9,9,133.85


#### raw_doc_p2

In [76]:
raw_doc_p2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   currency     1299 non-null   object
 1   description  1296 non-null   object
 2   publisher    1299 non-null   object
 3   page_count   1299 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 40.7+ KB


In [18]:
raw_doc_p2[:3]

Unnamed: 0,currency,description,publisher,page_count
0,SAR,NO SAFE PLACE LEFT At great cost to the Garris...,Kodansha Comics,192
1,SAR,Determined to make a new start in her quaint h...,Kensington Publishing Corp.,288
2,SAR,Take a globetrotting journey all over the worl...,Dark Horse Comics,368


##### Inspection

In [19]:
# Inspecting 'page_count' feature

raw_doc_p2['page_count'].unique()

array([          192,           288,           368,           320,
                 544,           864,           400,           226,
                 304,           112,           208,           250,
                  96,            40,           704,            30,
                  32,           144,           358,           245,
                 416,           336,           168,           176,
                 357,           448,           384,          1424,
                 656,           240,           146,           256,
                 476,           272,           592,           672,
                 224,          4544,           720,           576,
                 148,           350,           216,           352,
                 300,           464,           408,           136,
                 500,           164,            15,            52,
                 172,           816,           128,           688,
                  31,           254,           130,           

This shows what seems to be an ISBN number stored in this column.

Corroborating with the <a href='https://www.kaggle.com/datasets/bilalyussef/google-books-dataset'>file source hosted by Kaggle</a>, this does seem to be the case (check graph displayed within the first row of the 'page_count' column).

The total page count is supposed to be 144, confirmed with <a href='https://www.amazon.co.uk/Deadpool-Team-Up-Fred-Van-Lente/dp/0785145281'>a quick Google search of the entry</a>.

Since the <code>ISBN</code> column is stored within <code>raw_doc_p3</code>, we will correct 'page_count' via modifying <code>raw_doc_p2</code> and 'ISBN' via modifying <code>raw_doc_p3</code>.

In [None]:
# Isolating and retrieving pages with ISBN no. '9781302013929' to correct 'page_count' feature

incorrect_page_count = raw_doc_p2.loc[raw_doc_p2['page_count']==9781302013929]
incorrect_page_count

Unnamed: 0,currency,description,publisher,page_count
288,SAR,"Stop rubbing your eyes, fanboy, they don't dec...",Marvel Entertainment,9781302013929
1241,SAR,"Stop rubbing your eyes, fanboy, they don't dec...",Marvel Entertainment,9781302013929


In [None]:
# Isolating and retrieving pages with ISBN no. '9781302013929' to correct 'ISBN' feature

incorrect_isbn = raw_doc_p3.loc[raw_doc_p2['page_count']==9781302013929]
incorrect_isbn

Unnamed: 0,genres,ISBN,language,published_date
288,"Comics & Graphic Novels , Superheroes",Original pages,English,186
1241,"Comics & Graphic Novels , Superheroes",Original pages,English,186


In [68]:
incorrect_page_count.loc[:,'page_count'] = 144
incorrect_page_count

Unnamed: 0,currency,description,publisher,page_count
288,SAR,"Stop rubbing your eyes, fanboy, they don't dec...",Marvel Entertainment,144
1241,SAR,"Stop rubbing your eyes, fanboy, they don't dec...",Marvel Entertainment,144


In [69]:
incorrect_isbn.loc[:,'ISBN'] = 9781302013929
incorrect_isbn

Unnamed: 0,genres,ISBN,language,published_date
288,"Comics & Graphic Novels , Superheroes",9781302013929,English,186
1241,"Comics & Graphic Novels , Superheroes",9781302013929,English,186


In [95]:
# Correcting 'page_count' in subset dataframe

raw_doc_p2.loc[raw_doc_p2['page_count']==9781302013929].loc[:,'page_count'] = np.nan
raw_doc_p2.loc[raw_doc_p2['page_count']==9781302013929].update(incorrect_page_count)
raw_doc_p2.loc[raw_doc_p2['page_count']==9781302013929]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  raw_doc_p2.loc[raw_doc_p2['page_count']==9781302013929].loc[:,'page_count'] = np.nan
  raw_doc_p2.loc[raw_doc_p2['page_count']==9781302013929].loc[:,'page_count'] = np.nan


Unnamed: 0,currency,description,publisher,page_count
288,SAR,"Stop rubbing your eyes, fanboy, they don't dec...",Marvel Entertainment,9781302013929
1241,SAR,"Stop rubbing your eyes, fanboy, they don't dec...",Marvel Entertainment,9781302013929


In [None]:
# Correcting 'ISBN' in subset dataframe
raw_doc_p3.loc[raw_doc_p2['page_count']==9781302013929]

Unnamed: 0,genres,ISBN,language,published_date
288,"Comics & Graphic Novels , Superheroes",Original pages,English,186
1241,"Comics & Graphic Novels , Superheroes",Original pages,English,186


In [25]:
len(raw_doc_master['title'])

1299

In [26]:
len(raw_doc_master['title'].unique())

246

In [27]:
raw_doc_master[raw_doc_p2['page_count']==9781302013929]

Unnamed: 0,title,author,rating,voters,price,currency,description,publisher,page_count,genres,ISBN,language,published_date
288,Deadpool Team-Up Vol. 1: Good Buddies,Fred Van Lente,3.9,363,43.28,SAR,"Stop rubbing your eyes, fanboy, they don't dec...",Marvel Entertainment,9781302013929,"Comics & Graphic Novels , Superheroes",Original pages,English,186
1241,Deadpool Team-Up Vol. 1: Good Buddies,Fred Van Lente,3.9,363,43.28,SAR,"Stop rubbing your eyes, fanboy, they don't dec...",Marvel Entertainment,9781302013929,"Comics & Graphic Novels , Superheroes",Original pages,English,186


In [28]:
raw_doc_p2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   currency     1299 non-null   object
 1   description  1296 non-null   object
 2   publisher    1299 non-null   object
 3   page_count   1299 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 40.7+ KB


In [29]:
raw_doc_p2[:3]

Unnamed: 0,currency,description,publisher,page_count
0,SAR,NO SAFE PLACE LEFT At great cost to the Garris...,Kodansha Comics,192
1,SAR,Determined to make a new start in her quaint h...,Kensington Publishing Corp.,288
2,SAR,Take a globetrotting journey all over the worl...,Dark Horse Comics,368


#### raw_doc_p3

In [30]:
raw_doc_p3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   genres          1299 non-null   object
 1   ISBN            1299 non-null   object
 2   language        1299 non-null   object
 3   published_date  1299 non-null   object
dtypes: object(4)
memory usage: 40.7+ KB


In [31]:
raw_doc_p3[:3]

Unnamed: 0,genres,ISBN,language,published_date
0,none,9781612626864,English,"Jul 31, 2014"
1,"Fiction , Mystery &amp, Detective , Cozy , Gen...",9780758272799,English,"Jul 1, 2007"
2,"Games &amp, Activities , Video &amp, Electronic",9781506713816,English,"Nov 5, 2019"


In [32]:
raw_doc_p3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   genres          1299 non-null   object
 1   ISBN            1299 non-null   object
 2   language        1299 non-null   object
 3   published_date  1299 non-null   object
dtypes: object(4)
memory usage: 40.7+ KB


In [33]:
raw_doc_p3[:3]

Unnamed: 0,genres,ISBN,language,published_date
0,none,9781612626864,English,"Jul 31, 2014"
1,"Fiction , Mystery &amp, Detective , Cozy , Gen...",9780758272799,English,"Jul 1, 2007"
2,"Games &amp, Activities , Video &amp, Electronic",9781506713816,English,"Nov 5, 2019"


In [34]:
raw_doc_p1['voters'].unique()

array([  428,    23,     9,    10,   577,   832,    94,   221,   135,
          47,   383,    57,    45, 38526,     4,   427,     3,    13,
           0,   281,   200,     7,   100,   408,   288,    15,   886,
        1633,  1382,   206,     1,    72,     6,   861,    42, 17719,
         522,  6615,    64, 32771,  1251,  1130,  5177,    34,   142,
         580,   352,   751, 10650,     2,  3316,   124,   255,   591,
        1756,    87,   223,  3650,   107,    25,    56,   127,    14,
         453,  3695,   526,   532,   399,   451,  4683,    18,    95,
          46,    90,    84,    78,   818,   247,    30,    80,   157,
          85,    79,   119,   407,   216,   588,   997,    60,   499,
         624,   120,   166,   483,   168,   661,  4260,  1354,    38,
        1408,  1701,  4532,   250,  1780,    33,    11,  4750,     8,
          21,  1799,   510,    24,  9403,   567,   141,   179,  2700,
          22,   363,   715,    97,   103,  1105,  4533,    66,    52,
         634,    53,

In [35]:
# Sampling 10 files for exploration

raw_doc_master

Unnamed: 0,title,author,rating,voters,price,currency,description,publisher,page_count,genres,ISBN,language,published_date
0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428,43.28,SAR,NO SAFE PLACE LEFT At great cost to the Garris...,Kodansha Comics,192,none,9781612626864,English,"Jul 31, 2014"
1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23,26.15,SAR,Determined to make a new start in her quaint h...,Kensington Publishing Corp.,288,"Fiction , Mystery &amp, Detective , Cozy , Gen...",9780758272799,English,"Jul 1, 2007"
2,The Art of Super Mario Odyssey,Nintendo,3.9,9,133.85,SAR,Take a globetrotting journey all over the worl...,Dark Horse Comics,368,"Games &amp, Activities , Video &amp, Electronic",9781506713816,English,"Nov 5, 2019"
3,Getting Away Is Deadly: An Ellie Avery Mystery,Sara Rosett,4.0,10,26.15,SAR,"With swollen feet and swelling belly, pregnant...",Kensington Publishing Corp.,320,none,9781617734076,English,"Mar 1, 2009"
4,"The Painted Man (The Demon Cycle, Book 1)",Peter V. Brett,4.5,577,28.54,SAR,The stunning debut fantasy novel from author P...,HarperCollins UK,544,"Fiction , Fantasy , Dark Fantasy",9780007287758,English,"Jan 8, 2009"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1294,Twas The Nightshift Before Christmas: Festive ...,Adam Kay,4.7,47,41.82,SAR,A short gift book of festive hospital diaries ...,Pan Macmillan,112,"Medical , Health Care Delivery",9781529018592,English,"Oct 17, 2019"
1295,Why We Sleep: The New Science of Sleep and Dreams,Matthew Walker,4.8,52,46.85,SAR,'Astonishing ... an amazing book ... absolutel...,Penguin UK,368,"Psychology , Cognitive Psychology &amp, Cognition",9780141983776,English,"Sep 28, 2017"
1296,How to Understand Business Finance: Edition 2,Bob Cinnamon,3.5,4,46.85,SAR,The modern marketplace is increasingly unpredi...,Kogan Page Publishers,176,none,9780749460211,English,"Apr 3, 2010"
1297,Spider-Man: Kraven's Last Hunt,J. M. DeMatteis,4.6,74,43.28,SAR,"Collects Web of Spider-Man #31-32, Amazing Spi...",Marvel Entertainment,168,none,9781302377366,English,"Dec 10, 2014"


In [36]:
text_exploration = sample_robotstxt_urls[0]
sample_robotstxt_urls[0]

NameError: name 'sample_robotstxt_urls' is not defined

In [None]:
re.findall(r"(seg)+", text_exploration)

['seg']

In [None]:
re.search(r"\d+", text_exploration).group()

'2025'

In [None]:
# re.search()

first_num_occ = re.search(r"\d+", text_exploration)
print(first_num_occ.group())

2025


## `re.search()` - Find the first match

## `re.findall()` - Find all matches

## `re.match()` - Does pattern match at start of string?

## `re.fullmatch()` - Does entire string match pattern?

## `re.sub()` - Replaces occurences of a pattern with another string

## `re.split()` - Splits a string based on regex pattern

## `re.compile()` - Precompiles a regex for repeated use

# EXAMPLES

- Extract all digits from "Order 123, Invoice 456, ID 7890".
- Find all lowercase letters in "Hello WORLD Python!".
- Extract all capital letters from "Regex is FUN and POWERFUL".
- Find all words in "Hello, World! 123" (Ignore punctuation).
- Extract words that contain only alphabets from "cat123 dog45 apple banana".
<br>
<br>
- Quantifiers (*, +, ?, {})
- Find all words that start with "a" in "apple banana apricot orange".
- Extract sequences of 2 or more digits from "abc123def4567gh89".
- Find all words with at least 5 letters in "hello world python java".
- Extract all words with exactly 4 letters from "this that when where".
- Match any word that contains "th" in "there, think, throw, math, father".
<br>
<br>
- Character Classes (\d, \w, \s)
- Extract all numbers from "I have 2 cats and 3 dogs".
- Find all words containing numbers in "user1 admin99 guest3".
- Extract all words starting with a capital letter from "Alice Bob charlie Daniel".
- Extract all special characters from "Hello@World! Python#Regex" (Ignore letters and numbers).
- Find all sequences of spaces in "This has multiple spaces".
<br>
<br>
- Anchors (^, $, \b)
- Match words that start with "P" in "Python PHP JavaScript"
- Find all sentences ending with a period in "Hello world. This is regex! Python is fun."
- Extract hashtags from "#Python #DataScience #Regex"
- Extract valid email addresses from "test@example.com, hello@site.org, user@invalid".
- Extract valid phone numbers from "Call 123-456-7890 or (123) 456-7890".

In [None]:
# - Extract all digits from "Order 123, Invoice 456, ID 7890"

text1 = "Order 123, Invoice 456, ID 7890"

re.findall(r"\d+", text1)

['123', '456', '7890']

In [None]:
# Find all lowercase letters in "Hello WORLD Python!"

text2 = "Hello WORLD Python!"
re.findall(r"[a-z]+", text2)

['ello', 'ython']

In [None]:
# Extract all capital letters from "Regex is FUN and POWERFUL".

text3 = "Regex is FUN and POWERFUL"
re.findall(r"[A-Z]+", text3)

['R', 'FUN', 'POWERFUL']

In [None]:
# Find all words in "Hello, World! 123" (Ignore punctuation).

text4 = "Hello, World! 123"
re.findall(r"[A-Za-z]+", text4)

['Hello', 'World']

In [None]:
# Extract words that contain only alphabets from "cat123 dog45 apple banana".

text5 = "cat123 dog45 apple banana"
re.findall(r"\b[a-zA-Z]+\b", text5)

['apple', 'banana']

In [None]:
# Find all words that start with "a" in "apple banana apricot orange"

text6 = "apple banana apricot orange"
re.findall(r"\ba\w*", text6)

['apple', 'apricot']

In [None]:
re.findall(r"", text4)

['H']

In [None]:
re.findall(r"/[a-z]", "cat123 dog45 apple banana")

[]

In [None]:
a = ""

In [None]:
import re 
b = re.split(r"\x01", a)
b

['20250219-09:49:39.874935000 [out] : 8=FIX.4.4',
 '9=291',
 '35=8',
 '34=48945',
 '49=CENTROID_SOL',
 '52=20250219-09:49:39.874882',
 '56=TD_MT5_FIX',
 '1=BM_MT5_B_2',
 '6=0.0000000000',
 '11=e-1739958579793858600',
 '14=0.00',
 '17=9756177',
 '31=0.0000000000',
 '32=0.00',
 '37=9756177',
 '38=1000.00',
 '39=0',
 '40=1',
 '54=1',
 '55=ADAUSD.x',
 '58=New Request',
 '59=3',
 '60=20250219-09:49:39',
 '150=0',
 '151=1000.00',
 '9999=CenSystem',
 '10=198',
 'a',
 'a']

In [None]:
for i in range(len(b)):
    if not "38=" in b[i]:
        pass
    else: 
        print(b[i])
    if not "39=" in b[i]:
        pass
    else: 
        print(b[i])
    if not "55=" in b[i]:
        pass
    else: 
        print(b[i])

38=1000.00
39=0
55=ADAUSD.x
