# EXPLORING REGULAR EXPRESSIONS

This notebook is partially inspired by <a href="resources/Speech and Language Processing (12-Jan-2025) - Daniel Jurafsky & James H. Martin.pdf"><i>"Speech-and-Language-Processing_Jurafsky-Martin_Jan-12-2025"</i></a> chapter on Regular Expressions.

## INITIALZATION

In [18]:
# Required Modules

import re
import pandas as pd
import numpy as np
import regex
import pygrep
import flashtext
import spacy
import loguru
import pyspark
import kafka
import os
import gzip
import pandas as pd
import shutil
import kagglehub
import requests
from urllib.parse import urljoin

In [15]:
# File Paths

src_folder_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//src"
notebooks_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//src//notebooks"
scripts_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//src//scripts"
datasets_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets"
csvs_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets//csv"
jsons_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets//json"
txt_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets//txt"

In [16]:
# Important Functions



In [None]:
# Loading File

## Downloading the latest version
path = kagglehub.dataset_download("bilalyussef/google-books-dataset")

# Display file path for downloaded file
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/bilalyussef/google-books-dataset?dataset_version_number=3...


100%|██████████| 713k/713k [00:00<00:00, 1.55MB/s]

Extracting files...
Path to dataset files: C:\Users\mquay\.cache\kagglehub\datasets\bilalyussef\google-books-dataset\versions\3





The downloads are to be moved to the <code>\csv</code> directory for ease of access.

In [66]:
# Accessing File

document = pd.read_csv("..//..//datasets//csv//kaggle//bilalyussef//google_books_1299.csv", encoding='utf-8').drop(columns=["Unnamed: 0"]).rename(columns={"generes":'genres'})
document.columns

Index(['title', 'author', 'rating', 'voters', 'price', 'currency',
       'description', 'publisher', 'page_count', 'genres', 'ISBN', 'language',
       'published_date'],
      dtype='object')

In [80]:
doc_p1 = document[['title',"author", "rating", "voters", "price"]]
"""Stores the first partition of the document

Columns: 'title',"author", "rating", "voters", "price"
"""

doc_p1[:3]

Unnamed: 0,title,author,rating,voters,price
0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428,43.28
1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23,26.15
2,The Art of Super Mario Odyssey,Nintendo,3.9,9,133.85


In [81]:
doc_p2 = document[["currency", "description", "publisher", "page_count"]]
"""Stores the second partition of the document

Columns: "currency", "description", "publisher", "page_count"
"""

doc_p2[:3]

Unnamed: 0,currency,description,publisher,page_count
0,SAR,NO SAFE PLACE LEFT At great cost to the Garris...,Kodansha Comics,192
1,SAR,Determined to make a new start in her quaint h...,Kensington Publishing Corp.,288
2,SAR,Take a globetrotting journey all over the worl...,Dark Horse Comics,368


In [82]:
doc_p3 = document[["genres", 'ISBN', 'language', 'published_date']] 
"""Stores the third partition of the document

Columns: "genres", 'ISBN', 'language', 'published_date'
"""

doc_p3[:3]

Unnamed: 0,genres,ISBN,language,published_date
0,none,9781612626864,English,"Jul 31, 2014"
1,"Fiction , Mystery &amp, Detective , Cozy , Gen...",9780758272799,English,"Jul 1, 2007"
2,"Games &amp, Activities , Video &amp, Electronic",9781506713816,English,"Nov 5, 2019"


In [78]:
joint_partitioned_doc = [doc_p1, doc_p2, doc_p3]

"""Stores partitioned files from master document

Returns:
    - 3 partitioned files from the main document
"""

'Stores partitioned files from master document\n\nReturns:\n    - 3 partitioned files from the main document\n'

## PRE-PROCESSING

### Exploration

In [None]:
# How many books exist in this file?
len(document)

1299

In [52]:
# File Description

document.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           1299 non-null   object 
 1   author          1299 non-null   object 
 2   rating          1224 non-null   float64
 3   voters          1224 non-null   object 
 4   price           1299 non-null   float64
 5   currency        1299 non-null   object 
 6   description     1296 non-null   object 
 7   publisher       1299 non-null   object 
 8   page_count      1299 non-null   int64  
 9   genres          1299 non-null   object 
 10  ISBN            1299 non-null   object 
 11  language        1299 non-null   object 
 12  published_date  1299 non-null   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 132.1+ KB


In [None]:
voters_values_preproc_start = list(doc_p1['voters'])
"""Removes commas from all values, allowing conversion to int64"""

# voters_values_preproc_final = [int(str(num).replace(",", "").split(".")[0]) for num in list(doc_p1['voters'])]
# """Final values after having removed commas"""
        
# voters_values_preproc_final

ValueError: invalid literal for int() with base 10: 'nan'

In [94]:
doc_p1['voters'].unique()

array(['428', '23', '9', '10', '577', '832', '94', '221', '135', '47',
       '383', '57', '45', '38,526', '4', '427', '3', '13', nan, '281',
       '200', '7', '100', '408', '288', '15', '886', '1,633', '1,382',
       '206', '1', '72', '6', '861', '42', '17,719', '522', '6,615', '64',
       '32,771', '1,251', '1,130', '5,177', '34', '142', '580', '352',
       '751', '10,650', '2', '3,316', '124', '255', '591', '1,756', '87',
       '223', '3,650', '107', '25', '56', '127', '14', '453', '3,695',
       '526', '532', '399', '451', '4,683', '18', '95', '46', '90', '84',
       '78', '818', '247', '30', '80', '157', '85', '79', '119', '407',
       '216', '588', '997', '60', '499', '624', '120', '166', '483',
       '168', '661', '4,260', '1,354', '38', '1,408', '1,701', '4,532',
       '250', '1,780', '33', '11', '4,750', '8', '21', '1,799', '510',
       '24', '9,403', '567', '141', '179', '2,700', '22', '363', '715',
       '97', '103', '1,105', '4,533', '66', '52', '634', '53', '61

In [93]:
doc_p1['voters'] = doc_p1['voters'].astype("int64")
doc_p1['voters'].dtype

ValueError: invalid literal for int() with base 10: '38,526'

In [53]:
# Sampling 10 files for exploration

document

Unnamed: 0,title,author,rating,voters,price,currency,description,publisher,page_count,genres,ISBN,language,published_date
0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428,43.28,SAR,NO SAFE PLACE LEFT At great cost to the Garris...,Kodansha Comics,192,none,9781612626864,English,"Jul 31, 2014"
1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23,26.15,SAR,Determined to make a new start in her quaint h...,Kensington Publishing Corp.,288,"Fiction , Mystery &amp, Detective , Cozy , Gen...",9780758272799,English,"Jul 1, 2007"
2,The Art of Super Mario Odyssey,Nintendo,3.9,9,133.85,SAR,Take a globetrotting journey all over the worl...,Dark Horse Comics,368,"Games &amp, Activities , Video &amp, Electronic",9781506713816,English,"Nov 5, 2019"
3,Getting Away Is Deadly: An Ellie Avery Mystery,Sara Rosett,4.0,10,26.15,SAR,"With swollen feet and swelling belly, pregnant...",Kensington Publishing Corp.,320,none,9781617734076,English,"Mar 1, 2009"
4,"The Painted Man (The Demon Cycle, Book 1)",Peter V. Brett,4.5,577,28.54,SAR,The stunning debut fantasy novel from author P...,HarperCollins UK,544,"Fiction , Fantasy , Dark Fantasy",9780007287758,English,"Jan 8, 2009"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1294,Twas The Nightshift Before Christmas: Festive ...,Adam Kay,4.7,47,41.82,SAR,A short gift book of festive hospital diaries ...,Pan Macmillan,112,"Medical , Health Care Delivery",9781529018592,English,"Oct 17, 2019"
1295,Why We Sleep: The New Science of Sleep and Dreams,Matthew Walker,4.8,52,46.85,SAR,'Astonishing ... an amazing book ... absolutel...,Penguin UK,368,"Psychology , Cognitive Psychology &amp, Cognition",9780141983776,English,"Sep 28, 2017"
1296,How to Understand Business Finance: Edition 2,Bob Cinnamon,3.5,4,46.85,SAR,The modern marketplace is increasingly unpredi...,Kogan Page Publishers,176,none,9780749460211,English,"Apr 3, 2010"
1297,Spider-Man: Kraven's Last Hunt,J. M. DeMatteis,4.6,74,43.28,SAR,"Collects Web of Spider-Man #31-32, Amazing Spi...",Marvel Entertainment,168,none,9781302377366,English,"Dec 10, 2014"


In [None]:
text_exploration = sample_robotstxt_urls[0]
sample_robotstxt_urls[0]

'crawl-data/CC-MAIN-2025-05/segments/1736703361941.29/robotstxt/CC-MAIN-20250126135402-20250126165402-00000.warc.gz'

In [65]:
re.findall(r"(seg)+", text_exploration)

['seg']

In [33]:
re.search(r"\d+", text_exploration).group()

'2025'

In [30]:
# re.search()

first_num_occ = re.search(r"\d+", text_exploration)
print(first_num_occ.group())

2025


## `re.search()` - Find the first match

## `re.findall()` - Find all matches

## `re.match()` - Does pattern match at start of string?

## `re.fullmatch()` - Does entire string match pattern?

## `re.sub()` - Replaces occurences of a pattern with another string

## `re.split()` - Splits a string based on regex pattern

## `re.compile()` - Precompiles a regex for repeated use

# EXAMPLES

- Extract all digits from "Order 123, Invoice 456, ID 7890".
- Find all lowercase letters in "Hello WORLD Python!".
- Extract all capital letters from "Regex is FUN and POWERFUL".
- Find all words in "Hello, World! 123" (Ignore punctuation).
- Extract words that contain only alphabets from "cat123 dog45 apple banana".
<br>
<br>
- Quantifiers (*, +, ?, {})
- Find all words that start with "a" in "apple banana apricot orange".
- Extract sequences of 2 or more digits from "abc123def4567gh89".
- Find all words with at least 5 letters in "hello world python java".
- Extract all words with exactly 4 letters from "this that when where".
- Match any word that contains "th" in "there, think, throw, math, father".
<br>
<br>
- Character Classes (\d, \w, \s)
- Extract all numbers from "I have 2 cats and 3 dogs".
- Find all words containing numbers in "user1 admin99 guest3".
- Extract all words starting with a capital letter from "Alice Bob charlie Daniel".
- Extract all special characters from "Hello@World! Python#Regex" (Ignore letters and numbers).
- Find all sequences of spaces in "This has multiple spaces".
<br>
<br>
- Anchors (^, $, \b)
- Match words that start with "P" in "Python PHP JavaScript"
- Find all sentences ending with a period in "Hello world. This is regex! Python is fun."
- Extract hashtags from "#Python #DataScience #Regex"
- Extract valid email addresses from "test@example.com, hello@site.org, user@invalid".
- Extract valid phone numbers from "Call 123-456-7890 or (123) 456-7890".

In [66]:
# - Extract all digits from "Order 123, Invoice 456, ID 7890"

text1 = "Order 123, Invoice 456, ID 7890"

re.findall(r"\d+", text1)

['123', '456', '7890']

In [76]:
# Find all lowercase letters in "Hello WORLD Python!"

text2 = "Hello WORLD Python!"
re.findall(r"[a-z]+", text2)

['ello', 'ython']

In [79]:
# Extract all capital letters from "Regex is FUN and POWERFUL".

text3 = "Regex is FUN and POWERFUL"
re.findall(r"[A-Z]+", text3)

['R', 'FUN', 'POWERFUL']

In [44]:
# Find all words in "Hello, World! 123" (Ignore punctuation).

text4 = "Hello, World! 123"
re.findall(r"[A-Za-z]+", text4)

['Hello', 'World']

In [49]:
# Extract words that contain only alphabets from "cat123 dog45 apple banana".

text5 = "cat123 dog45 apple banana"
re.findall(r"\b[a-zA-Z]+\b", text5)

['apple', 'banana']

In [32]:
# Find all words that start with "a" in "apple banana apricot orange"

text6 = "apple banana apricot orange"
re.findall(r"\ba\w*", text6)

['apple', 'apricot']

In [None]:
re.findall(r"", text4)

['H']

In [None]:
re.findall(r"/[a-z]", "cat123 dog45 apple banana")

[]

In [None]:
a = ""

In [17]:
import re 
b = re.split(r"\x01", a)
b

['20250219-09:49:39.874935000 [out] : 8=FIX.4.4',
 '9=291',
 '35=8',
 '34=48945',
 '49=CENTROID_SOL',
 '52=20250219-09:49:39.874882',
 '56=TD_MT5_FIX',
 '1=BM_MT5_B_2',
 '6=0.0000000000',
 '11=e-1739958579793858600',
 '14=0.00',
 '17=9756177',
 '31=0.0000000000',
 '32=0.00',
 '37=9756177',
 '38=1000.00',
 '39=0',
 '40=1',
 '54=1',
 '55=ADAUSD.x',
 '58=New Request',
 '59=3',
 '60=20250219-09:49:39',
 '150=0',
 '151=1000.00',
 '9999=CenSystem',
 '10=198',
 'a',
 'a']

In [21]:
for i in range(len(b)):
    if not "38=" in b[i]:
        pass
    else: 
        print(b[i])
    if not "39=" in b[i]:
        pass
    else: 
        print(b[i])
    if not "55=" in b[i]:
        pass
    else: 
        print(b[i])

38=1000.00
39=0
55=ADAUSD.x
