Exploring Basics of Regular Expressions

In [20]:
# Required Modules

import re
import pandas as pd
import numpy as np
import regex
import pygrep
import flashtext
import spacy
import loguru
import pyspark
import kafka
import os
import gzip
import shutil
import requests
from urllib.parse import urljoin

In [21]:
# File Paths

src_folder_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//src"
notebooks_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//src//notebooks"
scripts_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//src//scripts"
datasets_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets"
csvs_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets//csv"
jsons_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets//json"
txt_path = "C://Users//mquay//Documents//GitHub//Personal//nlp-practice//datasets//txt"

In [22]:
# Important Functions



In [23]:
# Loading File

robotstxt_file = os.path.join(txt_path, "robotstxt.paths.gz")

In [24]:
# Accessing File

robotstxt_urls = []
with gzip.open(robotstxt_file, "rt", encoding="utf-8") as file:
    for i, line in enumerate(file):
        robotstxt_urls.append(line.strip())

In [25]:
# How many files exist?
len(robotstxt_urls)

90000

## Exploration

In [35]:
# Sampling 10 files for exploration

sample_robotstxt_urls = robotstxt_urls[:10]
len(sample_robotstxt_urls)

10

In [None]:
text_exploration = sample_robotstxt_urls[0]
sample_robotstxt_urls[0]

'crawl-data/CC-MAIN-2025-05/segments/1736703361941.29/robotstxt/CC-MAIN-20250126135402-20250126165402-00000.warc.gz'

In [65]:
re.findall(r"(seg)+", text_exploration)

['seg']

In [33]:
re.search(r"\d+", text_exploration).group()

'2025'

In [30]:
# re.search()

first_num_occ = re.search(r"\d+", text_exploration)
print(first_num_occ.group())

2025


## Extracting Text from .WARC files

In [34]:
# Downloading 10 files

base_url_robotstxt = "https://data.commoncrawl.org/"

for i, filename in enumerate(sample_robotstxt_urls):
    robotstxt_url = urljoin(base_url_robotstxt, filename)  # Correct URL joining
    response = requests.get(robotstxt_url, stream=True)

    if response.status_code == 200:
        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
        
        print(f"Downloaded WARC file {filename} successfully!")  # Print after completion
    else:
        print(f"Failed to download {filename}: HTTP {response.status_code}")

FileNotFoundError: [Errno 2] No such file or directory: 'crawl-data/CC-MAIN-2025-05/segments/1736703361941.29/robotstxt/CC-MAIN-20250126135402-20250126165402-00000.warc.gz'

## `re.search()` - Find the first match

In [None]:


for file in 

In [7]:
text1 = "Order number 12345 was processed on 2024-02-17."

numbers = re.search(r"\d+", text1)
print(f"Extracted Numbers: {numbers.group()}")

Extracted Numbers: 12345


## `re.findall()` - Find all matches

## `re.match()` - Does pattern match at start of string?

## `re.fullmatch()` - Does entire string match pattern?

## `re.sub()` - Replaces occurences of a pattern with another string

## `re.split()` - Splits a string based on regex pattern

## `re.compile()` - Precompiles a regex for repeated use

# EXAMPLES

- Extract all digits from "Order 123, Invoice 456, ID 7890".
- Find all lowercase letters in "Hello WORLD Python!".
- Extract all capital letters from "Regex is FUN and POWERFUL".
- Find all words in "Hello, World! 123" (Ignore punctuation).
- Extract words that contain only alphabets from "cat123 dog45 apple banana".
<br>
<br>
- Quantifiers (*, +, ?, {})
- Find all words that start with "a" in "apple banana apricot orange".
- Extract sequences of 2 or more digits from "abc123def4567gh89".
- Find all words with at least 5 letters in "hello world python java".
- Extract all words with exactly 4 letters from "this that when where".
- Match any word that contains "th" in "there, think, throw, math, father".
<br>
<br>
- Character Classes (\d, \w, \s)
- Extract all numbers from "I have 2 cats and 3 dogs".
- Find all words containing numbers in "user1 admin99 guest3".
- Extract all words starting with a capital letter from "Alice Bob charlie Daniel".
- Extract all special characters from "Hello@World! Python#Regex" (Ignore letters and numbers).
- Find all sequences of spaces in "This has multiple spaces".
<br>
<br>
- Anchors (^, $, \b)
- Match words that start with "P" in "Python PHP JavaScript"
- Find all sentences ending with a period in "Hello world. This is regex! Python is fun."
- Extract hashtags from "#Python #DataScience #Regex"
- Extract valid email addresses from "test@example.com, hello@site.org, user@invalid".
- Extract valid phone numbers from "Call 123-456-7890 or (123) 456-7890".

In [66]:
# - Extract all digits from "Order 123, Invoice 456, ID 7890"

text1 = "Order 123, Invoice 456, ID 7890"

re.findall(r"\d+", text1)

['123', '456', '7890']

In [76]:
# Find all lowercase letters in "Hello WORLD Python!"

text2 = "Hello WORLD Python!"
re.findall(r"[a-z]+", text2)

['ello', 'ython']

In [79]:
# Extract all capital letters from "Regex is FUN and POWERFUL".

text3 = "Regex is FUN and POWERFUL"
re.findall(r"[A-Z]+", text3)

['R', 'FUN', 'POWERFUL']

In [80]:
# Find all words in "Hello, World! 123" (Ignore punctuation).

text4 = "Hello, World! 123"
re.findall(r"\w+", text4)

['Hello', 'World', '123']