In [1]:
import re 
from collections import namedtuple

# NamedTuple

## Definition
- #### **`namedtuple`** is a factory function that creates a subclass of **`tuple`** with named fields. The resulting tuple-like objects expose their items as named attributes, providing a more readable and self-explanatory way to store and access data.

In [2]:
# Build the 'Trip' record type; its two fields are 'date' and 'cost'
Trip = namedtuple('Trip', ('date', 'cost'))

# Instantiate a Trip, spelling out which value goes to which field
summer_trip = Trip(date='06-15-2024', cost='5000')

# Read a field through its attribute name
summer_trip.date

'06-15-2024'

In [3]:
# Access fields by index (like a regular tuple): position 1 is the second field, 'cost'
summer_trip[1]

'5000'

## Use Case #1
- #### Storing records in a more structured way.

In [4]:
# Record type describing one employee
Employee = namedtuple('Employee', ('name', 'id', 'department'))

# Values are supplied positionally, in field-declaration order: name, id, department
employee = Employee('Veera', 'i99999', 'Business Intelligence')

# Fields are read back by attribute name inside the f-string
print(f"Employee ID: {employee.id}, Name: {employee.name}, Position: {employee.department} ")

Employee ID: i99999, Name: Veera, Position: Business Intelligence 


## Use Case #2
- #### Storing Configuration Settings

In [5]:
# Connection settings bundled as a named-tuple record
Config = namedtuple('Config', ('host_url', 'secret_key'))

odbc_connection = Config(host_url='db2-dev1-gibberish', secret_key='dev-secret-key')
oracle_connection = Config(host_url='oracle-dev1-gibberish', secret_key='dev-secret-key')

# Compare one named field across the two configurations
oracle_connection.secret_key == odbc_connection.secret_key

True

# Tokenization Steps

1. **Regular Expression Patterns**:
   - `NAME`, `NUM`, `PLUS`, `TIMES`, `EQ`, and `WS` are regular expression patterns using named capturing groups. These patterns are used to match specific types of tokens in the input.

   a. **`NAME`**:
      - `(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)`
      - Explanation:
        - `(?P<NAME>...)` defines a named capturing group named "NAME".
        - `[a-zA-Z_]` matches a single uppercase or lowercase letter or underscore.
        - `[a-zA-Z_0-9]*` matches zero or more occurrences of letters, digits, or underscores after the first character.
        - This pattern matches ASCII variable names in the style of Python identifiers (Python itself also allows non-ASCII letters, which this pattern does not cover).

   b. **`NUM`**:
      - `(?P<NUM>\d+)`
      - Explanation:
        - `(?P<NUM>...)` defines a named capturing group named "NUM".
        - `\d` matches a digit (0-9).
        - `+` matches one or more occurrences of the preceding digit pattern.
        - This pattern matches integers (numeric values).

   c. **`PLUS`**:
      - `(?P<PLUS>\+)`
      - Explanation:
        - `(?P<PLUS>...)` defines a named capturing group named "PLUS".
        - `\+` matches the plus symbol (`+`).

   d. **`TIMES`**:
      - `(?P<TIMES>\*)`
      - Explanation:
        - `(?P<TIMES>...)` defines a named capturing group named "TIMES".
        - `\*` matches the asterisk symbol (`*`).

   e. **`EQ`**:
      - `(?P<EQ>=)`
      - Explanation:
        - `(?P<EQ>...)` defines a named capturing group named "EQ".
        - `=` matches the equal symbol (`=`).

   f. **`WS`**:
      - `(?P<WS>\s+)`
      - Explanation:
        - `(?P<WS>...)` defines a named capturing group named "WS".
        - `\s` matches any whitespace character (space, tab, newline).
        - `+` matches one or more occurrences of the preceding whitespace pattern.
        - This pattern matches whitespace.

In [6]:
# One regex fragment per token type. Each fragment wraps its pattern in a named
# group, (?P<...>), whose name is the token type.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'  # a letter or underscore, then any letters, digits, underscores
NUM = r'(?P<NUM>\d+)'  # one or more digits
PLUS = r'(?P<PLUS>\+)'  # literal '+' (escaped because '+' is a regex quantifier)
TIMES = r'(?P<TIMES>\*)'  # literal '*' (escaped because '*' is a regex quantifier)
EQ = r'(?P<EQ>=)'  # literal '='
WS = r'(?P<WS>\s+)'  # one or more whitespace characters

2. **Master Pattern**:
   - `master_pattern` is a master regular expression pattern that combines all the individual patterns using the `|` (alternation) operator. This pattern is used to match any of the individual patterns.

In [7]:
# Combine the individual token patterns with '|' so the compiled regex matches any one of them
master_pattern = re.compile(f'{NAME}|{NUM}|{PLUS}|{TIMES}|{EQ}|{WS}')
master_pattern

re.compile(r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)|(?P<NUM>\d+)|(?P<PLUS>\+)|(?P<TIMES>\*)|(?P<EQ>=)|(?P<WS>\s+)',
           re.UNICODE)

3. **Token Definition**:
   - `Token` is a named tuple that defines the structure of a token. Each token has a type (e.g., `NAME`, `NUM`) and a corresponding value (the matched substring).

In [8]:
# A token is a (type, value) pair, with both parts accessible by name
Token = namedtuple('Token', ('type', 'value'))

4. **`generate_tokens` Function**:
   - The `generate_tokens` function takes a pattern (`pattern`) and a text input (`text`).
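   - It creates a scanner from the pattern (`pattern.scanner(text)`) and repeatedly calls `scanner.match()`, which matches at the current position and then advances past the match. Scanning stops at the first position where none of the patterns match, so any text from that point on produces no tokens.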
   - For each match found, it yields a token with the match's type and value.

In [9]:
from typing import Generator

def generate_tokens(pattern: re.Pattern, text: str, *, strict: bool = False) -> Generator[Token, None, None]:
    """
    Tokenizes the input text based on the provided regular expression pattern.

    Parameters:
    pattern (re.Pattern): The compiled regular expression whose named groups define the token types.
    text (str): The input text to tokenize.
    strict (bool): Keyword-only. When False (the default), tokenization ends quietly at the
        first position where the pattern does not match, and the rest of the text is ignored.
        When True, a ValueError is raised in that situation instead.

    Yields:
    Token: A named tuple representing a token with type and value.

    Raises:
    ValueError: Only when strict is True and scanning stopped before the end of the text.

    The function walks the input with the pattern's scanner, which matches at the current
    position and then continues from the end of that match. For each match it yields a
    Token whose type is the name of the last matched named group and whose value is the
    matched text.
    """
    scanner = pattern.scanner(text)
    consumed = 0  # index just past the most recent match
    # scanner.match() returns None when nothing matches at the current position,
    # which is the sentinel that ends this iteration.
    for m in iter(scanner.match, None):
        consumed = m.end()
        yield Token(m.lastgroup, m.group())

    # Without this check, unrecognised input would be dropped without any signal.
    if strict and consumed != len(text):
        raise ValueError(
            f"Unexpected character {text[consumed]!r} at position {consumed}"
        )


# Tokenize a simple assignment; whitespace runs are emitted as WS tokens as well
for token in generate_tokens(master_pattern, 'foo = 42'):
    print(token)

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')


In [10]:
# Print only the token types for a longer sample.
# The trailing '?' matches none of the token patterns, so scanning stops there
# and no token is produced for it.
sample_text = """Why is 5*5=25?"""
for token in generate_tokens(master_pattern, text=sample_text):
    print(token.type)

NAME
WS
NAME
WS
NUM
TIMES
NUM
EQ
NUM
