In [2]:
# regex module
import re

## The findall() Function

Print a list of all matches

In [5]:
# findall() collects every non-overlapping match, in the order it appears.
string = "The rain in Spain"
x = re.findall(pattern="ai", string=string)
print(x)

['ai', 'ai']


The list contains the matches in the order they are found.

If no matches are found, an empty list is returned:

In [4]:
# When the pattern occurs nowhere in the string, findall() yields an empty list.
string = "The rain in Spain"
x = re.findall(pattern="Portugal", string=string)
print(x)

[]


## The search() Function

The search() function searches the string for a match, and returns a Match object if there is a match.

If there is more than one match, only the first occurrence of the match will be returned:

In [5]:
# Search for the first white-space character in the string.
# The pattern is written as a raw string so the backslash in \s reaches the
# regex engine unchanged instead of being parsed as an (invalid) string escape.
string = "The rain in Spain"
x = re.search(r"\s", string)
print("The first white-space character is located in position:", x.start())

The first white-space character is located in position: 3


If no matches are found, the value None is returned:

In [6]:
# A search for text that is absent from the string returns None, not a Match.
string = "The rain in Spain"
x = re.search(pattern="Portugal", string=string)
print(x)

None


## The split() Function

The split() function returns a list where the string has been split at each match:

In [7]:
# Split at each white-space character.
# Raw string: keeps the backslash of \s intact for the regex engine and avoids
# the invalid-escape-sequence warning a plain "\s" literal triggers.
string = "The rain in Spain"
x = re.split(r"\s", string)
print(x)

['The', 'rain', 'in', 'Spain']


You can control the number of occurrences by specifying the maxsplit parameter:

In [8]:
# Split the string only at the first occurrence.
# - Raw string so \s is handed to the regex engine verbatim.
# - maxsplit is passed by keyword: it documents the meaning of the 1 and avoids
#   the positional form, which newer Python versions deprecate.
string = "The rain in Spain"
x = re.split(r"\s", string, maxsplit=1)
print(x)

['The', 'rain in Spain']


## The sub() Function

The sub() function replaces the matches with the text of your choice:

In [9]:
# Replace every white-space character with the number 9.
# Raw string: the backslash of \s must reach the regex engine unchanged.
string = "The rain in Spain"
x = re.sub(r"\s", "9", string)
print(x)

The9rain9in9Spain


You can control the number of replacements by specifying the count parameter:

In [10]:
# Replace the first 2 occurrences.
# - Raw string so \s is handed to the regex engine verbatim.
# - count is passed by keyword: it makes the meaning of the 2 explicit and
#   avoids the positional form, which newer Python versions deprecate.
string = "The rain in Spain"
x = re.sub(r"\s", "9", string, count=2)
print(x)

The9rain9in Spain


## Match Object

A Match Object is an object containing information about the search and the result.

In [11]:
# Run a search that succeeds, so that x is bound to a Match object.
string = "The rain in Spain"
x = re.search(pattern="ai", string=string)

# One value per output line:
#   x          -> the Match object itself
#   x.span()   -> tuple with the start and end positions of the match
#   x.string   -> the string that was passed into the function
#   x.group()  -> the part of the string where there was a match
print(x, x.span(), x.string, x.group(), sep="\n")

<re.Match object; span=(5, 7), match='ai'>
(5, 7)
The rain in Spain
ai


In [12]:
# We can combine and create a function

def cleanText(text: str = "") -> str:
    """Reduce free text to ASCII words separated by single spaces.

    Processing order:
      1. Remove unicode escape sequences that appear as literal text
         (a backslash, the letter "u", then hexadecimal digits).
      2. Replace every run of the listed punctuation marks and symbols with a
         space, so the words on either side stay separated.
      3. Drop any remaining non-ASCII character.
      4. Collapse each run of whitespace (including newlines) into one space
         and trim both ends.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``;
            ``None`` is treated as empty text.

    Returns:
        The cleaned string; empty if nothing remains.
    """
    if text is None:
        # str(None) would otherwise inject the word "None" into the result.
        return ""

    sentence = str(text)

    # Literal escape text such as a backslash followed by "u002c".
    sentence = re.sub(r"\\u[0-9A-Fa-f]+", "", sentence)

    # Punctuation / symbols -> space. This must run BEFORE the non-ASCII filter:
    # otherwise the pound and euro signs are deleted outright and the words
    # around them are glued together instead of being separated.
    sentence = re.sub(r"[!?,;:()_\-+/*'\"#<>£$%&€=^]+", " ", sentence)

    # Whatever non-ASCII characters are left are removed without replacement.
    sentence = re.sub(r"[^\x00-\x7f]", "", sentence)

    # \s already covers newlines, so one pass normalizes all whitespace.
    return re.sub(r"\s+", " ", sentence).strip()

randomSentence = "Hi,   maybe  there$is some::::::::mistake!!"

clearSentence = cleanText(randomSentence)
clearSentence

'Hi maybe there is some mistake'

# NLTK RegEx tokenizer

We can instantiate the RegexpTokenizer class or use the simple helper function regexp_tokenize.

In [7]:
# Sample paragraph shared by the tokenizer examples that follow.
para = """Characters like periods, exclamation point and newline char are used to separate the sentences. But one drawback with split() method, that we can only use one separator at a time! So sentence tokenization won't be foolproof with split() method."""

from nltk.tokenize import RegexpTokenizer   #   Source code : https://www.nltk.org/_modules/nltk/tokenize/regexp.html

# gaps=False: the pattern describes the tokens themselves, here runs of word
# characters and apostrophes (so contractions stay in one piece).
# The pattern is a raw string so the backslash of \w is not parsed as an
# (invalid) string escape before it reaches the regex engine.
tokenizer = RegexpTokenizer(r"[\w']+", gaps=False, discard_empty=True)

print(tokenizer.tokenize(para))

['Characters', 'like', 'periods', 'exclamation', 'point', 'and', 'newline', 'char', 'are', 'used', 'to', 'separate', 'the', 'sentences', 'But', 'one', 'drawback', 'with', 'split', 'method', 'that', 'we', 'can', 'only', 'use', 'one', 'separator', 'at', 'a', 'time', 'So', 'sentence', 'tokenization', "won't", 'be', 'foolproof', 'with', 'split', 'method']


In [14]:
from nltk.tokenize import regexp_tokenize

# Helper-function form of the tokenizer: same pattern, no object to build.
# Raw string so the backslash of \w reaches the regex engine unchanged.
print(regexp_tokenize(para, r"[\w']+"))

['Characters', 'like', 'periods', 'exclamation', 'point', 'and', 'newline', 'char', 'are', 'used', 'to', 'separate', 'the', 'sentences', 'But', 'one', 'drawback', 'with', 'split', 'method', 'that', 'we', 'can', 'only', 'use', 'one', 'separator', 'at', 'a', 'time', 'So', 'sentence', 'tokenization', "won't", 'be', 'foolproof', 'with', 'split', 'method']


In [9]:
# gaps=True: the pattern describes the separators between tokens rather than
# the tokens themselves, so the text is cut on runs of whitespace.
# Raw string so the backslash of \s reaches the regex engine unchanged.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokenizer.tokenize(para)

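['Characters',
 'like',
 'periods,',
 'exclamation',
 'point',
 'and',
 'newline',
 'char',
 'are',
 'used',
 'to',
 'separate',
 'the',
 'sentences.',
 'But',
 'one',
 'drawback',
 'with',
 'split()',
 'method,',
 'that',
 'we',
 'can',
 'only',
 'use',
 'one',
 'separator',
 'at',
 'a',
 'time!',
 'So',
 'sentence',
 'tokenization',
 "won't",
 'be',
 'foolproof',
 'with',
 'split()',
 'method.']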

### Token comparison

In [10]:
import nltk

# Download the Gutenberg corpus only when it is not installed yet, so this
# cell also works on a fresh environment and stays cheap on later runs.
try:
    nltk.data.find('corpora/gutenberg')
except LookupError:
    nltk.download('gutenberg')

# Full text of the play as one string.
shakespeare_caesar = nltk.corpus.gutenberg.raw('shakespeare-caesar.txt')

# Size of the text in characters.
print(len(shakespeare_caesar))

# Short preview of the beginning of the text.
print(shakespeare_caesar[:250])

112310
[The Tragedie of Julius Caesar by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Flauius, Murellus, and certaine Commoners ouer the Stage.

  Flauius. Hence: home you idle Creatures, get you home:
Is this a Holiday? What, know you not


In [11]:
# Baseline tokenization: cut the raw text on whitespace only, so punctuation
# stays attached to the neighbouring word.
splitCaesar = shakespeare_caesar.split()

# First 50 tokens, then the total and the distinct token counts.
print(
    splitCaesar[:50],
    f"N° of split words : {len(splitCaesar)}",
    f"N° of unique split words : {len(set(splitCaesar))}",
    sep="\n",
)

['[The', 'Tragedie', 'of', 'Julius', 'Caesar', 'by', 'William', 'Shakespeare', '1599]', 'Actus', 'Primus.', 'Scoena', 'Prima.', 'Enter', 'Flauius,', 'Murellus,', 'and', 'certaine', 'Commoners', 'ouer', 'the', 'Stage.', 'Flauius.', 'Hence:', 'home', 'you', 'idle', 'Creatures,', 'get', 'you', 'home:', 'Is', 'this', 'a', 'Holiday?', 'What,', 'know', 'you', 'not', '(Being', 'Mechanicall)', 'you', 'ought', 'not', 'walke', 'Vpon', 'a', 'labouring', 'day,', 'without']
N° of split words : 20459
N° of unique split words : 4992


In [12]:
from nltk.tokenize import regexp_tokenize

# Regex tokenization: every run of ASCII letters is a token; digits and
# punctuation are left out.
regexCaesar = regexp_tokenize(shakespeare_caesar, r'[A-Za-z]+', gaps=False)

# First 50 tokens, then the total and the distinct token counts.
print(
    regexCaesar[:50],
    f"N° of regex words : {len(regexCaesar)}",
    f"N° of unique regex words : {len(set(regexCaesar))}",
    sep="\n",
)

['The', 'Tragedie', 'of', 'Julius', 'Caesar', 'by', 'William', 'Shakespeare', 'Actus', 'Primus', 'Scoena', 'Prima', 'Enter', 'Flauius', 'Murellus', 'and', 'certaine', 'Commoners', 'ouer', 'the', 'Stage', 'Flauius', 'Hence', 'home', 'you', 'idle', 'Creatures', 'get', 'you', 'home', 'Is', 'this', 'a', 'Holiday', 'What', 'know', 'you', 'not', 'Being', 'Mechanicall', 'you', 'ought', 'not', 'walke', 'Vpon', 'a', 'labouring', 'day', 'without', 'the']
N° of regex words : 20804
N° of unique regex words : 3543


In [13]:
from nltk.tokenize import word_tokenize

# NLTK's default word tokenizer, applied to the whole play.
wTokenCaesar = word_tokenize(shakespeare_caesar, preserve_line=False)

# First 50 tokens, then the total and the distinct token counts.
print(
    wTokenCaesar[:50],
    f"N° of token words : {len(wTokenCaesar)}",
    f"N° of unique token words : {len(set(wTokenCaesar))}",
    sep="\n",
)

['[', 'The', 'Tragedie', 'of', 'Julius', 'Caesar', 'by', 'William', 'Shakespeare', '1599', ']', 'Actus', 'Primus', '.', 'Scoena', 'Prima', '.', 'Enter', 'Flauius', ',', 'Murellus', ',', 'and', 'certaine', 'Commoners', 'ouer', 'the', 'Stage', '.', 'Flauius', '.', 'Hence', ':', 'home', 'you', 'idle', 'Creatures', ',', 'get', 'you', 'home', ':', 'Is', 'this', 'a', 'Holiday', '?', 'What', ',', 'know']
N° of token words : 25251
N° of unique token words : 3610
