## Regular Expressions

In [2]:
#Regex are strings with a special syntax
#Allows us to match patterns in other strings
#Applications of regex: Find all web links in a document, parse email addresses, remove / replace unwanted characters etc.

In [1]:
import re

In [3]:
#Signature: re.match(pattern, string) - the pattern is tried at the start of the string
re.match(pattern='abc', string='abcdef')

<re.Match object; span=(0, 3), match='abc'>

In [4]:
#special patterns
#Raw string (r prefix) so the backslash in \w reaches the regex engine instead of
#being treated as an (invalid) string escape, which triggers a warning in Python
word_regex = r'\w+'
re.match(word_regex, 'hi there!') #matches first word in a sentence

<re.Match object; span=(0, 2), match='hi'>

In [5]:
#Common regex patterns (there are 100s of them and these are just a few common ones)
# \w+  word (\w will just pull individual characters and \w+ will pull word)
# \d   digit
# \s   space
# .*   wildcard (. matches any single character except a newline; .* matches any run of them)
# + or * greedy match (matches repeats of a single letter or of whole patterns)
# \S   not space
# []   create a group of characters eg. [a-z] lowercase group

In [6]:
#Python re module
#re: module
#split: split a string on regex
#findall: find all patterns in a string
#search: search for a pattern
#match: match a pattern at the beginning of a string

#pass pattern first and then string
#may return an iterator, string or match object

In [7]:
#Split wherever one or more whitespace characters occur
#(raw string keeps \s from being read as an invalid string escape)
re.split(r'\s+', 'Split on spaces.')

['Split', 'on', 'spaces.']

In [10]:
#Pull out every run of word characters; the apostrophe is not a word character,
#so "Let's" comes back as two separate pieces
re.findall(pattern=r"\w+", string="Let's wrtie RegEx!")

['Let', 's', 'wrtie', 'RegEx']

In [11]:
#It is important to prefix regex patterns with r so that the patterns are interpreted correctly,
#especially when they contain escape sequences
#Eg: "\n" in a normal Python string means a new line, but with the r prefix it is treated as a raw string,
#that is, the character "\" followed by the character "n" and not a new line

In [12]:
#Sample text shared by the examples that follow (sentences are separated by two spaces)
my_string = (
    "Let's write RegEx!  "
    "Won't that be fun?  "
    "I sure think so.  "
    "Can you find 4 sentences?  "
    "Or perhaps, all 19 words?"
)

#Break the text apart at every '.', '?' or '!' character
re.split(pattern=r"[.?!]", string=my_string)

["Let's write RegEx",
 "  Won't that be fun",
 '  I sure think so',
 '  Can you find 4 sentences',
 '  Or perhaps, all 19 words',
 '']

In [13]:
#Find words that start with a capital letter
#(\w+ needs at least one more word character after the capital, so the one-letter word "I" is not returned)
re.findall(pattern=r"[A-Z]\w+", string=my_string)

['Let', 'RegEx', 'Won', 'Can', 'Or']

In [14]:
#Break my_string apart at every run of one or more whitespace characters
re.split(pattern=r"\s+", string=my_string)

["Let's",
 'write',
 'RegEx!',
 "Won't",
 'that',
 'be',
 'fun?',
 'I',
 'sure',
 'think',
 'so.',
 'Can',
 'you',
 'find',
 '4',
 'sentences?',
 'Or',
 'perhaps,',
 'all',
 '19',
 'words?']

In [23]:
#Collect every run of consecutive digits in my_string
re.findall(pattern=r"\d+", string=my_string)

['4', '19']

In [33]:
#re.match() versus re.search()
#If the pattern sits at the very start of the string, the two give identical results
#Eg:
re.match(pattern='abc', string='abcde') #result: 'abc'
re.search(pattern='abc', string='abcde') #result: 'abc'

#match only tries the pattern at the start of the string, whereas search scans through the whole string
#Eg:
re.match(pattern='cd', string='abcde') #no match, so None is returned
re.search(pattern='cd', string='abcde') #result: 'cd'

#Rule of thumb: use match for a pattern that must be at the start of the string,
#and search for a pattern that may appear anywhere in it

<re.Match object; span=(2, 4), match='cd'>

## Tokenization

In [17]:
#Turning a string or document into tokens (smaller chunks)
#One step in preparing a text for NLP
#Many different theories and rules
#You can create your own rules using regular expressions
#Examples:
# 1. Breaking out words or sentences
# 2. Separating punctuation
# 3. Separating all hashtags in a tweet

#Common library: nltk (natural language toolkit)

In [19]:
#One-time setup: the tokenizer cell below raised an error asking for the "punkt" resource,
#so it is downloaded here (presumably this can be skipped once punkt is already available locally)
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bansal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [20]:
#word_tokenize splits a string into word tokens and punctuation tokens
from nltk.tokenize import word_tokenize
word_tokenize("Hi there!")

['Hi', 'there', '!']

In [21]:
#Why tokenize?
#Easier to map part of speech
#Matching common words
#Removing unwanted tokens

In [22]:
#Other nltk tokenizers
#sent_tokenize: tokenize a document into sentences
#regexp_tokenize: tokenize a string or document based on a regex pattern
#TweetTokenizer: special class just for tweet tokenization, 
#allowing you to separate hashtags, mentions and lots of exclamation points

In [34]:
from nltk.tokenize import sent_tokenize, word_tokenize

#Break my_string up into its individual sentences
sentences = sent_tokenize(my_string)
sentences

["Let's write RegEx!",
 "Won't that be fun?",
 'I sure think so.',
 'Can you find 4 sentences?',
 'Or perhaps, all 19 words?']

In [35]:
#Tokenize the 4th sentence into words (lists are 0-indexed, so the 4th sentence sits at index 3)
tokenize_sent = word_tokenize(sentences[3])
tokenize_sent

['Can', 'you', 'find', '4', 'sentences', '?']

In [37]:
#Make a set of unique tokens in the entire my_string
#(a set keeps each distinct token once; note in the output that contractions are split, e.g. "Won't" -> 'Wo' + "n't")
unique_tokens = set(word_tokenize(my_string))
unique_tokens

{'!',
 "'s",
 ',',
 '.',
 '19',
 '4',
 '?',
 'Can',
 'I',
 'Let',
 'Or',
 'RegEx',
 'Wo',
 'all',
 'be',
 'find',
 'fun',
 "n't",
 'perhaps',
 'sentences',
 'so',
 'sure',
 'that',
 'think',
 'words',
 'write',
 'you'}