In [None]:
import pandas as pd
import numpy as np

In [2]:
name = "Praxis"

In [3]:
%%time
print("We are at {var}.".format(var=name))

We are at Praxis.
Wall time: 1e+03 µs


In [4]:
%%time
print(f"We are at {name}.")
#This one is faster

We are at Praxis.
Wall time: 0 ns


In [5]:
#Proper way to use quotes inside quotes.
print(f"We are at {name!r}")

We are at 'Praxis'


In [6]:
d = {'a':123,'b':456}

In [7]:
print(f"The value of a is {d['a']}.")

The value of a is 123.


## Minimum widths, Alignment and padding

In [8]:
library = [('Auther','Topic','Pages'),('Twain','Rafting',683),('Feyman','Physics',89),('Hamilton','Mythology',155)]

In [9]:
for book in library:
    print(f"{book[0]:{10}} {book[1]:{8}} {book[2]:{7}}")

Auther     Topic    Pages  
Twain      Rafting      683
Feyman     Physics       89
Hamilton   Mythology     155


In [10]:
for book in library:
    print(f"{book[0]:{10}} {book[1]:{9}} {book[2]:{7}}")

Auther     Topic     Pages  
Twain      Rafting       683
Feyman     Physics        89
Hamilton   Mythology     155


In [11]:
# < : Left alignment
# > : Right alignment
# ^ : Center alignment
for book in library:
    print(f"{book[0]:{10}} {book[1]:{9}} {book[2]:>{7}}")

Auther     Topic       Pages
Twain      Rafting       683
Feyman     Physics        89
Hamilton   Mythology     155


In [12]:
# -,. : Padding types
for book in library:
    print(f"{book[0]:{10}} {book[1]:{9}} {book[2]:,>{7}}")

Auther     Topic     ,,Pages
Twain      Rafting   ,,,,683
Feyman     Physics   ,,,,,89
Hamilton   Mythology ,,,,155


## Regular Expression:

In [13]:
text = "The agents phone number is 772-432-7865. Call soon!"

In [14]:
"phone" in text

True

In [15]:
#Regular expression library
import re

In [16]:
pattern = "Not available in text."
re.search(pattern,text)

In [17]:
pattern = "phone"
match = re.search(pattern,text)

In [18]:
#start of string, end of string to be found
match.span()

(11, 16)

In [19]:
#start of string to be found
match.start()

11

In [20]:
#end of string to be found
match.end()

16

In [21]:
text = "my phone is a new phone"
match = re.search("phone", text)

In [22]:
match.span()

(3, 8)

In [23]:
#All occurances of string to be found
matches = re.findall("phone", text)
matches

['phone', 'phone']

In [24]:
#Iterating through the entire text and find all spans.
for match in re.finditer("phone",text):
    print(match.span())

(3, 8)
(18, 23)


In [25]:
match.group()

'phone'

## Patterns

In [26]:
text = "My telephone number is 772-432-7865. Call soon!"

In [27]:
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d',text)
phone.group() #To find what is being matched.

'772-432-7865'

## Quantifiers
\d - single digit

\D - single anything else but digit

\w - (lowercase w) matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_].

\W - single anything else but alphanumeric

\s - Matches Unicode whitespace characters (which includes [ \t\n\r\f\v], and also many other characters, for example the non-breaking spaces.

\S - single anything else but white space.

In [28]:
text = "My telephone number is 772-432-7865. Call soon on 772-432-7866!"

In [29]:
phone = re.search(r'\d{3}-\d{3}-\d{4}',text)
phone.group() #To find what is being matched.

'772-432-7865'

In [30]:
#Iterating through the entire text and find all spans.
text = "My telephone number is 772-432-7865. Call soon on 772-432-7865!"
phone = re.search(r'\d{3}-\d{3}-\d{4}',text)
for match in re.finditer('772-432-7865',text):
    print(match.span())

(23, 35)
(50, 62)


## Groups:

In [31]:
text = "My telephone number is 772-432-7865. Call soon!"
phone = re.compile(r'(\d{3})-(\d{3})-(\d{4})')
result = re.search(phone,text)
result

<re.Match object; span=(23, 35), match='772-432-7865'>

In [32]:
result[1]

'772'

In [33]:
result[2]

'432'

In [34]:
result[3]

'7865'

In [35]:
result[0]

'772-432-7865'

## Or operator

In [36]:
text1 = "This man was here."
text2 = "This woman was here."

In [37]:
re.search(r"man|woman",text1)

<re.Match object; span=(5, 8), match='man'>

In [38]:
re.search(r"man|woman",text2)

<re.Match object; span=(5, 10), match='woman'>

## Wild card character

In [39]:
text = "Cat in the hat went on a mat and splat there."

In [40]:
re.findall(r".at",text)

['Cat', 'hat', 'mat', 'lat']

In [41]:
re.findall(r"...at",text)

['e hat', 'a mat', 'splat']

In [42]:
re.findall(r'\S+at',text)

['Cat', 'hat', 'mat', 'splat']

## Start and end signals:
^ - Start

$ - End

In [43]:
text = f"this number ends with 9"
re.findall(r'\d$',text)

['9']

In [44]:
text = f"this number ends with 9."
re.findall(r'\d$',text)

[]

In [47]:
text = f"this number ends with 9."
re.findall(r'\D$',text)

['.']

In [45]:
text = f"1 is the first number."
re.findall(r'^\d',text)

['1']

In [48]:
text = f" is the first number."
re.findall(r'^\W',text)

[' ']

In [49]:
text = f"a is the first number."
re.findall(r'^\w',text)

['a']

In [54]:
text = f".a is the first number."
re.findall(r'^\D',text)

['.']

## Exclusion:
[^pattern]

In [67]:
phrase = f"There are few numbers in this sentence such as 1 2 3 and many more counting."
print(re.findall(r'\d',phrase))

['1', '2', '3']


In [58]:
print(re.findall(r'[^\d]',phrase))

['T', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ', 'f', 'e', 'w', ' ', 'n', 'u', 'm', 'b', 'e', 'r', 's', ' ', 'i', 'n', ' ', 't', 'h', 'i', 's', ' ', 's', 'e', 'n', 't', 'e', 'n', 'c', 'e', ' ', 's', 'u', 'c', 'h', ' ', 'a', 's', ' ', ' ', ' ', ' ', 'a', 'n', 'd', ' ', 'm', 'a', 'n', 'y', ' ', 'm', 'o', 'r', 'e', ' ', 'c', 'o', 'u', 'n', 't', 'i', 'n', 'g', '.']


In [61]:
print(re.findall(r'[^\d]+',phrase))

['There are few numbers in this sentence such as ', ' ', ' ', ' and many more counting.']


In [56]:
print(re.findall(r'\D',phrase))

['T', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ', 'f', 'e', 'w', ' ', 'n', 'u', 'm', 'b', 'e', 'r', 's', ' ', 'i', 'n', ' ', 't', 'h', 'i', 's', ' ', 's', 'e', 'n', 't', 'e', 'n', 'c', 'e', ' ', 's', 'u', 'c', 'h', ' ', 'a', 's', ' ', ' ', ' ', ' ', 'a', 'n', 'd', ' ', 'm', 'a', 'n', 'y', ' ', 'm', 'o', 'r', 'e', ' ', 'c', 'o', 'u', 'n', 't', 'i', 'n', 'g', '.']


In [57]:
print(re.findall(r'\D+',phrase))

['There are few numbers in this sentence such as ', ' ', ' ', ' and many more counting.']


In [68]:
tp = f"This is a string! But it has punctuations. How to remove it?"
print(re.findall(r'[^!.?]+',tp))

['This is a string', ' But it has punctuations', ' How to remove it']


In [69]:
print(re.findall('[^!.? ]+',tp))

['This', 'is', 'a', 'string', 'But', 'it', 'has', 'punctuations', 'How', 'to', 'remove', 'it']


## Challenges:

In [88]:
text = "Let's find the hyphen-words in the sentence. And there can be many long-ish words in the text."

In [89]:
# extract the hyphen words from text
re.findall(r'\S+-+\w+\S',text)

['hyphen-words', 'long-ish']

In [93]:
re.findall(r'\w+-+\w+',text)

['hyphen-words', 'long-ish']

In [98]:
# extract all the email ids from the below text
text = """For accounting contact : abc@yourcompany.com,
            for bonus contact: hr@gmail.com,
            for increment more than x15 times contact : vp@gmail.com"""

In [100]:
re.findall(r'\S+@+\w+\S+\w',text)

['abc@yourcompany.com', 'hr@gmail.com', 'vp@gmail.com']

In [102]:
re.findall(r'\w+@\w+.\w+',text)

['abc@yourcompany.com', 'hr@gmail.com', 'vp@gmail.com']

## Multiple options:

In [122]:
txt1 = "Hello, do you like catfish?"
txt2 = "Would you like to take catnap?"
txt3 = "Have you seen this caterpillar?"

In [104]:
re.search("cat(fish|nap|claw)", txt1)

<re.Match object; span=(19, 26), match='catfish'>

In [105]:
re.search("cat(fish|nap|claw)", txt2)

<re.Match object; span=(23, 29), match='catnap'>

In [106]:
re.search("cat(fish|nap|claw)", txt3)

In [124]:
re.search("cat([^erpillar])+\w+", txt1)

<re.Match object; span=(19, 26), match='catfish'>

In [125]:
re.search("cat([^erpillar])+\w+", txt2)

<re.Match object; span=(23, 29), match='catnap'>

In [126]:
re.search("cat([^(fish|erpillar)])+\w+", txt1)

In [127]:
re.search("cat([^(fish|erpillar)])+\w+", txt2)

<re.Match object; span=(23, 29), match='catnap'>

## NLTK, Spacy:

In [145]:
#!pip install nltk

In [131]:
import nltk

In [135]:
#!pip install spacy

In [133]:
import spacy

## Tokanization:

In [137]:
text = f"Matches the empty string, but only when it is not at the beginning or end of a word. This means that r'py\B' matches 'python', 'py3', 'py2', but not 'py', 'py.', or 'py!'. \B is just the opposite of \b, so word characters in Unicode patterns are Unicode alphanumerics or the underscore? although this can be changed by using the ASCII flag. Word boundaries are determined by the current locale if the LOCALE flag is used."

In [1]:
nltk.download('all')

In [141]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [147]:
sent_tokenize(text)

['Matches the empty string, but only when it is not at the beginning or end of a word.',
 "This means that r'py\\B' matches 'python', 'py3', 'py2', but not 'py', 'py.",
 "', or 'py!'.",
 '\\B is just the opposite of \x08, so word characters in Unicode patterns are Unicode alphanumerics or the underscore, although this can be changed by using the ASCII flag.',
 'Word boundaries are determined by the current locale if the LOCALE flag is used.']

In [151]:
word_token = word_tokenize(text)
word_token

['Matches',
 'the',
 'empty',
 'string',
 ',',
 'but',
 'only',
 'when',
 'it',
 'is',
 'not',
 'at',
 'the',
 'beginning',
 'or',
 'end',
 'of',
 'a',
 'word',
 '.',
 'This',
 'means',
 'that',
 "r'py\\B",
 "'",
 'matches',
 "'python",
 "'",
 ',',
 "'py3",
 "'",
 ',',
 "'py2",
 "'",
 ',',
 'but',
 'not',
 "'py",
 "'",
 ',',
 "'py",
 '.',
 "'",
 ',',
 'or',
 "'py",
 '!',
 "'",
 '.',
 '\\B',
 'is',
 'just',
 'the',
 'opposite',
 'of',
 '\x08',
 ',',
 'so',
 'word',
 'characters',
 'in',
 'Unicode',
 'patterns',
 'are',
 'Unicode',
 'alphanumerics',
 'or',
 'the',
 'underscore',
 ',',
 'although',
 'this',
 'can',
 'be',
 'changed',
 'by',
 'using',
 'the',
 'ASCII',
 'flag',
 '.',
 'Word',
 'boundaries',
 'are',
 'determined',
 'by',
 'the',
 'current',
 'locale',
 'if',
 'the',
 'LOCALE',
 'flag',
 'is',
 'used',
 '.']

## Stop words

In [149]:
from nltk.corpus import stopwords

In [155]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [154]:
clean_WT = [word for word in word_token if word not in stopwords.words('english')]
print(clean_WT)

['Matches', 'empty', 'string', ',', 'beginning', 'end', 'word', '.', 'This', 'means', "r'py\\B", "'", 'matches', "'python", "'", ',', "'py3", "'", ',', "'py2", "'", ',', "'py", "'", ',', "'py", '.', "'", ',', "'py", '!', "'", '.', '\\B', 'opposite', '\x08', ',', 'word', 'characters', 'Unicode', 'patterns', 'Unicode', 'alphanumerics', 'underscore', ',', 'although', 'changed', 'using', 'ASCII', 'flag', '.', 'Word', 'boundaries', 'determined', 'current', 'locale', 'LOCALE', 'flag', 'used', '.']
