---

_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-text-mining/resources/d9pwm) course resource._

---

# Working with Text Data in pandas

\* : wildcard, matches any single character

^ : search for start of string

$ : matches from the end of string

[] : matches one of the set of characters within

[a-z] : matches one of the range fo the characters a,b,c,...x,y,z

[^abc] : matches a character that is NOT a,b or c

a|b : matches a or b, where a and b are strings

( ) : scoping for operators

\ : Escape character for special characters (\t,\n,\b)

\b : Matches a word boundary

\d : Matches any digit, equivalent to [0-9]

\D : Matches any non-digit, equivalent to [^0-9]

\s : Any whitespace, same as [\t\n\r\f\v]

\S : Any non-whitespace, same as [^ \t\n\r\f\v]

\w : Alphanumeric character, same as [a-zA-Z0-9_]

\W : non-Alphanumeric Character, same as [^a-zA-Z0-9_]

\* : Mathces zero or more occurences

\+ : mathces one or more occurences

? : matches zero or once

{n} : matches exactly n times

{n,} : at least n repetitions

{,n} : at most n repetitions

{m,n} : at least m and at most n times

In [1]:
import pandas as pd

time_sentences = ["Monday: The doctor's appointment is at 2:45pm.", 
                  "Tuesday: The dentist's appointment is at 11:30 am.",
                  "Wednesday: At 7:00pm, there is a basketball game!",
                  "Thursday: Be back home by 11:15 pm at the latest.",
                  "Friday: Take the train at 08:10 am, arrive at 09:00am."]

df = pd.DataFrame(time_sentences, columns=['text'])
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


In [2]:
# make each row in the text column a string
df['text'].str

<pandas.core.strings.StringMethods at 0x107fb3a58>

In [3]:
df['text'].str

<pandas.core.strings.StringMethods at 0x107fb3ac8>

In [4]:
# find the number of characters for each string in df['text']
df['text'].str.len()

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64

In [5]:
# find the tokens for each string in df['text']
df['text'].str.split()

0    [Monday:, The, doctor's, appointment, is, at, ...
1    [Tuesday:, The, dentist's, appointment, is, at...
2    [Wednesday:, At, 7:00pm,, there, is, a, basket...
3    [Thursday:, Be, back, home, by, 11:15, pm, at,...
4    [Friday:, Take, the, train, at, 08:10, am,, ar...
Name: text, dtype: object

In [6]:
# find the number of tokens for each string in df['text']
for row in df['text'].str.split():
    counter = 0
    for word in row:
        counter+=1
    print(counter)

7
8
8
10
10


In [7]:
# Efficiently find the number of tokens for each string in df['text']
df['text'].str.split().str.len()

0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64

In [8]:
# find which entries contain the word 'appointment'
df['text'].str.contains('appointment')

0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

In [9]:
# find how many times a digit occurs in each string
df['text'].str.count(r'\d')

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

In [10]:
# find all occurances of the digits
df['text'].str.findall(r'\d')

0                   [2, 4, 5]
1                [1, 1, 3, 0]
2                   [7, 0, 0]
3                [1, 1, 1, 5]
4    [0, 8, 1, 0, 0, 9, 0, 0]
Name: text, dtype: object

In [11]:
# group and find the hours and minutes
df['text'].str.findall(r'(\d?\d):(\d\d)')

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

In [12]:
# replace weekdays with '???'
# r'\w+day finds all instances of alphanumeric strings that end in 'day'
df['text'].str.replace(r'\w+day\b', '???')

0          ???: The doctor's appointment is at 2:45pm.
1       ???: The dentist's appointment is at 11:30 am.
2          ???: At 7:00pm, there is a basketball game!
3         ???: Be back home by 11:15 pm at the latest.
4    ???: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [28]:
def replace_abbr(x):
    day = x[0][:3]
    return day

In [29]:
df['text'].str.findall(r'(\w+day\b)').apply(replace_abbr)#, lambda x: x.groups()[0][:3])

0    Mon
1    Tue
2    Wed
3    Thu
4    Fri
Name: text, dtype: object

In [27]:
df['text']

0       Monday: The doctor's appointment is at 2:45pm.
1    Tuesday: The dentist's appointment is at 11:30...
2    Wednesday: At 7:00pm, there is a basketball game!
3    Thursday: Be back home by 11:15 pm at the latest.
4    Friday: Take the train at 08:10 am, arrive at ...
Name: text, dtype: object

In [21]:
# replace weekdays with 3 letter abbrevations
#  re.group()[0] grabs the first instance of words that end in day
df['text'].str.replace(r'(\w+day\b)', lambda x: x.groups()[0][:3])

TypeError: repl must be a string

In [18]:
# create new columns from first match of extracted groups
# use .extract to do this.
df['text'].str.extract(r'(\d?\d):(\d\d)')

  from ipykernel import kernelapp as app


Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [32]:
df['text'].iloc[4]

'Friday: Take the train at 08:10 am, arrive at 09:00am.'

In [30]:
# extract the entire time, the hours, the minutes, and the period
df['text'].str.extractall(r'((\d?\d):(\d\d) ?([ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


In [35]:
# extract the entire time, the hours, the minutes, and the period with group names
# ?P<col_name> is used to name groups and can be used to label the column in a DF
df['text'].str.extractall(r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minute,period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


In [36]:
df['text'].str.extractall(r'(?P<fee>(?P<fie>\d?\d):(?P<foe>\d\d) ?(?P<fum>[ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,fee,fie,foe,fum
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


In [39]:
import re

In [40]:
a = "chadaboe"

In [47]:
print(re.findall(r'boe$', a))

['boe']


In [48]:
print(re.findall(r'^ch', a))

['ch']


In [49]:
# 'utf-8' is the future

 ### unicode
 - Industry Standard for encodign
 - over 128,000 characters from 130+ scripts (greek, french..)
 - can be encoded by different charaacter encpding
     -utf-8 / utf-16 / utf-32 bit encodings exists
    - 

### UTF - 8
utf = unicode transofrmation Format
    - Backward compatible with ASCII
    - only byte codes same as ASCII
    - Dominant Character Encoding for the Web
    - Default in Python 3
    - in python2 use: coding:utf-8 at start of script
    - or in the intepreter use: u'string'

In [58]:
a = 'abcxyz'
re.findall(r'xyz',a)

['xyz']