# BASIC TEXT WORKS

In [1]:
# Sample sentence; the literal ends with a trailing space, which len() counts too.
text1 = (
    "Ethics are built right into the ideals "
    "and objectives of the United Nations "
)

len(text1)  # number of characters in text1, spaces included

76

In [2]:
# Split text1 on single spaces. Because text1 ends with a space, the last
# element of the resulting list is an empty string.
text2 = text1.split(sep=' ')

len(text2)  # number of pieces produced by the split

14

In [3]:
# Build a new list with a list comprehension:
# keep only the words in text2 that are longer than 3 characters.
[word for word in text2 if len(word) > 3]

['Ethics',
 'built',
 'right',
 'into',
 'ideals',
 'objectives',
 'United',
 'Nations']

In [4]:
# Split a short phrase into its words (repeats are still present at this point).
text3 = 'To be or not to be'
text4 = text3.split(sep=' ')

len(text4)  # total number of words, repeats included

6

In [5]:
# Converting the list to a set drops repeated words. The comparison is
# case-sensitive, so 'To' and 'to' are kept as two distinct entries.
unique_words = set(text4)
len(unique_words)

5

# REGEX AND REGEX WITH PANDAS

## REGEX

In [6]:
import re  # regular-expression support from the standard library

# A tweet-like string; the two adjacent literals are joined into a single string.
text7 = (
    '@UN @UN_Women "Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
text8 = text7.split(' ')

# Keep the tokens containing '@' followed by at least one letter, digit or
# underscore; the bare '@' token does not match.
[token for token in text8 if re.search('@[A-Za-z0-9_]+', token)]

['@UN', '@UN_Women']

## Regex with Pandas and Named Groups

In [7]:
import pandas as pd

# One sentence per weekday, each mentioning one or more clock times.
time_sentences = [
    "Monday: The doctor's appointment is at 2:45pm.",
    "Tuesday: The dentist's appointment is at 11:30 am.",
    "Wednesday: At 7:00pm, there is a basketball game!",
    "Thursday: Be back home by 11:15 pm at the latest.",
    "Friday: Take the train at 08:10 am, arrive at 09:00am.",
]

# Single-column DataFrame holding the sentences under the name 'text'.
df = pd.DataFrame({'text': time_sentences})
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


In [8]:
# Character count of every sentence in the 'text' column.
df["text"].str.len()

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64

In [9]:
# Number of whitespace-separated tokens in each sentence.
(
    df["text"]
    .str.split()  # split each sentence on runs of whitespace
    .str.len()    # length of each resulting token list
)

0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64

In [10]:
# Flag the rows whose sentence contains the word "appointment".
df["text"].str.contains(pat="appointment")

0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

In [11]:
# Count how many times a pattern occurs in each sentence; here the pattern is a single digit.
# The pattern is a raw string so the backslash in \d reaches the regex engine
# as written instead of going through Python's string-escape processing.
df["text"].str.count(r"\d")

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

In [12]:
# Find all occurrences of a digit in each sentence (one list of matches per row).
# Raw string: keeps \d as a regex token rather than a Python string escape.
df["text"].str.findall(r"\d")

0                   [2, 4, 5]
1                [1, 1, 3, 0]
2                   [7, 0, 0]
3                [1, 1, 1, 5]
4    [0, 8, 1, 0, 0, 9, 0, 0]
Name: text, dtype: object

In [13]:
# Find all occurrences of "digit:digit". With two capture groups, each match is
# returned as a tuple; only the digits directly adjacent to the colon are captured.
# Raw string: keeps \d as a regex token rather than a Python string escape.
df["text"].str.findall(r"(\d):(\d)")

0            [(2, 4)]
1            [(1, 3)]
2            [(7, 0)]
3            [(1, 1)]
4    [(8, 1), (9, 0)]
Name: text, dtype: object

In [14]:
# Replace each weekday name (a run of word characters ending in "day") with ???
# Raw string: keeps \w as a regex token rather than a Python string escape.
df["text"].str.replace(r"(\w+day)", "???", regex=True)

0          ???: The doctor's appointment is at 2:45pm.
1       ???: The dentist's appointment is at 11:30 am.
2          ???: At 7:00pm, there is a basketball game!
3         ???: Be back home by 11:15 pm at the latest.
4    ???: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [15]:
# Replace weekday names with 3-letter abbreviations.
# The callable receives the regex match object for each weekday found;
# group 1 is the captured weekday name, sliced to its first three characters.
# Raw string: keeps \w as a regex token rather than a Python string escape.
df['text'].str.replace(r'(\w+day)', lambda match: match.group(1)[:3], regex=True)

0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2          Wed: At 7:00pm, there is a basketball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [16]:
# Create one new column per capture group (hours, minutes) using extract.
# extract only uses the first match in each sentence; later matches are ignored.
# Raw string: keeps \d as a regex token rather than a Python string escape.
df['text'].str.extract(r'(\d?\d):(\d\d)')

Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [17]:
# Extract every time found in each sentence: the entire time, the hours,
# the minutes, and the period (am/pm), one row per match.
# Raw string: keeps \d as a regex token rather than a Python string escape.
df['text'].str.extractall(r'((\d?\d):(\d\d) ?([ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


In [18]:
# Same extraction as with unnamed groups, but each capture group is named
# (time, hour, minute, period) so the result columns carry those names.
df['text'].str.extractall(
    r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))'
)

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minute,period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am
