# Chapter 10. Proccesing text
## Notebook for Python

In [1]:
!pip3 install regex pandas



In [2]:
import regex
import re
import pandas as pd


### 10.1 Reading and cleaning text

In [3]:
text = "This is text."
print(f"type(text): {type(text)}")
print(f"len(text): {len(text)}")
print(f"text[0]: '{text[0]}'")
print(f"text[5:7]: '{text[5:7]}'")
print(f"text[-1]: '{text[-1]}'")
print(f"text[-4:]: '{text[-5:]}'")

type(text): <class 'str'>
len(text): 13
text[0]: 'T'
text[5:7]: 'is'
text[-1]: '.'
text[-4:]: 'text.'


In [4]:
words = ["These", "are", "words"]
print(f"type(words): {type(words)}")
print(f"len(words): {len(words)}")
print(f"words[0]: '{words[0]}'")
print(f"words[1:3]: '{words[1:3]}'")

type(words): <class 'list'>
len(words): 3
words[0]: 'These'
words[1:3]: '['are', 'words']'


In [5]:
text = """   <b>Communication</b>    
    (from Latin communicare, meaning to share) """
# remove tags:
cleaned=text.replace("<b>","").replace("</b>","")
# normalize white space
cleaned = " ".join(cleaned.split())
# lower case
cleaned = cleaned.lower()
# trim spaces from start and end
cleaned = cleaned.strip()

print(cleaned)

communication (from latin communicare, meaning to share)


In [6]:
text = """   <b>Communication</b>    
    (from Latin communicare, meaning to share) """
# remove tags:
cleaned = re.sub("<[^>]+>", "", text)
# normalize white space
cleaned = re.sub("\s+", " ", cleaned)
# trim spaces from start and end
cleaned = re.sub("^\s+|\s+$", "", cleaned)
cleaned = cleaned.strip()

print(cleaned)

Communication (from Latin communicare, meaning to share)


In [7]:
import pandas as pd
url = "https://cssbook.net/d/example_tweets.csv"
tweets = pd.read_csv(url, index_col="id")
# identify tweets with hashtags
tweets["tag"]=tweets.text.str.contains(r"#\w+")
# How many at-mentions are there?
tweets["at"]=tweets.text.str.count(r"(^|\s)@\w+")
# Extract first url
tweets["url"]=tweets.text.str.extract(
    r"(https?://\S+)")
# Remove urls, tags, and @-mentions
expr=r"(^|\s)(@|#|https?://)\S+"
tweets["plain2"]=(tweets.text.str
                  .replace(expr, " ", regex=True)
                  .replace(r"\W+", " "))
tweets

Unnamed: 0_level_0,text,tag,at,url,plain2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,RT: @john_doe https://example.com/news very in...,False,1,https://example.com/news,RT: very interesting!
2,tweet with just text,False,0,,tweet with just text
3,http://example.com/pandas #breaking #mustread,True,0,http://example.com/pandas,
4,@me and @myself #selfietime,True,2,,and


In [8]:
text = "apples, pears, oranges"
# Three ways to achieve the same thing:
items = text.split(", ")
items = regex.split(r"\p{PUNCTUATION}\s*", text)
items = regex.findall(r"\p{LETTER}+", text)
print(f"Split text into items: {items}")
joined = " & ".join(items)
print(joined)


Split text into items: ['apples', 'pears', 'oranges']
apples & pears & oranges


In [9]:
tags = tweets.text.str.extractall("(#\\w+)")
tags.merge(tweets, left_on="id", right_on="id")

Unnamed: 0_level_0,0,text,tag,at,url,plain2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,#breaking,http://example.com/pandas #breaking #mustread,True,0,http://example.com/pandas,
3,#mustread,http://example.com/pandas #breaking #mustread,True,0,http://example.com/pandas,
4,#selfietime,@me and @myself #selfietime,True,2,,and


In [10]:
words = tweets.text.str.split("\\W+")
words_long = words.explode()

In [11]:
words_long.groupby("id").agg("_".join)

id
1    RT_john_doe_https_example_com_news_very_intere...
2                                 tweet_with_just_text
3           http_example_com_pandas_breaking_mustread_
4                            _me_and_myself_selfietime
Name: text, dtype: object