# Chapter 10. Proccesing text
## Notebook for Python

In [1]:
import sys
!{sys.executable} -m pip install regex pandas



In [2]:
import regex
import pandas as pd


### 10.1 Reading and cleaning text

In [3]:
text = "This is text."
print(f"text is a {type(text)} of length {len(text)}")
print(f"text[0]: '{text[0]}'")
print(f"text[5:7]: '{text[5:7]}'")
print(f"text[-1]: '{text[-1]}'")
print(f"text[-4:]: '{text[-5:]}'")

text is a <class 'str'> of length 13
text[0]: 'T'
text[5:7]: 'is'
text[-1]: '.'
text[-4:]: 'text.'


In [4]:
words = ["These", "are", "words"]
print(f"words is a {type(words)} of length {len(words)}")
print(f"words[0]: '{words[0]}'")
print(f"words[1:3]: '{words[1:3]}'")

words is a <class 'list'> of length 3
words[0]: 'These'
words[1:3]: '['are', 'words']'


In [5]:
import regex

text = "<p><b>Communication</b>    (from Latin <i>communicare</i>, meaning to share)"
# remove HTML tags:
cleaned = regex.sub(r'</?\w[^>]*>',' ', text)
# Remove punctuation:
cleaned = regex.sub(r'\p{PUNCTUATION}',' ', cleaned)
# Normalize spaces
cleaned = regex.sub(r'\s+', ' ', cleaned)
# Lowercase
cleaned = cleaned.lower()
# Strip spaces from start and end
cleaned = cleaned.strip()

print(cleaned)

communication from latin communicare meaning to share


In [6]:
import pandas as pd
tweets = pd.read_csv("http://cssbook.net/d/example_tweets.csv", index_col="id")
# identify tweets with hashtags
tweets['has_tag'] = tweets.text.str.contains(r"#\w+")
# How many at-mentions are there?
tweets['n_at'] = tweets.text.str.count(r"(^|\s)@\w+")
# Extract first url
tweets['url'] = tweets.text.str.extract(r"(https?://\S+)")
# Extract only plain text
tweets['plain2'] = (tweets.text.str.replace(r"(^|\s)(@|#|https?://)\S+", " ")
                    .replace(r"\W+", " "))

tweets

Unnamed: 0_level_0,text,has_tag,n_at,url,plain2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,RT: @john_doe https://example.com/news very in...,False,1,https://example.com/news,RT: very interesting!
2,tweet with just text,False,0,,tweet with just text
3,http://example.com/pandas #breaking #mustread,True,0,http://example.com/pandas,
4,@me and @myself #selfietime,True,2,,and


In [7]:
text = "apples, pears, oranges"
# Three ways to achieve the same thing:
items = text.split(", ")
items = regex.split(r"\p{PUNCTUATION}\s*", text)
items = regex.findall(r"\p{LETTER}+", text)
print(f"Split text into items: {items}")
joined = " & ".join(items)
print(joined)


Split text into items: ['apples', 'pears', 'oranges']
apples & pears & oranges


In [8]:
tags = tweets.text.str.extractall("(#\\w+)")
tags.merge(tweets, left_on="id", right_on="id")

Unnamed: 0_level_0,0,text,has_tag,n_at,url,plain2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,#breaking,http://example.com/pandas #breaking #mustread,True,0,http://example.com/pandas,
3,#mustread,http://example.com/pandas #breaking #mustread,True,0,http://example.com/pandas,
4,#selfietime,@me and @myself #selfietime,True,2,,and


In [9]:
words = tweets.text.str.split("\\W+")
words_long = words.explode()

In [10]:
words_long.groupby('id').agg("_".join)

id
1    RT_john_doe_https_example_com_news_very_intere...
2                                 tweet_with_just_text
3           http_example_com_pandas_breaking_mustread_
4                            _me_and_myself_selfietime
Name: text, dtype: object