# Regex/Text Preprocessing Practice for Austin
### Basic Substitutions with re.sub

In [37]:
import re

text = "<p>    This is a paragraph</p>" 

In [38]:
#removing html tags from text

#format  re.sub(pattern, replacement_text, input)
cleaned_text = re.sub(r'</?p>', '', text)
#removing extra whitespace from text
cleaner_text = re.sub('\s{4}','', cleaned_text)
print(text)
print(cleaned_text)
print(cleaner_text)

<p>    This is a paragraph</p>
    This is a paragraph
This is a paragraph


In [39]:
headline_one = '<h1>Nation\'s Top Pseudoscientists Harness High-Energy Quartz Crystal Capable Of Reversing Effects Of Being Gemini</h1>'
headline_no_tag = re.sub(r'</?h1>', '', headline_one)
print(headline_one)
print(headline_no_tag)

<h1>Nation's Top Pseudoscientists Harness High-Energy Quartz Crystal Capable Of Reversing Effects Of Being Gemini</h1>
Nation's Top Pseudoscientists Harness High-Energy Quartz Crystal Capable Of Reversing Effects Of Being Gemini


In [40]:
tweet = '@fat_meats, veggies are better than you think.'
tweet_no_at = re.sub(r'@', '', tweet)
print(tweet)
print(tweet_no_at)

@fat_meats, veggies are better than you think.
fat_meats, veggies are better than you think.


### Tokenization

In [44]:
from nltk.tokenize import word_tokenize, sent_tokenize

ecg_text = 'An electrocardiogram is used to record the electrical conduction through a person\'s heart. The readings can be used to diagnose cardiac arrhythmias.'

wt_ecg_text = word_tokenize(ecg_text)
st_ecg_text = sent_tokenize(ecg_text)

print("word tokenized text:")
print(wt_ecg_text)
print("\nsentence tokenized text:")
print(st_ecg_text)

word tokenized text:
['An', 'electrocardiogram', 'is', 'used', 'to', 'record', 'the', 'electrical', 'conduction', 'through', 'a', 'person', "'s", 'heart', '.', 'The', 'readings', 'can', 'be', 'used', 'to', 'diagnose', 'cardiac', 'arrhythmias', '.']

sentence tokenized text:
["An electrocardiogram is used to record the electrical conduction through a person's heart.", 'The readings can be used to diagnose cardiac arrhythmias.']


### Normalization
Normalization is a general catch-all term for some preprocessing tasks including:
- changing case
- stopword removal
- stemming (removing prefixes and suffixes)
- lemmatizing (changing a word to its root form)

In [45]:
# changing case

brands = 'Salvation Army, YMCA, Boys & Girls Club of America'

brands_lower = brands.lower()
brands_upper = brands.upper()

In [51]:
# stopword removal

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

survey_text = 'A YouGov study found that American\'s like Italian food more than any other country\'s cuisine.'

stop_words = set(stopwords.words('english'))

# creating tokenized survey text and then filtering stop words from the tokenized text
tokenized_text = word_tokenize(survey_text)
filtered_text = [word for word in tokenized_text if word not in stop_words]

print(filtered_text)

['A', 'YouGov', 'study', 'found', 'American', "'s", 'like', 'Italian', 'food', 'country', "'s", 'cuisine', '.']


In [53]:
# stemming (removing prefixes and suffixes)

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

populated_island = 'Java is an Indonesian island in the Pacific Ocean. It is the most populated island in the world, with over 140 million people.'

# instantiating stemmer object
stemmer = PorterStemmer()

# tokenizing text
tokenized_text = word_tokenize(populated_island)

# using our stemmer to stem the text
stemmed_text = [stemmer.stem(word) for word in tokenized_text]

print(stemmed_text)

['java', 'is', 'an', 'indonesian', 'island', 'in', 'the', 'pacif', 'ocean', '.', 'it', 'is', 'the', 'most', 'popul', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'peopl', '.']


In [57]:
# lemmatizing (changing a word to its root form)
# (without part of speech tagging first, see below for lemmatization with pos tagging)

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

populated_island = 'Java is an Indonesian island in the Pacific Ocean. It is the most populated island in the world, with over 140 million people.'

lemmatizer = WordNetLemmatizer()

tokenized_text = word_tokenize(populated_island)

lemmatized_text = [lemmatizer.lemmatize(word) for word in tokenized_text]

print(lemmatized_text)

######################
# Note how this isn't very useful -- it just changes a little bit
# that's because we need to use part of speech tagging
######################

['Java', 'is', 'an', 'Indonesian', 'island', 'in', 'the', 'Pacific', 'Ocean', '.', 'It', 'is', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'people', '.']


In [59]:
# Part of Speech Tagging function

import nltk
from nltk.corpus import wordnet
from collections import Counter

def get_part_of_speech(word):
  probable_part_of_speech = wordnet.synsets(word)
  
  pos_counts = Counter()

  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

In [63]:
# lemmatizing WITH part of speech tagging
# combining previous two steps to improve our lemmatization

pos_lemmatized = [lemmatizer.lemmatize(word, get_part_of_speech(word)) for word in tokenized_text]

print(pos_lemmatized)

['Java', 'be', 'an', 'Indonesian', 'island', 'in', 'the', 'Pacific', 'Ocean', '.', 'It', 'be', 'the', 'most', 'populate', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'people', '.']
