## **Regular expression**

In [None]:
import re

In [None]:
# Extract a US ZIP code (five digits, optionally followed by "-" and four more
# digits) anchored at the end of the address string.
address = "University of Pennsylvania, Philadelphia, PA 19104"
# The pattern is a raw string so that `\d` reaches the regex engine untouched;
# in a plain string literal `\d` is an invalid escape sequence and newer Python
# versions emit a warning for it.
zip_match = re.search(r'\d{5}(-\d{4})?$', address)
# re.search returns None when nothing matches, so guard before calling .group().
zip_code = zip_match.group() if zip_match else None
zip_code

## **Text normalization**

- Lemmatization

In [None]:
import spacy

# Name of the spaCy pipeline used by the lemmatization cells that follow.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Sample passage for the lemmatization demo; the adjacent string literals are
# concatenated into a single string.
raw_text = (
    "The Martian is a 2011 science fiction debut novel written by Andy Weir. "
    "The book was originally self-published on Weir's blog, in a serialized format. "
    "In 2014, the book was re-released after Crown Publishing Group purchased the exclusive publishing rights."
)
# Run the spaCy pipeline over the passage; `text` holds the pipeline output,
# whose tokens the next cell iterates over.
text = nlp(raw_text)

In [None]:
# Show the lemma of every token, separated by single spaces
# (for example: is -> be, written -> write).
print(*(tok.lemma_ for tok in text), sep=" ")

In [None]:
# Strip the HTML tags from a snippet, keeping only its text content.
from bs4 import BeautifulSoup

text = (
    "<p><b>The Martian</b> is a 2011 science fiction debut novel written by Andy Weir.</p> "
    "<p>The book was originally self-published on Weir's blog, in a serialized format.</p> "
    "<p>In 2014, the book was re-released after Crown Publishing Group purchased the exclusive publishing rights.</p>"
)
# Parse with Python's built-in HTML parser, then collect the text of all nodes.
soup = BeautifulSoup(text, "html.parser")
outtext = soup.get_text()
print(outtext)

## **Sentiment analysis**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# URL of the CSV file "Sentiment Analysis Dataset 100000.csv" in the
# vineetdhanawat/twitter-sentiment-analysis GitHub repository (raw view).
# It is passed to pd.read_csv in the next cell, so loading requires network access.
path = "https://raw.githubusercontent.com/vineetdhanawat/twitter-sentiment-analysis/master/datasets/Sentiment%20Analysis%20Dataset%20100000.csv"

In [None]:
# Load the dataset into a DataFrame, decoding the file as ISO-8859-1 (Latin-1).
# NOTE(review): presumably the file is not valid UTF-8 — confirm.
# Later cells read the `SentimentText` and `Sentiment` columns from this frame.
df = pd.read_csv(path, encoding="ISO-8859-1")

In [None]:
# Split the rows into a training part and a validation part, using
# train_test_split's default split sizes.
# The split shuffles the rows, so random_state is pinned: without it the
# partition, and every metric computed further down, changes on each run.
df_train, df_val = train_test_split(df, random_state=42)

In [None]:
# Bag-of-words features for the training split: the vocabulary is learned from
# the training texts only, and each text is encoded as token counts.
count = CountVectorizer()
y_train = df_train["Sentiment"].values
X_train = count.fit_transform(df_train["SentimentText"])

In [None]:
# Encode the validation texts with the vectorizer already fitted on the
# training split (transform only, so the vocabulary is not refitted).
y_val = df_val["Sentiment"].values
X_val = count.transform(df_val["SentimentText"])

In [None]:
# Fit a logistic-regression classifier on the bag-of-words training features.
# max_iter is raised above the library default of 100 because the default
# solver frequently stops before converging (ConvergenceWarning) on large,
# high-dimensional sparse count features; a higher cap lets it finish.
logist = LogisticRegression(max_iter=1000)
logist.fit(X_train, y_train)

In [None]:
# Predict labels for the training features themselves; these predictions feed
# the training-set metrics in the next cell.
y_pred = logist.predict(X_train)

In [None]:
# Precision, recall and F-score on the training data, i.e. the same data the
# model was fitted on; read these alongside the validation figures below
# rather than on their own.
precision_recall_fscore_support(y_train, y_pred, average="binary")

In [None]:
# Predict labels for the held-out validation features.
y_val_pred = logist.predict(X_val)

In [None]:
# Precision, recall and F-score on the validation split, which the model was
# not fitted on.
precision_recall_fscore_support(y_val, y_val_pred, average="binary")

## **Topic models**