## **Regular expression**

In [None]:
import re

In [None]:
address = "University of Pennsylvania, Philadelphia, PA 19104"
# Match a US ZIP code at the very end of the string: five digits, optionally
# followed by a hyphen and four more digits (ZIP+4). The pattern is a raw
# string so the backslashes reach the regex engine untouched instead of being
# parsed as (invalid) Python string escapes, which newer interpreters warn on.
zip_match = re.search(r'\d{5}(-\d{4})?$', address)
zip_match.group()

## **Text normalization**

- Lemmatization

In [None]:
import spacy

# Load the spaCy pipeline named "en_core_web_sm". The resulting `nlp` object
# is called on raw text below to obtain tokens and their lemmas.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample paragraph used for the lemmatization demo.
raw_text = (
    "The Martian is a 2011 science fiction debut novel written by Andy Weir. "
    "The book was originally self-published on Weir's blog, in a serialized format. "
    "In 2014, the book was re-released after Crown Publishing Group purchased the exclusive publishing rights."
)
# Run the spaCy pipeline; `text` is the processed document whose tokens are
# iterated in the next step.
text = nlp(raw_text)

In [None]:
# Print the lemma of every token, space-separated
# (e.g. is -> be, written -> write).
lemmas = [token.lemma_ for token in text]
print(' '.join(lemmas))

In [None]:
# Strip HTML tags and keep only the text content
from bs4 import BeautifulSoup

text = (
    "<p><b>The Martian</b> is a 2011 science fiction debut novel written by Andy Weir.</p> "
    "<p>The book was originally self-published on Weir's blog, in a serialized format.</p> "
    "<p>In 2014, the book was re-released after Crown Publishing Group purchased the exclusive publishing rights.</p>"
)
# Parse with the pure-Python "html.parser" backend, then collect all text nodes.
soup = BeautifulSoup(text, "html.parser")
outtext = soup.get_text()
print(outtext)

## **Sentiment analysis**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# URL of the CSV that is loaded into `df` below (a sentiment dataset, per the
# repository and file name).
path = "https://raw.githubusercontent.com/vineetdhanawat/twitter-sentiment-analysis/master/datasets/Sentiment%20Analysis%20Dataset%20100000.csv"

In [None]:
# Download and parse the CSV. The encoding is given explicitly as ISO-8859-1
# (presumably because the file is not valid UTF-8 -- TODO confirm).
df = pd.read_csv(path, encoding="ISO-8859-1")

In [None]:
# Hold out part of the rows for validation (scikit-learn's default test size).
# A fixed random_state makes the shuffle, and therefore every metric reported
# below, reproducible from run to run, matching the seeded topic models later
# in this notebook.
df_train, df_val = train_test_split(df, random_state=0)

In [None]:
# Bag-of-words features: learn the vocabulary from the training texts only and
# encode them as a document-term count matrix; labels come from `Sentiment`.
count = CountVectorizer()
train_texts = df_train["SentimentText"]
X_train = count.fit_transform(train_texts)
y_train = df_train["Sentiment"].values

In [None]:
# Encode the validation texts with the vocabulary already fitted on the
# training split (transform only, no re-fitting).
val_texts = df_val["SentimentText"]
X_val = count.transform(val_texts)
y_val = df_val["Sentiment"].values

In [None]:
# Logistic-regression sentiment classifier on the bag-of-words counts.
# The default max_iter=100 is typically too few iterations for the solver on a
# large, unscaled sparse matrix: it stops early with a ConvergenceWarning and
# leaves under-optimised coefficients. Allow more iterations instead.
logist = LogisticRegression(max_iter=1000)
logist.fit(X_train, y_train)

In [None]:
# Predictions on the training split itself (scored in the next step).
y_pred = logist.predict(X_train)

In [None]:
# Precision, recall and F-score on the training split. average="binary"
# reports the scores for the positive label only.
precision_recall_fscore_support(y_train, y_pred, average="binary")

In [None]:
# Predictions on the held-out validation split.
y_val_pred = logist.predict(X_val)

In [None]:
# Same metrics on the validation split; compare with the training scores to
# see how well the classifier generalises.
precision_recall_fscore_support(y_val, y_val_pred, average="binary")

## **Topic models**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import matplotlib.pyplot as plt

In [None]:
# Load the conference agenda CSV. Missing cells are replaced with empty
# strings so the "title" and "abstract" columns can later be concatenated as
# text without producing NaN.
nmc_df = pd.read_csv(
    "https://raw.githubusercontent.com/neuromatch/nmc-box/master/sitedata/agenda/agenda-2020-3.csv"
).fillna("")

In [None]:
# Settings shared by the vectorizers, topic models and plots below.
# n_features caps the vocabulary size (passed as `max_features`).
n_features = 1000
n_components = 10 # number of topics
n_top_words = 20  # number of top-weighted words plotted per topic

def plot_top_words(model, feature_names, n_top_words, title):
    """
    Draw one horizontal bar chart per topic showing its highest-weighted words.

    Parameters
    ----------
    model : fitted estimator exposing ``components_``
        Each row of ``components_`` is treated as one topic and holds a
        weight for every feature.
    feature_names : sequence of str
        Vocabulary, indexed by the column positions of ``components_``.
    n_top_words : int
        Number of highest-weighted words drawn for each topic.
    title : str
        Figure-level title.

    The grid is five panels wide and gains rows as the number of topics grows
    (ten topics give the 2 x 5 layout of the reference example); panels left
    over in the last row are hidden.

    Reference: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
    """
    n_topics = len(model.components_)
    n_cols = 5
    # Ceiling division; keep at least one row so the subplots() call is valid.
    n_rows = max(1, -(-n_topics // n_cols))
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, in descending order.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        # Put the heaviest word at the top of each panel.
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)

    # Hide panels that have no topic to show.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    # The title does not depend on the topic, so set it once.
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [None]:
# TfidfVectorizer is imported here because the import cell of this section
# only brings in CountVectorizer; without this line the cell fails with
# NameError.
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for NMF: ignore terms found in more than 95% of documents
# or in fewer than 2 documents, drop English stop words, and keep at most
# `n_features` terms.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2,
    max_features=n_features,
    stop_words="english"
)

In [None]:
# One document per agenda row: the title followed by the abstract.
title_plus_abstract = nmc_df["title"] + " " + nmc_df["abstract"]
abstracts = title_plus_abstract.tolist()
# Learn the vocabulary and build the TF-IDF document-term matrix in one pass.
X_tfidf = tfidf_vectorizer.fit_transform(abstracts)

In [None]:
# scikit-learn 1.2 removed NMF's single `alpha` argument in favour of
# `alpha_W` / `alpha_H` (available since 1.0). The new penalties are also
# multiplied by n_features (for W) and n_samples (for H), so the former
# alpha=0.1 is divided by those factors to keep the same regularisation
# strength as the original call.
n_docs, n_terms = X_tfidf.shape
nmf = NMF(
    n_components=n_components,
    random_state=1,
    alpha_W=0.1 / n_terms,
    alpha_H=0.1 / n_docs,
    l1_ratio=0.5,
).fit(X_tfidf)

In [None]:
# Vocabulary of the TF-IDF vectorizer, aligned with the columns of
# nmf.components_. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0).
feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf,
    feature_names,
    n_top_words,
    "Topics in NMF topic model",
)

In [None]:
# Raw term counts for LDA, using the same vocabulary filters as the TF-IDF
# vectorizer above.
count_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
    stop_words="english",
)
X_count = count_vectorizer.fit_transform(abstracts)

In [None]:
# Fit LDA on the raw term counts; X_lda holds the per-document topic mixture.
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=50,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
X_lda = lda.fit_transform(X_count)
# Label the LDA topics with the vocabulary of the vectorizer that produced
# X_count. Reusing the `feature_names` left over from the TF-IDF cell is only
# right while both vectorizers happen to build the same vocabulary, and it
# fails outright if that cell has not been run.
plot_top_words(
    lda,
    count_vectorizer.get_feature_names_out(),
    n_top_words,
    "Topics in LDA model",
)

## **Look at NMF topics**

In [None]:
# Re-plot the NMF topics for closer inspection. get_feature_names() was
# removed in scikit-learn 1.2; get_feature_names_out() is its replacement
# (available since 1.0).
feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf,
    feature_names,
    n_top_words,
    "Topics in NMF topic model",
)

In [None]:
# Project every document onto the fitted NMF topics: one row per document,
# one column per topic component.
X_nmf = nmf.transform(X_tfidf)

In [None]:
from collections import Counter

In [None]:
# For each document take the index of its largest NMF component, then count
# how many documents fall under each topic index.
Counter(X_nmf.argmax(axis=1))

In [None]:
# Inspect a single paper: its weight on each NMF topic, then its title
idx = 50
paper_topic_weights = X_nmf[idx, :]
print("Components", paper_topic_weights)
paper_title = nmc_df.iloc[idx]["title"]
print("Paper title: ", paper_title)