In [70]:
import os
import re
import joblib

from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, names
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn

In [71]:
# Input and output filepaths
train_clean_100_path = os.path.join("data", "train-clean-100.csv")
dev_clean_path = os.path.join("data", "dev-clean.csv")
test_clean_path = os.path.join("data", "test-clean.csv")

model_outpath = os.path.join("models", "model.jl")

pyLDAvis_outpath = os.path.join("view", "data.html")

os.makedirs("models", exist_ok=True)
os.makedirs("view", exist_ok=True)

In [72]:
# Read the csv files
train_df = pd.read_csv(train_clean_100_path, index_col=0)
dev_df = pd.read_csv(dev_clean_path, index_col=0)
test_df = pd.read_csv(test_clean_path, index_col=0)

train_df =  pd.DataFrame({"TEXT": train_df["REAL TEXT"], "BOOK": train_df["BOOK TITLE"]})
dev_df = pd.DataFrame({"TEXT": dev_df["TEXT"], "BOOK": dev_df["BOOK TITLE"]})
test_df = pd.DataFrame({"TEXT": test_df["TEXT"], "BOOK": test_df["BOOK TITLE"]})

In [73]:
# helper functions to normalize the words in the books
# Used to remove short words. how short is a short words
_short = 2

# Used to remove stopwords from the english language
_more_stopwords = set([
    # interjections
    "oh", "ah",
    # useless
    "yes", "no",
    # archaic terms: they, you, triplet, to do, you
    "thy", "thou", "thrin", "didst", "thee",
    # names
    *map(str.lower, names.words())
])
_stopwords = set(stopwords.words('english')) | _more_stopwords

# Used to lemmatize words that are either adj, nouns or verbs
# depending on _pos_tags otherwise it does nothing.
_lemmatizer = WordNetLemmatizer()
_pos_tags = ["a", "n", "v"]

# Used for min/max filering. CountVectorizer removes words 
# that have a frequency higher that max_df. It also removes 
# words that appear in less documents than min_df.
_max_df = 0.40
_min_df = 3

# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Used to decontract words that contain "'".
def decontract(phrase: str):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def is_not_short(word):
    return len(word) > _short

def not_in_stopwords(word):
    return word not in _stopwords

def lemmatize(pair):
    word, pos = pair
    pos = pos[0].lower()
    if pos not in _pos_tags:
        return word
    return _lemmatizer.lemmatize(word, pos=pos)

def document_analyzer(book: str):
    book = str.lower(book)
    book = decontract(book)
    words = word_tokenize(book)
    words = list(filter(is_not_short, words))
    words = list(filter(not_in_stopwords, words))
    words = list(map(lemmatize, pos_tag(words)))
    return words

In [74]:
vectorizer = CountVectorizer(analyzer=document_analyzer, min_df=_min_df, max_df=_max_df)
X_train = vectorizer.fit_transform([*train_df["TEXT"].to_list(), *dev_df["TEXT"].to_list()])
X_dev = vectorizer.transform(dev_df["TEXT"].to_list())
X_test = vectorizer.transform(test_df["TEXT"].to_list())

In [75]:
lda = LatentDirichletAllocation(n_components=20, max_iter=50, evaluate_every=1, learning_method='online', verbose=1, n_jobs=-1)
lda.fit_transform(X_train)
y_hat = lda.transform(X_test)

print(lda.score(X_dev))
print(lda.perplexity(X_dev))

iteration: 1 of max_iter: 50, perplexity: 15362.2168
iteration: 2 of max_iter: 50, perplexity: 11732.2175
iteration: 3 of max_iter: 50, perplexity: 9704.4800
iteration: 4 of max_iter: 50, perplexity: 8341.4871
iteration: 5 of max_iter: 50, perplexity: 7394.2804
iteration: 6 of max_iter: 50, perplexity: 6725.9729
iteration: 7 of max_iter: 50, perplexity: 6250.3747
iteration: 8 of max_iter: 50, perplexity: 5909.1312
iteration: 9 of max_iter: 50, perplexity: 5662.8157
iteration: 10 of max_iter: 50, perplexity: 5483.4603
iteration: 11 of max_iter: 50, perplexity: 5351.8854
iteration: 12 of max_iter: 50, perplexity: 5254.5401
iteration: 13 of max_iter: 50, perplexity: 5181.9779
iteration: 14 of max_iter: 50, perplexity: 5126.8747
iteration: 15 of max_iter: 50, perplexity: 5084.4808
iteration: 16 of max_iter: 50, perplexity: 5051.5442
iteration: 17 of max_iter: 50, perplexity: 5025.5892
iteration: 18 of max_iter: 50, perplexity: 5004.8857
iteration: 19 of max_iter: 50, perplexity: 4988.0113


-183175.70727333846
277690.41709838336


In [77]:
# Functions for printing keywords for each topic
def print_topics(model, vectorizer, top_n=10):
    words = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx), end='')
        print([words[i] for i in topic.argsort()[:-top_n - 1:-1]]) 

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, vectorizer, 15) 

Topics found via LDA:
Topic 0:['uncle', 'girl', 'miss', 'fish', 'pirate', 'ship', 'seven', 'english', 'mountain', 'money', 'husband', 'missus', 'family', 'wall', 'indian']
Topic 1:['bread', 'serve', 'butter', 'add', 'flour', 'salt', 'boil', 'cook', 'egg', 'sugar', 'mode', 'dish', 'slice', 'keats', 'soup']
Topic 2:['babylon', 'armour', 'combatant', 'champion', 'device', 'list', 'competitor', 'circus', 'dexterity', 'housing', 'lance', 'feint', 'remount', 'embellish', 'euphrates']
Topic 3:['captain', 'pilot', 'game', 'dat', 'cord', 'brag', 'owl', 'mate', 'quarter', 'yellow', 'thompson', 'caravan', 'paddle', 'passenger', 'petticoat']
Topic 4:['income', 'labor', 'land', 'wheat', 'increase', 'rent', 'agent', 'material', 'value', 'product', 'cause', 'owner', 'principle', 'wealth', 'system']
Topic 5:['girl', 'madame', 'boy', 'table', 'step', 'street', 'kill', 'dear', 'doctor', 'wall', 'barricade', 'everything', 'death', 'boat', 'window']
Topic 6:['fish', 'power', 'eighteen', 'press', 'paper', 

In [87]:
topic_df = pd.DataFrame({"BOOK": test_df["BOOK"], "TOPIC": y_hat.argmax(axis=1)}, copy=True)

topic_df

Unnamed: 0,BOOK,TOPIC
0,Shakespeare's Sonnets,12
1,Mother Carey's Chickens,12
2,The Return of Sherlock Holmes,12
3,Alice's Adventures in Wonderland,16
4,Gentle Measures in the Management and Training...,12
5,Aunt Jane's Nieces at Work,12
6,The Weapons of Mystery,12
7,The Sunny Side,12
8,Les Miserables,12
9,Off on a Comet! a Journey through Planetary Space,12


In [89]:
model = {"vectorizer": vectorizer, "lda": lda}
joblib.dump(model, model_outpath)

['models\\model.jl']

In [90]:
pyLDAvis.enable_notebook()
data = pyLDAvis.sklearn.prepare(lda, X_train, vectorizer, mds='tsne')
pyLDAvis.save_html(data, pyLDAvis_outpath)

