# Doc2Vec
1. Create model.
2. UMAP visualizations.

In [1]:
import re
import pandas as pd
import numpy as np
import gensim
import umap

import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's notebook mode so figures render inline in the notebook.
init_notebook_mode(connected=True)

import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides pandas/gensim deprecation and chained-assignment
# warnings raised by the cells below; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Root directory prefixed to every data and model path in this notebook.
# It is a machine-specific absolute path: edit it before running elsewhere.
abs_dir = "/Users/williamquinn/Desktop/DH/Python/MJP/"

# Prepare and Create Model

1. Merge structured and unstructured MJP data for more accurate word vectors. 
2. Add 10000 to the mjp_index of the structured data so that its document tags can be told apart from those of the unstructured data in the model.

In [None]:
%%time

# Load the structured data and keep only the columns used downstream.
# Rows with missing text are dropped here, *before* the `astype(str)` below:
# casting first would turn NaN into the literal string "nan", which the later
# `dropna` cannot detect.
mjp_df = pd.read_csv(abs_dir + 'Output/mjp_documents.txt', sep='\t') \
    .rename(columns = {"mjp_id":"mjp_index"})[['mjp_index', 'magazine', 'date', 'type', 'text']] \
    .dropna(subset=["text"], how="any")


# Remove bibliographic information (volume/issue, year) from strings.
# The (pattern, replacement) pairs are applied in order, and the order matters.
text_cleanup = [
    # Must run before punctuation is stripped: afterwards no '.' is left and
    # the pattern could never match.
    (r'\.0', ''),
    (r'[^\w\s]', ''),
    (r'pgbrk', ''),
    (r'vol \w+ no \d+ \w+ \d{4}', ''),
    (r'\w+ \d{4}', ''),
    (r'vol \w+ no \d+', ''),
    # Word boundaries keep these abbreviations from clipping the end of longer
    # words (e.g. "piano " -> "pia").
    (r'\bv ', ''),
    (r'\bvol ', ''),
    (r'\bno ', ''),
    # Magazine titles; punctuation is already gone at this point, so the
    # Poetry title is matched without its comma.
    (r'the little review', ' '),
    (r'the crisis', ' '),
    (r'poetry a magazine of verse', ' '),
    (r'the masses', ' '),
    (r'the seven arts', ' '),
]

cleaned_text = mjp_df['text'].astype(str).str.lower().str.strip()
for pattern, replacement in text_cleanup:
    cleaned_text = cleaned_text.str.replace(pattern, replacement, regex=True)
mjp_df['text'] = cleaned_text

# Adjust structured data values to match unstructured.
# These are literal substitutions, so `regex=False` is passed explicitly
# instead of relying on the pandas-version-dependent default.
magazine_names = [
    ('the little review', 'Little Review'),
    ('the crisis', 'Crisis'),
    ('marsden magazines', 'Marsden Magazines'),
    ('others', 'Others'),
    ('the masses', 'Masses'),
    ('poetry, a magazine of verse', 'Poetry'),
    ('the seven arts', 'Seven Arts'),
]

magazine = mjp_df['magazine'].astype(str)
for old_name, new_name in magazine_names:
    magazine = magazine.str.replace(old_name, new_name, regex=False)
mjp_df['magazine'] = magazine

# Add values to disambiguate structured (>10000) and unstructured data.
mjp_df['mjp_index'] = mjp_df['mjp_index'] + 10000


# Load unstructured, web-scraped data; every row is labelled as an "issue".
mjp_ec_df = pd.read_csv(abs_dir + "Output/doc2vec/mjp_documents-EntireCorpus.csv", 
                     sep=",") \
    .assign(type = "issue")

# Stack both dataframes and remove empty text fields.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is its replacement
# (`sort=False` keeps the column order `append` produced).
mjp_appended_df = pd.concat([mjp_df, mjp_ec_df], ignore_index=True, sort=False)

mjp_appended_df = mjp_appended_df.dropna(subset=["text"], how="any")

# Check for NaN: prints every remaining row that has a missing value in any column.
print (mjp_appended_df[mjp_appended_df.isnull().any(axis=1)])

mjp_appended_df.to_csv(abs_dir + 'Output/mjp_appended_df.csv', 
            sep='\t', index=False)

### Train model on all data (structured and unstructured).

In [None]:
%%time

# Reload the merged corpus written by the preparation cell and discard rows
# whose text is read back as NaN.
mjp_appended_df = (
    pd.read_csv(abs_dir + 'Output/mjp_appended_df.csv', sep='\t')
    .dropna(subset=["text"], how="any")
)


def _tag_row(row):
    """Tokenize one row's text and tag it as 'doc<mjp_index>'."""
    tokens = gensim.utils.simple_preprocess(row.text)
    return gensim.models.doc2vec.TaggedDocument(tokens, ['doc{}'.format(row.mjp_index)])


# Build the corpus: one TaggedDocument per dataframe row.
tagged_docs = mjp_appended_df.apply(_tag_row, axis=1)
training_corpus = tagged_docs.values

# Train a 300-dimensional Doc2Vec model, ignoring words seen fewer than 4 times.
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=4, epochs=20)
model.build_vocab(training_corpus)
model.train(
    training_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Persist the trained model.
model.save(abs_dir + "Output/doc2vec/mjp_entireCorpus_d2v.bin")

### Train model on structured data.

In [2]:
%%time

# Load the structured documents, discard rows without text, and expose the
# identifier column under the name used for the document tags.
mjp_df = (
    pd.read_csv(abs_dir + 'Output/mjp_documents.txt', sep='\t')
    .dropna(subset=["text"], how="any")
    .rename(columns={"mjp_id": "mjp_index"})
)


def _tag_row(row):
    """Tokenize one row's text and tag it as 'doc<mjp_index>'."""
    tokens = gensim.utils.simple_preprocess(row.text)
    return gensim.models.doc2vec.TaggedDocument(tokens, ['doc{}'.format(row.mjp_index)])


# Build the corpus: one TaggedDocument per dataframe row.
tagged_docs = mjp_df.apply(_tag_row, axis=1)
training_corpus = tagged_docs.values

# Train a 300-dimensional Doc2Vec model, ignoring words seen fewer than 4 times.
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=4, epochs=20)
model.build_vocab(training_corpus)
model.train(
    training_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Persist the trained model.
model.save(abs_dir + "Output/doc2vec/structured_results/mjp_documents-d2v.bin")

CPU times: user 16min 22s, sys: 9.17 s, total: 16min 31s
Wall time: 6min 43s


# UMAP & Visualization

In [83]:
%%time
# UMAP projection of the document vectors.

# Alternative model trained on the merged corpus:
# model = gensim.models.doc2vec.Doc2Vec.load(abs_dir + "Output/doc2vec/mjp_entireCorpus_d2v.bin")
model = gensim.models.doc2vec.Doc2Vec.load(abs_dir + "Output/doc2vec/structured_results/mjp_documents-d2v.bin")

# Document tags in the model's own order, and their vectors.
docs = list(model.docvecs.index2entity)
data = np.array(model[docs])

# Reload the structured documents with the same NaN filter and column rename
# the structured training cell applies.
mjp_df = (
    pd.read_csv(abs_dir + 'Output/mjp_documents.txt', sep='\t')
    .dropna(subset=["text"], how="any")
    .rename(columns={"mjp_id": "mjp_index"})
    [['mjp_index', 'magazine', 'date', 'type', 'text']]
)

# Reduce the document vectors with UMAP's default settings.
reducer = umap.UMAP()
embedding = reducer.fit_transform(data)

# Split the embedding into its first two coordinates.
x = [point[0] for point in embedding]
y = [point[1] for point in embedding]

# NOTE(review): coordinates are attached to mjp_index by position, which
# assumes the model's tag order matches the row order of mjp_df.
mjp_umap = pd.DataFrame({"mjp_index": mjp_df["mjp_index"], "x": x, "y": y})

# Attach the metadata and keep only the columns needed for plotting.
mjp_umap = pd.merge(mjp_df, mjp_umap, on='mjp_index')[
    ["mjp_index", 'magazine', "type", "date", "x", "y"]
]

CPU times: user 23.2 s, sys: 496 ms, total: 23.7 s
Wall time: 23.5 s


In [84]:
%%time

# Work on a copy. Aliasing `mjp_umap` directly meant the date conversion below
# rewrote `mjp_umap['date']` in place, so re-running this cell failed because
# the column was no longer a string column.
mjp = mjp_umap.copy() #.query('(mjp_index < 10000)')

# Strip letters, spaces and apostrophes from irregular dates (e.g. New Year's Day)
# and convert to DateTime. No rows are removed: missing dates become NaT.
# (The previous `.dropna()` had no effect, because assigning back to the column
# re-aligns on the index.)
# NOTE(review): the class `[A-z ']` also matches `[ \ ] ^ _` and the backtick,
# which sit between 'Z' and 'a' in ASCII. It is left unchanged in case the
# data relies on that (e.g. bracketed dates); confirm before narrowing it to
# `[A-Za-z ']`.
mjp['date'] = mjp['date'].str.replace("[A-z ']+", "", regex=True) \
    .astype('datetime64[ns]')

# Keep only the document types shown in the plot.
mjp = mjp[mjp['type'].isin(["articles", "letters", "poetry", "fiction", "advertisements"])]

# Visualize: one point per document, coloured and shaped by type.
fig = px.scatter(mjp, x="x", y="y", 
                 color="type", symbol='type',
                 hover_name="date", template = "plotly_white",
                 render_mode="webgl")

fig.show()

CPU times: user 1.12 s, sys: 15.9 ms, total: 1.13 s
Wall time: 1.15 s


In [98]:
# Shrink the markers on every trace.
fig.update_traces(marker=dict(size=4), textposition='top center')

# Arrow settings shared by every cluster label.
_arrow_style = dict(showarrow=True, arrowhead=7, ax=0)

# One entry per labelled cluster: the arrow tip (x, y) in data coordinates and
# the vertical offset `ay` of the label text.
_cluster_labels = [
    dict(x=1, y=-5.85, xref="x", yref="y", text="Advertisements", ay=-45),
    dict(x=-3.5, y=-3, text="Articles", ay=90),
    dict(x=0.3, y=-0.2, text="Letters", ay=-120),
    dict(x=-5.5, y=1.9, text="Fiction", ay=-90),
    dict(x=-3.6, y=3.15, text="Poetry", ay=-90),
]

fig.update_layout(
    title_text='MJP doc2vec',
    annotations=[
        go.layout.Annotation(**_arrow_style, **label)
        for label in _cluster_labels
    ],
)

fig.show()

In [None]:
# matplotlib imports, presumably for the 3-component plot requested below.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# NOTE(review): `draw_umap` is neither defined nor imported anywhere in the
# visible notebook, so this cell raises NameError unless the helper is defined
# elsewhere. TODO: confirm where it comes from or add its definition.
draw_umap(n_components=3, 
          title='MJP Doc2Vec')

## UMAP of MJP Issues

In [37]:
%%time

# Load the issue-level data and discard rows without text.
mjp_df = (
    pd.read_csv(abs_dir + 'Output/mjp_issues.txt', sep='\t')
    .dropna(subset=["text"], how="any")
)


def _tag_row(row):
    """Tokenize one row's text and tag it as 'doc<mjp_index>'."""
    tokens = gensim.utils.simple_preprocess(row.text)
    return gensim.models.doc2vec.TaggedDocument(tokens, ['doc{}'.format(row.mjp_index)])


# Build the corpus: one TaggedDocument per issue.
tagged_docs = mjp_df.apply(_tag_row, axis=1)
training_corpus = tagged_docs.values

# Train a 300-dimensional Doc2Vec model, ignoring words seen fewer than 4 times.
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=4, epochs=20)
model.build_vocab(training_corpus)
model.train(
    training_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Persist the trained model.
model.save(abs_dir + "Output/doc2vec/structured_results/mjp_issues_d2v.bin")

CPU times: user 11min 10s, sys: 8.67 s, total: 11min 19s
Wall time: 4min 30s


In [44]:
%%time
# UMAP projection of the issue vectors.

model = gensim.models.doc2vec.Doc2Vec.load(abs_dir + "Output/doc2vec/structured_results/mjp_issues_d2v.bin")

# Issue tags in the model's own order, and their vectors.
docs = list(model.docvecs.index2entity)
data = np.array(model[docs])

# Reload the issue data with the same NaN filter the training cell applies.
mjp_df = (
    pd.read_csv(abs_dir + 'Output/mjp_issues.txt', sep='\t')
    .dropna(subset=["text"], how="any")
    .rename(columns={"mjp_id": "mjp_index"})
    [['mjp_index', 'magazine', 'date']]
)

# Reduce the issue vectors with UMAP's default settings.
reducer = umap.UMAP()
embedding = reducer.fit_transform(data)

# Split the embedding into its first two coordinates.
x = [point[0] for point in embedding]
y = [point[1] for point in embedding]

# NOTE(review): coordinates are attached to mjp_index by position, which
# assumes the model's tag order matches the row order of mjp_df.
mjp_umap = pd.DataFrame({"mjp_index": mjp_df["mjp_index"], "x": x, "y": y})

# Attach the metadata and keep only the columns needed for plotting.
mjp_umap = pd.merge(mjp_df, mjp_umap, on='mjp_index')[
    ["mjp_index", 'magazine', "date", "x", "y"]
]

CPU times: user 2.46 s, sys: 251 ms, total: 2.71 s
Wall time: 2.61 s


In [57]:
%%time
import plotly.graph_objects as go

# Subset to the magazines that are labelled in the plot.
# `.copy()` makes the subset an independent frame, so the column assignment
# below is not a chained assignment on a slice of `mjp_umap`.
mjp = mjp_umap[mjp_umap['magazine'].isin(['marsden magazines', 'the crisis', 
                              'the little review', 'the masses', 
                              'poetry, a magazine of verse'])].copy()

# Strip letters, spaces and apostrophes from irregular dates (e.g. New Year's Day)
# and convert to DateTime. No rows are removed: missing dates become NaT.
# (The previous `.dropna()` had no effect, because assigning back to the column
# re-aligns on the index.)
# NOTE(review): the class `[A-z ']` also matches `[ \ ] ^ _` and the backtick,
# which sit between 'Z' and 'a' in ASCII. It is left unchanged in case the
# data relies on that (e.g. bracketed dates); confirm before narrowing it to
# `[A-Za-z ']`.
mjp['date'] = mjp['date'].str.replace("[A-z ']+", "", regex=True) \
    .astype('datetime64[ns]')


# Visualize: one point per issue, coloured and shaped by magazine.
fig = px.scatter(mjp, x="x", y="y", 
                 color="magazine", symbol='magazine',
                 hover_name="date",
                 render_mode="webgl")

fig.update_traces(textposition = 'top center', 
                  marker = dict(size = 6))


# Annotations: one labelled arrow per magazine cluster. The x/y values are
# hand-picked data coordinates for the arrow tips.

# Layout
fig.update_layout(
    title_text='MJP doc2vec',
    annotations=[
        go.layout.Annotation(
            x=1.19,
            y=4.38,
            xref="x",
            yref="y",
            text="The Crisis",
            showarrow=True,
            arrowhead=7,
            ax=0,
            ay=-90),
        go.layout.Annotation(
            x=2.92,
            y=6.37,
            text="The Masses",
            showarrow=True,
            arrowhead=7,
            ax=0,
            ay=-90),
        go.layout.Annotation(
            x=4.82,
            y=10.09,
            text="Marsden Magazines",
            showarrow=True,
            arrowhead=7,
            ax=0,
            ay=-90),
        go.layout.Annotation(
            x=8.34,
            y=8.54,
            text="Poetry, a Magazine of Verse",
            showarrow=True,
            arrowhead=7,
            ax=0,
            ay=90),
        go.layout.Annotation(
            x=7.11,
            y=10.66,
            text="The Little Review",
            showarrow=True,
            arrowhead=7,
            ax=0,
            ay=-90)
    ]
)

fig.show()

CPU times: user 683 ms, sys: 9.39 ms, total: 693 ms
Wall time: 696 ms
