# Explore doc2vec Similarities in <i>The Masses</i>

In [1]:
import pandas as pd
import numpy as np
import gensim
import umap

import warnings
warnings.filterwarnings('ignore')

import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import os

# Root folder of the MJP project; every data path below is built as
# `abs_dir + "<relative path>"`, so the value must end with a path separator.
# Set the MJP_DIR environment variable to run the notebook on another machine;
# when it is unset the original location is used unchanged.
abs_dir = os.path.join(
    os.environ.get("MJP_DIR", "/Users/williamquinn/Desktop/DH/Python/MJP/"),
    ""  # joining with "" appends a trailing separator only if one is missing
)

# Data Analysis

## Organize Data

In [None]:
%%time

# Load data.
model = gensim.models.doc2vec.Doc2Vec.load(abs_dir + "Chapter4-Masses/Masses_Data/d2v/mjp_masses-chunks_d2v.bin")
docs = list(model.docvecs.index2entity)

mjp_df = pd.read_csv(abs_dir + 'Chapter4-Masses/Masses_Data/d2v/mjp_masses-chunks-eC.csv', 
                     sep='\t')[['mjp_index', 'date', 'magazine', 'type']]

# mjp_df = mjp_df.rename(columns = {"mjp_id":"mjp_index"})

# Convert doc2vec results to dataframe.
doc_sims = []

for i in mjp_df.mjp_index:
    doc = "doc" + str(i)

    for sim in model.docvecs.most_similar(positive = doc, topn = len(docs)):
        compDoc = sim[0]
        similarity = sim[1]
        doc_sims.append([doc, compDoc, similarity])

sims_docs = pd.DataFrame(doc_sims, columns = ['mjp_index', 'compDoc', 'similarity'])
sims_docs.compDoc = sims_docs.compDoc.replace('doc', '', regex=True).astype(int)
sims_docs.mjp_index = sims_docs.mjp_index.replace('doc', '', regex=True).astype(int)


# Subset and merge data.
sims_docs = sims_docs.query("similarity > .6")
sims_docs = sims_docs.merge(mjp_df[['mjp_index', 'date', 'magazine', 'type']],
           how = "inner", on = 'mjp_index')

# Create compDoc dataframe and join with original.
simsComps = sims_docs[['mjp_index', 'date', 'magazine', 'type']] \
    .rename(columns = {'mjp_index':'compDoc', 'date':'compDate', 'magazine':'compMag', 'type':'compType'}) 

sims_docs = sims_docs.merge(simsComps.drop_duplicates(), on = 'compDoc')

sims_docs['date'] = sims_docs['date'].astype('datetime64[ns]')
sims_docs['compDate'] = sims_docs['compDate'].astype('datetime64[ns]')
sims_docs = sims_docs.query('(date < compDate)')

sims_docs.to_csv(abs_dir + 'Chapter4-Masses/Masses_Data/d2v/mjp_masses-chunks-sims.csv', sep=',', index=False)

# Sankey Work

In [17]:
# Turn the saved similarity pairs into the `nodes` and `links` frames used by
# the Sankey diagram.

# Genres kept on both sides of a pair.
GENRES = ['articles', 'fiction', 'letters', 'poetry', 'drama']

mjp_df = pd.read_csv(abs_dir + "Chapter4-Masses/Masses_Data/d2v/mjp_masses-chunks-sims.csv", sep= ",")

# Change date/compDate to date values & create sensible labels.
# The label expressions are evaluated before `assign` runs, so they use the
# date strings exactly as read from the CSV, not the converted datetimes.
mjp_df = mjp_df.assign(
    date = pd.to_datetime(mjp_df['date'], format="%Y-%m-%d"),
    compDate = pd.to_datetime(mjp_df['compDate'], format="%Y-%m-%d"),
    label = mjp_df['type'] + '-' + mjp_df['date'].astype(str) + " " + mjp_df['mjp_index'].astype(str),
    compLabel = mjp_df['compType'] + '-' + mjp_df['compDate'].astype(str) + " " + mjp_df['compDoc'].astype(str)
)

# Subset by Types, Magazine, and Genre.
mjp_df = mjp_df[mjp_df['type'].isin(GENRES) & mjp_df['compType'].isin(GENRES)] \
    .query('(magazine == "Masses")'
           ' & (compMag == "Masses")'
           ' & (similarity > .75)'
           ' & (mjp_index >= 10000)'
           ' & (compDoc >= 10000)'
           ' & (date < compDate)')

# Create Nodes Dataframe: one row per chunk appearing on either side of a pair.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
# De-duplicating on mjp_index keeps exactly one label per node code.
nodes = pd.concat([
    mjp_df[['mjp_index', 'label']],
    mjp_df[['compDoc', 'compLabel']].rename(columns = {"compDoc":"mjp_index",
                                                       "compLabel":"label"})
]).drop_duplicates(subset = 'mjp_index')

# Create Unique Codes for Nodes.
# Sorting by code puts each label at the position equal to its code.
nodes = nodes \
    .assign(code = nodes['mjp_index'].astype('category').cat.codes) \
    .sort_values('code', ascending=True)

nodes_dictionary = nodes.set_index('mjp_index')['code'].to_dict()

# Create Links Dataframe.
# Every id in mjp_df is a key of nodes_dictionary, so a direct lookup with
# .map gives the same codes as .replace.
links = mjp_df \
    .assign(source = mjp_df['mjp_index'].map(nodes_dictionary),
            target = mjp_df['compDoc'].map(nodes_dictionary))

links = links[['source','target','similarity']]

# Count how many links share each source / each target.
# To keep only well-connected nodes, append:
#     .query('(source_occurrence >= 3) | (target_occurrence >= 3)')
links = links \
    .assign(source_occurrence = links.groupby(['source'])['target'].transform('count'), 
            target_occurrence = links.groupby(['target'])['source'].transform('count')) 

links

Unnamed: 0,source,target,similarity,source_occurrence,target_occurrence
8632,4,2,0.788245,23,5
8685,9,2,0.751926,1,5
8789,18,2,0.770069,4,5
8825,20,2,0.752596,1,5
8860,25,2,0.771889,9,5
31833,0,25,0.776227,1,4
31843,4,25,0.891309,23,4
31874,12,25,0.766686,1,4
31895,16,25,0.757783,1,4
37109,4,22,0.824725,23,4


## Visualization

In [18]:
# Assemble the Sankey figure as plain dictionaries and render it inline.

# Nodes: one per row of `nodes`, labelled with its 'label' column.
sankey_nodes = {
    'pad': 20,
    'thickness': 50,
    'line': {
        'color': "#435951",
        'width': 1,
    },
    'label': nodes['label'],
}

# Links: endpoints are the 'source'/'target' columns of `links`; the link
# value is the source's occurrence count (links["similarity"] is the
# alternative weighting).
sankey_links = {
    'source': links["source"],
    'target': links["target"],
    'value': links['source_occurrence'],
}

# Canvas settings plus the node and link specifications.
# NOTE(review): valuesuffix is the literal text 'label', which is appended to
# every displayed value — confirm this is intended.
data_trace = {
    'type': 'sankey',
    'domain': {
        'x': [0,1],
        'y': [0,1],
    },
    'orientation': "h",
    'valueformat': ".0f",
    'valuesuffix': 'label',
    'arrangement': 'snap',
    'node': sankey_nodes,
    'link': sankey_links,
}

layout = {
    'title': "Similarity Amongst Genres of <i>The Masses</i>",
    'font': {
        'size': 10,
    },
}

fig = {'data': [data_trace], 'layout': layout}
iplot(fig)

# Read Texts

In [33]:
# Look up a single chunk by its mjp_index and display one of its fields.
chunks_csv = abs_dir + 'Chapter4-Masses/Masses_Data/d2v/mjp_masses-chunks-eC.csv'
reading = pd.read_csv(chunks_csv, sep='\t')

# To locate a chunk by content instead, filter on magazine, mjp_index, date
# and a regex match against the 'text' column, e.g.
#     (magazine == 'poetry, a magazine of verse'), (mjp_index > 10000),
#     (date == '1912-12'), text contains 'corbin'.

# Keep only the row(s) for the chunk of interest.
reading = reading[reading['mjp_index'] == 18739]

# First matching row, fourth column (positional lookup).
reading.iloc[0, 3]

' 16  the masses  december 1911'

# UMAP Scatter Plot of Chunks

In [None]:
# Load the chunk metadata and the trained doc2vec model, then collect the
# model's vectors into one array for dimensionality reduction.
chunks_csv = abs_dir + 'Chapter4-Masses/Masses_Data/d2v/mjp_masses-chunks-eC.csv'
d2v_model_file = abs_dir + "Chapter4-Masses/Masses_Data/d2v/mjp_masses-chunks_d2v.bin"

mjp_df = pd.read_csv(chunks_csv, sep='\t')
model = gensim.models.doc2vec.Doc2Vec.load(d2v_model_file)

# `docs` holds the model's document tags; `data` is the result of looking
# those tags up on the model, so its rows follow the order of `docs`.
docs = list(model.docvecs.index2entity)
data = np.array(model[docs])

In [None]:
%%time

# Map data.

reducer = umap.UMAP()
embedding = reducer.fit_transform(data)

x = []
y = []
for value in embedding:
    x.append(value[0])
    y.append(value[1])
    

mjp_umap = pd.DataFrame({"mjp_index":mjp_df["mjp_index"],
                       "x":x,
                       "y":y})

mjp_umap = pd.merge(mjp_df, mjp_umap, on='mjp_index')

mjp_umap = mjp_umap[["mjp_index", "magazine", "type", "date", "x", "y"]]

In [None]:
%%time

mjp = mjp_umap.query('(mjp_index >= 10000) & (type == "articles")')

# Remove rows with irregular dates (e.g. New Year's Day) & convert to DateTime.
mjp['date'] = mjp['date'].str.replace("[A-z ']+", "", regex=True) \
    .dropna() \
    .astype('datetime64[ns]')


# Assign conditional value "before" or "after" date.
#     Masses merger with the New Review (1916-08-01).

def assign_merger_value(c):
    if c['date'] < pd.to_datetime("1916-08-01"):
        return "Before Merger"
    else:
        return "After Merger"
    
mjp['merger'] = mjp.apply(assign_merger_value, axis=1)

# Visualize
fig = px.scatter(mjp, x="x", y="y", 
                 color="merger", hover_name="date",
                 render_mode="svg")

fig.update_traces(textposition='top center')

fig.update_layout(
    title_text='MJP doc2vec'
)

fig.show()

# Measure 'Coherence' of Yearly Similarities

In [2]:
# Box plot of pair similarities per year, one colour per type.
sims_csv = abs_dir + "Chapter4-Masses/Masses_Data/d2v/mjp_masses-chunks-sims.csv"
mjp_df = pd.read_csv(sims_csv, sep= ",")

# Reduce each date to its first four-digit run: the regex replaces that run
# and everything after it with the run alone.
publication_year = mjp_df['date'].replace(r'(\d{4}).*', '\\1', regex=True)

# Keep only the columns the plot needs.
mjp_df = mjp_df.assign(year = publication_year)[['type', 'year', 'similarity']]

# To restrict the plot to selected genres, pass
#     mjp_df[mjp_df['type'].isin(["articles", "fiction", "letters", "poetry"])]
# instead of mjp_df; facet_row="type" splits the genres into separate rows.
fig = px.box(
    mjp_df,
    x="year",
    y="similarity",
    color="type",
    notched=True,
)

fig.show()