# Similarities Across <i>The Masses</i>

In [1]:
import os

import gensim
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's offline notebook mode so iplot() can render figures inline.
# connected=True loads plotly.js from the online CDN instead of embedding it in the notebook.
init_notebook_mode(connected=True)

# Root directory of the MJP project data. Every path in this notebook is built
# by string concatenation (abs_dir + "Output/..."), so the value must end with
# a path separator; os.path.join(..., "") guarantees that.
# Set the MJP_ABS_DIR environment variable to run the notebook on another
# machine; without it the original location is used.
abs_dir = os.path.join(
    os.environ.get("MJP_ABS_DIR", "/Users/williamquinn/Desktop/DH/Python/MJP/"), "")

## Find Similarities

In [15]:
%%time

# Load the trained doc2vec model and the tags of every document it contains.
model = gensim.models.doc2vec.Doc2Vec.load(abs_dir + "Output/doc2vec/mjp_entireCorpus_d2v.bin")
docs = list(model.docvecs.index2entity)

# Drop rows that have no text and keep only the metadata columns used below.
mjp_df = pd.read_csv(abs_dir + 'Output/mjp_appended_df.csv', sep='\t') \
    .dropna(subset=["text"], how="any")[['mjp_index', 'date', 'magazine', 'type']]

mjp_df_selectedMag = mjp_df.query('magazine == "Masses" & (mjp_index > 10000)')

# Only these genres are kept in the final table. Restricting to them *before*
# querying the model avoids running most_similar() for documents whose rows
# would be thrown away afterwards.
# NOTE(review): assumes mjp_index is unique within the selection -- confirm.
kept_types = ['articles', 'letters', 'fiction', 'poetry', 'drama']
masses_meta = mjp_df_selectedMag.loc[
    mjp_df_selectedMag['type'].isin(kept_types), ['mjp_index', 'date', 'type']]

# Model tags are the string "doc" followed by the mjp_index; map them back to
# the integer index so no string stripping is needed later.
tag_to_index = {"doc" + str(idx): idx for idx in masses_meta['mjp_index']}

# Ask for the complete ranking so that no pair inside the selection is missed.
n_docs = len(docs)

# Collect [source index, comparison index, similarity] rows.
doc_sims = []

for tag, idx in tag_to_index.items():
    for comp_tag, similarity in model.docvecs.most_similar(positive=[tag], topn=n_docs):
        # Comparison documents outside the selection can never survive the
        # join on compDoc below, so they are skipped here instead of being
        # materialised and discarded later.
        comp_idx = tag_to_index.get(comp_tag)
        if comp_idx is not None:
            doc_sims.append([idx, comp_idx, similarity])

sims_docs = pd.DataFrame(doc_sims, columns=['mjp_index', 'compDoc', 'similarity']) \
    .astype({'mjp_index': int, 'compDoc': int})

# Attach the source document's date and type.
sims_docs = sims_docs.merge(masses_meta, how="inner", on='mjp_index')

# Attach the comparison document's date and type under comp* column names.
simsComps = masses_meta \
    .rename(columns={'mjp_index': 'compDoc', 'date': 'compDate', 'type': 'compType'})

sims_docs = sims_docs.merge(simsComps.drop_duplicates(), on='compDoc')

# Keep only pairs whose source document predates the comparison document
# (pairs published on the same date are dropped by the strict comparison).
sims_docs['date'] = sims_docs['date'].astype('datetime64[ns]')
sims_docs['compDate'] = sims_docs['compDate'].astype('datetime64[ns]')
sims_docs = sims_docs.query('date < compDate')

sims_docs.to_csv(abs_dir + 'Chapter4-Masses/Masses_Data/d2v/mjp_masses-sims.csv', sep=',', index=False)

CPU times: user 4min 44s, sys: 13.3 s, total: 4min 58s
Wall time: 3min 23s


## Filter Similarities & Build Labels

In [4]:
# Re-load the similarity pairs written by the previous section.
sims_all = pd.read_csv(abs_dir + 'Chapter4-Masses/Masses_Data/d2v/mjp_masses-sims.csv', sep=',')

# Row filters: strong matches only, source dated before the comparison
# document, and at least one side of the pair is a letter.
is_strong = sims_all['similarity'] > 0.7
is_forward = sims_all['date'] < sims_all['compDate']
has_letter = (sims_all['type'] == "letters") | (sims_all['compType'] == "letters")

mjp_df = sims_all[is_strong & is_forward & has_letter]

# Human-readable node labels of the form "<type>-<date> <index>".
mjp_df = mjp_df.assign(
    label=lambda d: d['type'] + '-' + d['date'].astype(str) + " " + d['mjp_index'].astype(str),
    compLabel=lambda d: d['compType'] + '-' + d['compDate'].astype(str) + " " + d['compDoc'].astype(str),
)

mjp_df

Unnamed: 0,mjp_index,compDoc,similarity,date,type,compDate,compType,label,compLabel
404,12642,16437,0.757558,1913-02-01,letters,1915-09-01,articles,letters-1913-02-01 12642,articles-1915-09-01 16437
405,12645,16437,0.776988,1913-02-01,letters,1915-09-01,articles,letters-1913-02-01 12645,articles-1915-09-01 16437
822,18453,16437,0.757666,1913-07-01,letters,1915-09-01,articles,letters-1913-07-01 18453,articles-1915-09-01 16437
1314,12642,10784,0.721769,1913-02-01,letters,1916-07-01,poetry,letters-1913-02-01 12642,poetry-1916-07-01 10784
1315,12645,10784,0.742505,1913-02-01,letters,1916-07-01,poetry,letters-1913-02-01 12645,poetry-1916-07-01 10784
2882,15739,12918,0.710206,1916-12-01,letters,1917-02-01,poetry,letters-1916-12-01 15739,poetry-1917-02-01 12918
3765,12642,17498,0.729528,1913-02-01,letters,1917-04-01,poetry,letters-1913-02-01 12642,poetry-1917-04-01 17498
4206,15739,17498,0.701871,1916-12-01,letters,1917-04-01,poetry,letters-1916-12-01 15739,poetry-1917-04-01 17498
5089,12642,14230,0.726503,1913-02-01,letters,1917-07-01,poetry,letters-1913-02-01 12642,poetry-1917-07-01 14230
7756,11272,12645,0.773886,1911-02-01,articles,1913-02-01,letters,articles-1911-02-01 11272,letters-1913-02-01 12645


## Create Nodes and Edges

In [5]:
# Create Nodes Dataframe: one row per distinct document, whether it appears on
# the source side (mjp_index) or the comparison side (compDoc) of a pair.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; it
# stacks the two frames in the same way.
nodes = pd.concat([
    mjp_df[['mjp_index', 'label']],
    mjp_df[['compDoc', 'compLabel']]
        .rename(columns = {"compDoc":"mjp_index",
                           "compLabel":"label"}),
]).drop_duplicates()

# Give every node an integer code (category codes, ordered by mjp_index) and
# sort by it, so that a node's row position equals its code -- the Sankey
# trace looks labels up by position.
nodes = nodes \
    .assign(code = nodes['mjp_index'].astype('category').cat.codes) \
    .sort_values('code', ascending=True)

nodes_dictionary = nodes.set_index('mjp_index')['code'].to_dict()

# Create Links Dataframe: translate both ends of each pair into node codes.
links = mjp_df \
    .assign(source = mjp_df['mjp_index'].map(nodes_dictionary),
            target = mjp_df['compDoc'].map(nodes_dictionary))

links = links[['source','target', 'similarity']]

# Count how many links each source and each target takes part in.
# No rows are removed here; to keep only well-connected nodes, chain e.g.
# .query('(source_occurrence > 15) | (target_occurrence > 15)').
links = links \
    .assign(source_occurrence = links.groupby(['source'])['target'].transform('count')) \
    .assign(target_occurrence = links.groupby(['target'])['source'].transform('count'))

links.sort_values('source_occurrence', ascending=False)

Unnamed: 0,source,target,similarity,source_occurrence,target_occurrence
75618,2,10,0.719335,7,1
45396,2,1,0.720635,7,1
38378,2,17,0.774796,7,6
23692,2,5,0.798628,7,2
19246,2,18,0.768931,7,1
7756,2,6,0.773886,7,2
10651,2,11,0.728884,7,2
9930,6,15,0.72578,5,2
33808,6,4,0.734557,5,1
25677,6,3,0.701984,5,1


In [6]:
data_trace = dict(
    # Canvas: a horizontal Sankey diagram filling the whole figure.
    type='sankey',
    domain = dict(
      x =  [0,1],
      y =  [0,1]),
    orientation = "h",
    # Link values are similarity scores that were filtered to > 0.7 upstream,
    # so show three decimals in the hover text. The previous ".0f" rounded
    # every value to "1", and the suffix appended the literal word "label".
    valueformat = ".3f",
    valuesuffix = '',
    arrangement = 'freeform', # snap, perpendicular, freeform, fixed

    # Nodes: labels are taken by position, so `nodes` must be ordered by code.
    node = dict(
      pad = 20,
      thickness = 50,
      line = dict(
        color = "#435951",
        width = 1
      ),
      label = nodes['label']
    ),

    # Links: source/target are node codes; the ribbon width is the similarity.
    link = dict(
      source = links["source"],
      target = links["target"],
      value = links["similarity"])
)

layout =  dict(
    title = "Similarity (doc2vec) of <i>The Masses</i>",
    font = dict(
      size = 10),
)

fig = dict(data=[data_trace], layout=layout)
iplot(fig)

## Exploring Similarities.

In [7]:
# Reload the corpus (all columns) so individual documents can be read,
# restricted to Masses documents with mjp_index above 10000.
# The chain is parenthesised so it no longer ends in a dangling "\" continuation.
mjp_reader = (
    pd.read_csv(abs_dir + 'Output/mjp_appended_df.csv', sep='\t')
    .dropna(subset=["text"], how="any")
    .query('magazine == "Masses" & (mjp_index > 10000)')
)

In [12]:
# Show the value at column position 3 of the first row for document 12645.
# NOTE(review): position 3 is presumably the text column -- confirm against the CSV header.
mjp_reader[mjp_reader['mjp_index'] == 12645].iat[0, 3]

'a big one  from terre haute    has taken on new life and appears under the auspices of a new combination of social revolutionists which insures its future success the current number of   abounds with vital matter from the virile pens of some of the ablest writers in the movement it is filled with firstclass stuff from cover to cover the clear cry of the revolution rings all through its pages and the illustrations are such as could be produced only by artists animated by the militant spirit of socialism   deserves the hearty support of all who believe in the overthrow of wageslavery and in social regeneration through working class emancipation  eugene debs'