# Poetic Similarity of Chunks Across MJP Corpus
## Includes Letters

In [1]:
# Core data-handling, modelling and text-processing libraries.
import pandas as pd
import numpy as np
import gensim
import umap
import re, string
import nltk
# Vocabulary from NLTK's English word list; the data-preparation cell keeps
# only tokens that are members of this set.
# NOTE(review): presumably requires the NLTK "words" corpus to be installed
# beforehand (e.g. nltk.download('words')) — confirm on a fresh environment.
englishWords = set(nltk.corpus.words.words())

# Plotly graph objects plus offline notebook mode, used for the Sankey diagram.
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Root directory that every input and output path in this notebook is built on.
# NOTE(review): machine-specific absolute path; must be edited to run elsewhere.
abs_dir = "/Users/williamquinn/Desktop/DH/Python/"

# 1. Data Preparation

In [None]:
%%time

# Load data.
# Only the identifier, provenance and text columns are kept.
mjp_df = pd.read_csv(abs_dir + 'MJP/Output/mjp_documents.txt', sep='\t')[['mjp_id', 'magazine',
                                                                         'date', 'type', 'text']]

mjp_df = mjp_df.rename(columns = {'mjp_id':'mjp_index'})

# Remove bibliographic information (volume/issue, year) from strings.
# The chain is parenthesised (instead of backslash-continued) so that
# individual steps can carry comments.
mjp_df['text'] = (
    mjp_df['text'].astype(str)
    .str.lower()
    .str.strip()
    # Strip ".0" float artefacts while the period still exists.
    # NOTE(review): this step originally ran after punctuation stripping,
    # where it could never match; confirm the earlier position is intended.
    .str.replace(r'\.0', '', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'pgbrk', '', regex=True)
    .str.replace(r'vol \w+ no \d+ \w+ \d{4}', '', regex=True)
    .str.replace(r'\w+ \d{4}', '', regex=True)
    .str.replace(r'vol \w+ no \d+', '', regex=True)
    # \b anchors keep these short markers and magazine titles from matching
    # inside ordinary words: without them "others" also hits "mothers", and
    # "no " hits "piano ", gluing the remainder onto the next word.
    # The standalone words themselves are still removed, as before.
    .str.replace(r'\bv ', '', regex=True)
    .str.replace(r'\bvol ', '', regex=True)
    .str.replace(r'\bno ', '', regex=True)
    .str.replace(r'\bpoetry a magazine of verse\b', '', regex=True)
    .str.replace(r'\bthe little review\b', '', regex=True)
    .str.replace(r'\bthe seven arts\b', '', regex=True)
    .str.replace(r'\bothers\b', '', regex=True)
    .str.replace(r'\bthe masses\b', '', regex=True)
    .str.replace(r'\bthe crisis\b', '', regex=True)
    .str.replace(r'\bmarsden magazines\b', '', regex=True)
    # Keep only tokens found in the English vocabulary set.
    .apply(lambda x: ' '.join([item for item in x.split() if item in englishWords]))
)

# Subset dataframe for selected genre.
# .copy() makes the subset an independent frame so later column assignments
# do not raise SettingWithCopyWarning.
mjp_df_Poetry = mjp_df[mjp_df['type'].isin(['poetry', 'letters'])].copy()
                             

# Split text field string into list of 200 words.
def splitText(string, chunk_size=200):
    """Split a whitespace-delimited string into consecutive word chunks.

    Parameters
    ----------
    string : str
        Text to split; words are separated on any run of whitespace.
    chunk_size : int, default 200
        Maximum number of words per chunk. Must be positive.

    Returns
    -------
    list of str
        Chunks of up to ``chunk_size`` words, each re-joined with single
        spaces, in their original order. The last chunk may be shorter;
        empty or whitespace-only input yields ``[]``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    words = string.split()
    return [' '.join(words[start: start + chunk_size])
            for start in range(0, len(words), chunk_size)]

# Replace each document's text with its list of 200-word chunks.
# .assign builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
mjp_df_Poetry = mjp_df_Poetry.assign(text = mjp_df_Poetry['text'].apply(splitText))

# Unnest list and create row for each list item.
# https://mikulskibartosz.name/how-to-split-a-list-inside-a-dataframe-cell-into-rows-in-pandas-9849d8ff2401
# Each chunk list is expanded into numbered columns, re-attached to the
# metadata by index, then melted so that every chunk gets its own row.
# dropna() then discards every row containing a missing value, which includes
# the empty cells left by documents with fewer chunks than the longest one.
mjp_df_Poetry = mjp_df_Poetry['text'] \
    .apply(pd.Series) \
    .merge(mjp_df_Poetry, right_index = True, left_index = True) \
    .drop(["text"], axis = 1) \
    .melt(id_vars = ['mjp_index', 'magazine', 'date', 'type'], value_name = "text") \
    .drop("variable", axis = 1) \
    .dropna()

# Count words and remove short strings.
mjp_df_Poetry = mjp_df_Poetry.assign(count = mjp_df_Poetry['text'].str.split().str.len())
mjp_df_Poetry = mjp_df_Poetry[~(mjp_df_Poetry['count'] < 200)]

# Change index value to prevent duplication when merged with original dataframe.
# The chunk id is the row's index label (as left by melt/dropna) plus 10000.
mjp_df_Poetry = mjp_df_Poetry.assign(mjp_index = mjp_df_Poetry.index + 10000)


# Creating The Waste Land
# Collapse every run of non-word characters and underscores into one space.
# Raw string so that \W reaches the regex engine instead of being treated as
# an (invalid) string escape.
regex = re.compile(r'[\W_]+')

# The context manager closes the file handle as soon as the text is read.
with open(abs_dir + '19-20c_Corpus/WasteLand-Eliot.txt') as wasteLand_file:
    wasteLand_text = regex.sub(' ', wasteLand_file.read().lower())

# One row per 200-word chunk, indexed 0..k-1.
# dtype=object keeps the .str accessor usable even if no chunk is produced.
wasteLand = pd.DataFrame({'text': pd.Series(splitText(wasteLand_text), dtype = object)})

# Metadata columns matching the MJP dataframe; ids start at 90000.
wasteLand['mjp_index'] = wasteLand.index + 90000
wasteLand['magazine'] = 'the dial'
wasteLand['date'] = '1922-11-01' # Date published in America in The Dial
wasteLand['type'] = 'poetry'

# Count words and drop the chunks shorter than 200 words.
wasteLand['count'] = wasteLand['text'].str.split().str.len()
wasteLand = wasteLand[~(wasteLand['count'] < 200)]

# Rejoin dataframes: the cleaned documents, the poetry/letters chunks and the
# Waste Land chunks are stacked row-wise (sort = True sorts the column axis).
frames_to_join = [mjp_df, mjp_df_Poetry, wasteLand]
mjp_df = pd.concat(frames_to_join, sort = True)

# Write the combined table to disk for the later cells.
chunks_csv_path = abs_dir + 'MJP/Chapter3-Poetry/Poetry-MJP/mjp_ALL-poetry-letters-wasteLand_chunks.csv'
mjp_df.to_csv(chunks_csv_path, sep=',', index=False)

# 2. Train Model

In [None]:
%%time

# Create Corpus
def _tag_row(row):
    """Turn one dataframe row into a Doc2Vec TaggedDocument.

    The tokens come from gensim's simple_preprocess applied to the row's text;
    the single tag is 'doc' followed by the row's mjp_index.
    """
    tokens = gensim.utils.simple_preprocess(row.text)
    tag = 'doc{}'.format(row.mjp_index)
    return gensim.models.doc2vec.TaggedDocument(tokens, [tag])

tagged_docs = mjp_df.apply(_tag_row, axis=1)

training_corpus = tagged_docs.values

# Training.
d2v_settings = dict(vector_size=300, min_count=4, epochs=20)
model = gensim.models.doc2vec.Doc2Vec(**d2v_settings)

model.build_vocab(training_corpus)

model.train(training_corpus,
            total_examples = model.corpus_count,
            epochs = model.epochs)

# Store Model.
d2v_model_path = abs_dir + "MJP/Chapter3-Poetry/Poetry-MJP/mjp_ALL-poetry-letters-wasteLand_chunks_d2v.bin"
model.save(d2v_model_path)

# 3. Create Similarity Table

In [2]:
%%time

# Load data.
model = gensim.models.doc2vec.Doc2Vec \
    .load(abs_dir + "MJP/Chapter3-Poetry/Poetry-MJP/mjp_ALL-poetry-letters-wasteLand_chunks_d2v.bin")
docs = list(model.docvecs.index2entity)

mjp_df = pd.read_csv(abs_dir + 'MJP/Chapter3-Poetry/Poetry-MJP/mjp_ALL-poetry-letters-wasteLand_chunks.csv', 
                     sep=',')[['mjp_index', 'date', 'magazine', 'type']]
# Rows with ids of 10000 and above are the chunk rows selected for comparison.
mjp_df_Poetry = mjp_df[mjp_df['mjp_index'] >= 10000]

# Convert doc2vec results to dataframe.
# The similarity threshold and the chunk-only restriction are applied while
# iterating, so the full (chunks x all documents) list is never held in memory.
doc_sims = []

for i in mjp_df_Poetry.mjp_index:
    doc = "doc" + str(i)

    for compDoc, similarity in model.docvecs.most_similar(positive = doc, topn = len(docs)):
        # Tags have the form 'doc<mjp_index>'; recover the numeric id.
        comp_index = int(compDoc.replace('doc', ''))
        if similarity > 0.6 and comp_index >= 10000:
            doc_sims.append([int(i), comp_index, similarity])

# Explicit dtypes keep the merge keys integer even when no pair qualifies.
sims_docs = pd.DataFrame(doc_sims, columns = ['mjp_index', 'compDoc', 'similarity']) \
    .astype({'mjp_index': int, 'compDoc': int, 'similarity': float})

# Subset and merge data.
# Attach the source chunk's metadata.
sims_docs = sims_docs.merge(mjp_df_Poetry[['mjp_index', 'date', 'magazine', 'type']],
           how = "inner", on = 'mjp_index')

# Create compDoc dataframe and join with original.
# The comparison chunk's metadata is taken straight from the chunk table
# rather than from the source rows above, so a pair is not dropped when its
# mirrored pair falls just under the threshold.
simsComps = mjp_df_Poetry[['mjp_index', 'date', 'type', 'magazine']] \
    .drop_duplicates() \
    .rename(columns = {'mjp_index':'compDoc', 'date':'compDate', 'magazine':'compMag', 'type':'compType'}) 

sims_docs = sims_docs.merge(simsComps, on = 'compDoc')

# Keep only pairs where the source chunk is dated before the comparison chunk.
sims_docs['date'] = sims_docs['date'].astype('datetime64[ns]')
sims_docs['compDate'] = sims_docs['compDate'].astype('datetime64[ns]')
sims_docs = sims_docs.query('date < compDate')

sims_docs.to_csv(abs_dir + 'MJP/Chapter3-Poetry/Poetry-MJP/mjp_ALL-poetry-letters-wasteLand_sims.csv', 
                 sep=',', index=False)


This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function



CPU times: user 14min 50s, sys: 2min 41s, total: 17min 32s
Wall time: 16min 24s


# 4. Analyze Data

In [20]:
%%time

# Sankey / Alluvial Flow Diagram
# https://plot.ly/~alishobeiri/1591/plotly-sankey-diagrams/#/

def label_maker(row):
    """Return the display label for the source document of a similarity row.

    The label is the row's 'type', 'magazine', 'date' and 'mjp_index' values
    joined by single spaces; 'date' and 'mjp_index' are passed through str().
    """
    pieces = [row['type'], row['magazine'], str(row['date']), str(row['mjp_index'])]
    return ' '.join(pieces)

def compLabel_maker(row):
    """Return the display label for the comparison document of a similarity row.

    The label is the row's 'compType', 'compMag', 'compDate' and 'compDoc'
    values joined by single spaces; 'compDate' and 'compDoc' are passed
    through str().
    """
    pieces = [row['compType'], row['compMag'], str(row['compDate']), str(row['compDoc'])]
    return ' '.join(pieces)


# Read the similarity pairs written by the previous section.
sims_csv_path = abs_dir + 'MJP/Chapter3-Poetry/Poetry-MJP/mjp_ALL-poetry-letters-wasteLand_sims.csv'
poetries = pd.read_csv(sims_csv_path, sep=',')

# Build one readable label per side of every pair, row by row.
source_labels = poetries.apply(label_maker, axis=1)
comp_labels = poetries.apply(compLabel_maker, axis=1)

# Keep only the two labels and the similarity score.
poetries = poetries.assign(sourceLabel = source_labels, compLabel = comp_labels)
poetries = poetries[['sourceLabel', 'compLabel', 'similarity']]

poetries.head()

CPU times: user 37.7 ms, sys: 3.49 ms, total: 41.1 ms
Wall time: 43.6 ms


Unnamed: 0,sourceLabel,compLabel,similarity
0,poetry marsden magazines 1914-08-01 10016,poetry the little review 1915-02-01 10290,0.611956
1,poetry marsden magazines 1914-03-02 13233,poetry the little review 1915-02-01 10290,0.631541
2,poetry marsden magazines 1914-06-01 13350,poetry the little review 1915-02-01 10290,0.61621
3,"poetry poetry, a magazine of verse 1913-11-01 ...",poetry the little review 1915-02-01 10290,0.600857
4,poetry marsden magazines 1914-06-01 10040,"poetry poetry, a magazine of verse 1914-10-01 ...",0.602744


# 5. Create Nodes and Links

In [21]:
# Subset by similarity
poetries = poetries.query('similarity > .7')

# Create Nodes Dataframe.
# One row per distinct label, source labels first and comparison labels after.
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
nodes = pd.concat([poetries['sourceLabel'], poetries['compLabel']], ignore_index=True) \
    .drop_duplicates() \
    .to_frame(name = 'label')

# Give every label an integer code and order the rows by that code.
nodes = nodes \
    .assign(code = nodes['label'].astype('category').cat.codes)  \
    .sort_values(['code'], ascending=True) # Sorting is necessary to match labels with source codes.


# Lookup from label text to its integer code.
nodes_dictionary = nodes.set_index('label')['code'].to_dict()

# Create Links Dataframe.
links = poetries \
    .assign(source = poetries['sourceLabel'].map(nodes_dictionary),
            target = poetries['compLabel'].map(nodes_dictionary))

# links = links[['sourceLabel', 'compLabel', 'similarity']]

# Subset Dataframe based on occurrences of matches.
# Both counts are computed on the unfiltered links; a link is kept when its
# source or its target takes part in more than five links.
links = links \
    .assign(source_occurrence = links.groupby(['source'])['target'].transform('count')) \
    .assign(target_occurrence = links.groupby(['target'])['source'].transform('count')) \
    .query('(source_occurrence > 5) | (target_occurrence > 5)')

# # Subset for Lindsay - Lowell - Curran focus.
# links = links[(links['source'].isin([6, 7, 14])) | (links['target'].isin([7, 14]))] \
#     .query('target != 2') #     Remove Eddy for clarity in graph.

links.head()

Unnamed: 0,sourceLabel,compLabel,similarity,source,target,source_occurrence,target_occurrence
22,poetry marsden magazines 1915-03-01 10542,poetry the little review 1918-02-01 13006,0.779712,14,55,9,4
23,poetry the little review 1916-11-01 10594,poetry the little review 1918-02-01 13006,0.86086,43,55,6,4
25,poetry marsden magazines 1915-12-01 11125,poetry the little review 1918-02-01 13006,0.845003,16,55,8,4
26,poetry the little review 1916-08-01 11478,poetry the little review 1918-02-01 13006,0.851722,42,55,7,4
27,poetry marsden magazines 1915-03-01 10542,poetry the little review 1918-02-01 11352,0.773811,14,54,9,4


# 6. Visualize Sankey

In [22]:
# Node appearance and labels; `nodes` is passed in its existing row order.
sankey_node = dict(
    pad = 15,
    thickness = 20,
    line = dict(color = "black", width = 0.5),
    label = nodes['label'])

# Links: integer source/target codes, weighted by the similarity score.
sankey_link = dict(
    source = links["source"],
    target = links["target"],
    value = links["similarity"])

sankey_trace = go.Sankey(node = sankey_node, link = sankey_link)
fig = go.Figure(data=[sankey_trace])

fig.update_layout(title = "Poetry & Letters Similarity in the Modernist Journals Project", 
                  font_size=12)
fig.show()
# fig.write_image('Poetry-MJP/Visualizations/poetry_sankey.png')