# Sort MJP Corpus for Different Scales

### &

## Create doc2vec Models 

In [5]:
import os

import gensim
import numpy as np
import pandas as pd

# Root directory of the MJP project. Every input/output path in this notebook
# is built by plain string concatenation (abs_dir + 'Output/...'), so the value
# must end with a path separator; os.path.join(..., "") guarantees that.
# Set the MJP_DIR environment variable to run on another machine; when it is
# unset the original local path is used.
abs_dir = os.path.join(
    os.environ.get("MJP_DIR", "/Users/williamquinn/Desktop/DH/Python/MJP/"),
    "",
)

## Issues.

### Create Corpus

In [9]:
%%time

# Load data.
# mjp_documents.txt is created by the XML parser; only these four columns are kept.
mjp_df = pd.read_csv(abs_dir + 'Output/mjp_documents.txt',
                     sep='\t')[['magazine', 'date', 'type', 'text']]

# Remove bibliographic information (volume/issue, year) and running titles.
# The patterns are applied in this order, each on the output of the previous
# one, and every match is deleted.
boilerplate_patterns = [
    # Trailing ".0" on float-formatted numbers. This has to run BEFORE the
    # punctuation is stripped: afterwards there is no "." left to match.
    r'\.0\b',
    r'[^\w\s]',                        # everything that is not a word char or whitespace
    r'pgbrk',                          # presumably the parser's page-break marker
    r'vol \w+ no \d+ \w+ \d{4}',       # "vol <x> no <n> <word> <year>"
    r'\w+ \d{4}',                      # "<word> <4 digits>", e.g. a month followed by a year
    r'vol \w+ no \d+',
    # Leftover abbreviations. The \b anchor keeps these from eating the tail of
    # ordinary words ("piano music" used to become "piamusic").
    # NOTE: the standalone English word "no" is still removed by the last one.
    r'\bv ',
    r'\bvol ',
    r'\bno ',
    # Magazine titles.
    r'poetry a magazine of verse',
    r'the masses',
    # The article "a" is optional; the previous character class [a ] matched a
    # single character only, so "the freewoman a weekly ..." was never removed.
    r'the freewoman\s+(?:a\s+)?weekly feminist review',
    r'the little review',
    r'the crisis',
    r'the egoist',
]

cleaned_text = (
    mjp_df['text']
    .fillna('')      # otherwise astype(str) turns missing text into the token "nan"
    .astype(str)
    .str.lower()
    .str.strip()
)
for pattern in boilerplate_patterns:
    cleaned_text = cleaned_text.str.replace(pattern, '', regex=True)
mjp_df['text'] = cleaned_text

# Concatenate rows by group: one row per (magazine, date) pair.
mjp_df = (
    mjp_df.groupby(['magazine', 'date'])['text']
    .apply(' '.join)
    .reset_index()
)

# Store the row number as a regular column so it survives the round trip
# through the TSV file (written with index=False); the model cell builds its
# 'doc{mjp_index}' tags from it.
mjp_df['mjp_index'] = mjp_df.index

# Save Issues.
mjp_df.to_csv(abs_dir + "Output/mjp_issues.txt", sep="\t", encoding="utf-8", index=False)

CPU times: user 12.3 s, sys: 446 ms, total: 12.8 s
Wall time: 13.1 s


### Create Model

In [10]:
%%time

mjp_df = pd.read_csv(abs_dir + 'Output/mjp_issues.txt', sep='\t')

# pandas reads an empty field back as NaN (a float), which simple_preprocess
# cannot tokenize; treat such rows as empty documents instead of crashing.
mjp_df['text'] = mjp_df['text'].fillna('')

# Create the output directory up front so that a missing folder is not only
# discovered by model.save() after the multi-minute training has finished.
model_path = abs_dir + "Output/doc2vec/mjp_issues-d2v.bin"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Create Model
# One TaggedDocument per row: the tokenized text, tagged 'doc<mjp_index>'.
tagged_docs = mjp_df.apply(
    lambda x: gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(x.text),
        ['doc{}'.format(x.mjp_index)],
    ),
    axis=1,
)

training_corpus = tagged_docs.values

# Training.
# NOTE(review): no seed is passed, so the learned vectors will differ from run to run.
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=4, epochs=20)

model.build_vocab(training_corpus)

model.train(training_corpus, total_examples=model.corpus_count, epochs=model.epochs)

# Store Model.
model.save(model_path)

CPU times: user 11min 18s, sys: 7.99 s, total: 11min 26s
Wall time: 4min 33s


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Genres.

### Create Corpus

In [18]:
%%time

# Load data.
# Only these four columns of mjp_documents.txt are kept.
mjp_df = pd.read_csv(abs_dir + 'Output/mjp_documents.txt',
                     sep='\t')[['magazine', 'date', 'type', 'text']]

# Remove bibliographic information (volume/issue, year) and running titles.
# The patterns are applied in this order, each on the output of the previous
# one, and every match is deleted.
boilerplate_patterns = [
    # Trailing ".0" on float-formatted numbers. This has to run BEFORE the
    # punctuation is stripped: afterwards there is no "." left to match.
    r'\.0\b',
    r'[^\w\s]',                        # everything that is not a word char or whitespace
    r'pgbrk',                          # presumably the parser's page-break marker
    r'vol \w+ no \d+ \w+ \d{4}',       # "vol <x> no <n> <word> <year>"
    r'\w+ \d{4}',                      # "<word> <4 digits>", e.g. a month followed by a year
    r'vol \w+ no \d+',
    # Leftover abbreviations. The \b anchor keeps these from eating the tail of
    # ordinary words ("piano music" used to become "piamusic").
    # NOTE: the standalone English word "no" is still removed by the last one.
    r'\bv ',
    r'\bvol ',
    r'\bno ',
    # Magazine titles.
    r'poetry a magazine of verse',
    r'the masses',
    # The article "a" is optional; the previous character class [a ] matched a
    # single character only, so "the freewoman a weekly ..." was never removed.
    r'the freewoman\s+(?:a\s+)?weekly feminist review',
    r'the little review',
    r'the crisis',
    r'the egoist',
]

cleaned_text = (
    mjp_df['text']
    .fillna('')      # otherwise astype(str) turns missing text into the token "nan"
    .astype(str)
    .str.lower()
    .str.strip()
)
for pattern in boilerplate_patterns:
    cleaned_text = cleaned_text.str.replace(pattern, '', regex=True)
mjp_df['text'] = cleaned_text

# Concatenate rows by group: one row per (magazine, type, date) triple.
mjp_df = (
    mjp_df.groupby(['magazine', 'type', 'date'])['text']
    .apply(' '.join)
    .reset_index()
)

# Keep only the genres of interest; every other type is dropped.
# .copy() gives an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
filter_types = ['advertisements', 'poetry', 'articles', 'drama', 'fiction', 'letters']
mjp_df = mjp_df[mjp_df['type'].isin(filter_types)].copy()

# The index still comes from the unfiltered frame, so mjp_index is unique but
# not contiguous; the model cell builds its 'doc{mjp_index}' tags from it.
mjp_df['mjp_index'] = mjp_df.index

# Save genres.
mjp_df.to_csv(abs_dir + "Output/mjp_genres.txt", sep="\t", encoding="utf-8", index=False)

CPU times: user 13.5 s, sys: 500 ms, total: 14 s
Wall time: 14.7 s


### Create Model

In [19]:
%%time

mjp_df = pd.read_csv(abs_dir + 'Output/mjp_genres.txt', sep='\t')

# pandas reads an empty field back as NaN (a float), which simple_preprocess
# cannot tokenize; treat such rows as empty documents instead of crashing.
mjp_df['text'] = mjp_df['text'].fillna('')

# Create the output directory up front so that a missing folder is not only
# discovered by model.save() after the multi-minute training has finished.
model_path = abs_dir + "Output/doc2vec/mjp_genres-d2v.bin"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Create Model
# One TaggedDocument per row: the tokenized text, tagged 'doc<mjp_index>'.
tagged_docs = mjp_df.apply(
    lambda x: gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(x.text),
        ['doc{}'.format(x.mjp_index)],
    ),
    axis=1,
)

training_corpus = tagged_docs.values

# Training.
# NOTE(review): no seed is passed, so the learned vectors will differ from run to run.
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=4, epochs=20)

model.build_vocab(training_corpus)

model.train(training_corpus, total_examples=model.corpus_count, epochs=model.epochs)

# Store Model.
model.save(model_path)

CPU times: user 16min 11s, sys: 11.6 s, total: 16min 22s
Wall time: 6min 21s
