RNN based language model for the Shakespeare plays and poems

[Recurrent neural network based language model](http://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf)

In [10]:
import os
import os.path
import shutil
import numpy as np
import pandas as pd
import string
import collections
import itertools
import random
import json
import pickle

import nltk.data

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
#print(tf.__version__)
nltk.download('punkt')

from nltk.corpus import PlaintextCorpusReader
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize

from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alberto.garza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


We begin by de-serializing the data from our pickle files, which contain all texts from both Shakespeare's plays and poems.

In [11]:
# Read the pickled speech records from disk into `speeches`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at a pickle file from a trusted source.
with open('shakespeare_plays.pickle', mode='rb') as pickle_file:
    speeches = pickle.load(pickle_file)

We take the speeches and poems and feed their text into `shakespeare_texts`, which will hold one entry per speech (not per sentence), with the matching speakers in `shakespeare_speakers`.

In [12]:
# Pull the speaker and the spoken text out of every unpickled speech record.
shakespeare_speakers = [speech['speaker'] for speech in speeches]
shakespeare_texts = [speech['speech_text'] for speech in speeches]

# Tab-separated lookup table; the merge below relies on its 'character'
# column and the filter on its 'gender' column.
characters = pd.read_csv('characters.txt', sep='\t')

# One row per speech: who spoke it and what was said.
speech_rows = list(zip(shakespeare_speakers, shakespeare_texts))
shakes = (
    pd.DataFrame(speech_rows, columns=['character', 'text'])
    # Left join keeps every speech until the gender filter below runs.
    .merge(characters, how='left', on=['character'])
)

# Keep only speeches whose speaker has a gender, then renumber the rows from 0.
shakes = shakes.loc[shakes.gender.notnull()].reset_index(drop=True)
shakes.head()

Unnamed: 0,character,text,gender
0,COUNTESS,"In delivering my son from me, I bury a second ...",Female
1,BERTRAM,"And I in going, madam, weep o'er my father's d...",Male
2,COUNTESS,What hope is there of his majesty's amendment?,Female
3,COUNTESS,"This young gentlewoman had a father,--O, that\...",Female
4,COUNTESS,"He was famous, sir, in his profession, and it ...",Female


In [13]:
# Report how many speeches survived the gender filter and how many distinct
# gender labels they carry.
n_documents = len(shakes.text)
n_categories = len(shakes.gender.unique())
print("%d documents" % n_documents)
print("%d categories" % n_categories)

19661 documents
2 categories


In [31]:
# Gender of each speech's speaker, and the number of distinct gender values.
labels = shakes['gender']
true_k = len(np.unique(labels))

In [32]:
# Command-line options for the clustering run, declared as (flag, settings)
# pairs so every option is registered the same way, in the order --help shows.
op = OptionParser()

option_specs = [
    ("--lsa",
     dict(dest="n_components",
          type="int",
          help="Preprocess documents with latent semantic analysis.")),
    ("--no-minibatch",
     dict(action="store_false",
          dest="minibatch",
          default=True,
          help="Use ordinary k-means algorithm (in batch mode).")),
    ("--no-idf",
     dict(action="store_false",
          dest="use_idf",
          default=True,
          help="Disable Inverse Document Frequency feature weighting.")),
    # The next two options give no dest; optparse derives it from the flag
    # name (use_hashing / n_features).
    ("--use-hashing",
     dict(action="store_true",
          default=False,
          help="Use a hashing feature vectorizer")),
    ("--n-features",
     dict(type=int,
          default=10000,
          help="Maximum number of features (dimensions) to extract from text.")),
    ("--verbose",
     dict(action="store_true",
          dest="verbose",
          default=False,
          help="Print progress reports inside k-means algorithm.")),
]
for flag, settings in option_specs:
    op.add_option(flag, **settings)

# Show the module docstring followed by the generated usage text.
print(__doc__)
op.print_help()

def is_interactive():
    """Return True when the ``__main__`` module has no ``__file__`` attribute.

    The absence of ``__file__`` on ``__main__`` is used here as the signal
    that the code is running in an interactive session rather than as a
    script file.
    """
    main_module = sys.modules['__main__']
    if hasattr(main_module, '__file__'):
        return False
    return True

# Work-around for Jupyter notebook and IPython console: in an interactive
# session sys.argv is not this script's own command line, so parse an empty
# argument list there and only read sys.argv when run as a script.
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
if args:
    # op.error() prints the usage message plus this text to stderr and then
    # exits with status 2 itself, so no explicit sys.exit() is needed after it
    # (the former sys.exit(1) was unreachable).
    op.error("this script takes no arguments.")

Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --lsa=N_COMPONENTS    Preprocess documents with latent semantic analysis.
  --no-minibatch        Use ordinary k-means algorithm (in batch mode).
  --no-idf              Disable Inverse Document Frequency feature weighting.
  --use-hashing         Use a hashing feature vectorizer
  --n-features=N_FEATURES
                        Maximum number of features (dimensions) to extract
                        from text.
  --verbose             Print progress reports inside k-means algorithm.


In [33]:
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()

if not opts.use_hashing:
    # Default path: vocabulary-based TF-IDF, capped at opts.n_features terms.
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=opts.n_features,
                                 min_df=2,
                                 stop_words='english',
                                 use_idf=opts.use_idf)
elif opts.use_idf:
    # Hashing with IDF: leave the hasher output un-normalised (norm=None) and
    # let TfidfTransformer apply the IDF weighting on top of it.
    hasher = HashingVectorizer(n_features=opts.n_features,
                               stop_words='english',
                               alternate_sign=False,
                               norm=None,
                               binary=False)
    vectorizer = make_pipeline(hasher, TfidfTransformer())
else:
    # Hashing without IDF: the hasher L2-normalises its own output.
    vectorizer = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english',
                                   alternate_sign=False,
                                   norm='l2',
                                   binary=False)

X = vectorizer.fit_transform(shakes.text)

elapsed = time() - t0
print("done in %fs" % elapsed)
print("n_samples: %d, n_features: %d" % X.shape)
print()

Extracting features from the training dataset using a sparse vectorizer
done in 0.456000s
n_samples: 19661, n_features: 10000



In [34]:
# Optional LSA step: runs only when opts.n_components is set (the --lsa option).
if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()

    # The vectorizer output is normalized, which makes KMeans behave like
    # spherical k-means. SVD output is not normalized, so a Normalizer is
    # chained after the SVD to restore that property.
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    # NOTE: X is overwritten with its reduced form, so re-running this cell
    # without first re-running the vectorizer cell reduces already-reduced data.
    X = lsa.fit_transform(X)

    elapsed = time() - t0
    print("done in %fs" % elapsed)

    explained_variance = svd.explained_variance_ratio_.sum()
    explained_percent = int(explained_variance * 100)
    print("Explained variance of the SVD step: {}%".format(explained_percent))

    print()

In [35]:
# #############################################################################
# Do the actual clustering

# Keyword arguments that both estimators receive unchanged.
shared_params = dict(n_clusters=true_k,
                     init='k-means++',
                     n_init=1,
                     verbose=opts.verbose)

if opts.minibatch:
    km = MiniBatchKMeans(init_size=1000,
                         batch_size=1000,
                         **shared_params)
else:
    km = KMeans(max_iter=100,
                **shared_params)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
elapsed = time() - t0
print("done in %0.3fs" % elapsed)
print()

# Scores that compare the reference labels with the cluster assignments,
# each paired with the exact line format it is reported in.
label_based_scores = (
    ("Homogeneity: %0.3f", metrics.homogeneity_score),
    ("Completeness: %0.3f", metrics.completeness_score),
    ("V-measure: %0.3f", metrics.v_measure_score),
    ("Adjusted Rand-Index: %.3f", metrics.adjusted_rand_score),
)
for line_format, score_fn in label_based_scores:
    print(line_format % score_fn(labels, km.labels_))

# The silhouette coefficient is computed from the feature matrix and the
# cluster assignments only; it is evaluated on a sample of 1000 rows.
silhouette = metrics.silhouette_score(X, km.labels_, sample_size=1000)
print("Silhouette Coefficient: %0.3f" % silhouette)

print()


# Top terms are only listed for the TfidfVectorizer path: a hashing
# vectorizer keeps no vocabulary to map feature indices back to words.
if not opts.use_hashing:
    print("Top terms per cluster:")

    if opts.n_components:
        # Centroids were fitted in the reduced LSA space; project them back
        # to the term space before ranking the terms.
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2 in favour of get_feature_names_out(); prefer the new method and fall
    # back to the old one so the cell runs on either side of that change.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    for i in range(true_k):
        # The ten highest-weighted terms of centroid i, each preceded by a space.
        top_terms = ''.join(' %s' % terms[ind] for ind in order_centroids[i, :10])
        print("Cluster %d:%s" % (i, top_terms))

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=2,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)
done in 0.072s

Homogeneity: 0.000
Completeness: 0.000
V-measure: 0.000
Adjusted Rand-Index: 0.005
Silhouette Coefficient: 0.011

Top terms per cluster:
Cluster 0: thou sir shall good thee thy come ll know love
Cluster 1: lord good ay ll shall did say noble know think
