In [1]:
import os
import os.path
import shutil
import numpy as np
import pandas as pd
import string
import collections
import pickle
import nltk.data
import sys
from time import time
import matplotlib.pyplot as plt
%matplotlib inline
import logging
import tensorflow as tf

from __future__ import print_function
from nltk.corpus import PlaintextCorpusReader
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alberto.garza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


We begin by de-serializing the data stream from our pickle file, `shakespeare_plays.pickle`, which contains the speeches from Shakespeare's plays (the cell below loads only the plays file, not the poems).

In [2]:
# Load the pickled speech records from disk into `speeches`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only open pickle files that come from a trusted source.
with open('shakespeare_plays.pickle', 'rb') as handle:
    # The following cell reads the 'speech_text' and 'speaker' keys of each record.
    speeches = pickle.load(handle)

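We take the speeches and collect their text into `shakespeare_texts` (one entry per speech) and their speakers into `shakespeare_speakers`. We then combine the two into a DataFrame, join in each character's gender, and drop the lines whose speaker has no known gender.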

In [3]:
# Split every unpickled speech record into two parallel lists:
# the spoken text and the name of the character who speaks it.
shakespeare_texts = [speech['speech_text'] for speech in speeches]
shakespeare_speakers = [speech['speaker'] for speech in speeches]

# Tab-separated lookup table; it is joined on 'character' below and is
# expected to supply the 'gender' column.
characters = pd.read_csv('characters.txt', sep='\t')

# One row per speech: who said it and what was said.
shakes = pd.DataFrame(list(zip(shakespeare_speakers, shakespeare_texts)))
shakes.columns = ['character', 'text']

# Attach each speaker's gender, keep only the rows where a gender was found,
# and renumber the surviving rows from zero.
shakes = (
    shakes
    .merge(characters, how='left', on=['character'])
    .dropna(subset=['gender'])
    .reset_index(drop=True)
)
shakes.head()

Unnamed: 0,character,text,gender
0,COUNTESS,"In delivering my son from me, I bury a second ...",Female
1,BERTRAM,"And I in going, madam, weep o'er my father's d...",Male
2,COUNTESS,What hope is there of his majesty's amendment?,Female
3,COUNTESS,"This young gentlewoman had a father,--O, that\...",Female
4,COUNTESS,"He was famous, sir, in his profession, and it ...",Female


In [4]:
# Size of the corpus: one document per retained speech, one category per
# distinct gender value.
n_documents = len(shakes['text'])
n_categories = len(shakes['gender'].unique())

print("%d documents" % n_documents)
print("%d categories" % n_categories)

19661 documents
2 categories


In [5]:
# Ground-truth gender of every line; the clustering cell scores its
# cluster assignments against these labels.
labels = shakes['gender']

# Ask the clustering for as many clusters as there are distinct labels.
true_k = len(np.unique(labels))

In [6]:
n_features = 10000
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()

# NOTE(review): this hashing vectorizer is constructed but never applied in
# the visible cells -- the feature matrix below comes from `vectorizer` only,
# so `n_features` does not affect X.
hasher = HashingVectorizer(
    n_features=n_features,
    stop_words='english',
    norm=None,
    binary=False,
)

# TF-IDF over the speech texts: drop English stop words, terms that occur in
# more than half of the documents or in fewer than two, and cap the
# vocabulary at 100 terms.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    max_features=100,
    min_df=2,
    stop_words='english',
)

X = vectorizer.fit_transform(shakes['text'])

elapsed = time() - t0
print("done in %fs" % elapsed)
print("n_samples: %d, n_features: %d" % X.shape)
print()

Extracting features from the training dataset using a sparse vectorizer
done in 0.405603s
n_samples: 19661, n_features: 100



In [7]:
print("Performing dimensionality reduction using LSA")
t0 = time()

# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
#
# TruncatedSVD defaults to a randomized solver, so a fixed random_state is
# required for the projection (and every score computed from it downstream)
# to be reproducible across Restart & Run All.
# NOTE(review): 99 components out of the 100 TF-IDF features is barely a
# reduction (the explained variance printed below is ~99%); confirm that this
# is intended.
svd = TruncatedSVD(n_components=99, random_state=0)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

# NOTE(review): this rebinds X from the TF-IDF matrix to its LSA projection,
# so the cell is not idempotent -- re-run the feature-extraction cell before
# running this one a second time.
X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

# Fraction of the TF-IDF variance retained by the kept components, shown as a
# whole-number percentage (truncated, not rounded).
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()

Performing dimensionality reduction using LSA
done in 0.561604s
Explained variance of the SVD step: 99%



In [8]:
# #############################################################################
# Do the actual clustering

# Both estimators use a single random initialisation (n_init=1) and the
# silhouette score is computed on a random subsample, so without a fixed seed
# every run of this cell prints different numbers.
CLUSTERING_SEED = 0

mbkm = MiniBatchKMeans(n_clusters=true_k,
                       init='k-means++',
                       n_init=1,
                       init_size=500,
                       batch_size=1000,
                       random_state=CLUSTERING_SEED,
                       verbose=False)

km = KMeans(n_clusters=true_k,
            init='k-means++',
            max_iter=100,
            n_init=1,
            random_state=CLUSTERING_SEED,
            verbose=False)

#fitting
km.fit(X)
mbkm.fit(X)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement when it exists and fall back for older versions.
# The result is kept as a plain list so `terms` has the same type either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()


def _print_cluster_report(title, model):
    """Print agreement metrics and the top terms per cluster for one model.

    Reads the notebook globals ``labels``, ``X``, ``svd``, ``terms`` and
    ``true_k``.

    Parameters
    ----------
    title : str
        Name shown in the "<title> Summary:" header line.
    model : fitted KMeans or MiniBatchKMeans
        Estimator already fitted on ``X``; its ``labels_`` and
        ``cluster_centers_`` attributes are used.

    Returns
    -------
    tuple
        ``(original_space_centroids, order_centroids)``: the centroids mapped
        back through ``svd.inverse_transform`` and, for each centroid, the
        term indices sorted by descending weight.
    """
    assigned = model.labels_

    print("%s Summary:" % title)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, assigned))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, assigned))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, assigned))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, assigned))
    # The silhouette is estimated on 1000 sampled rows; seed the sampling so
    # the printed value is repeatable.
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, assigned, sample_size=1000,
                                     random_state=CLUSTERING_SEED))
    print()

    print("Top terms per cluster:")
    # Centroids live in LSA space; map them back to TF-IDF term space so each
    # coordinate corresponds to an entry of `terms`.
    centroids_in_term_space = svd.inverse_transform(model.cluster_centers_)
    ranked_term_indices = centroids_in_term_space.argsort()[:, ::-1]

    for i in range(true_k):
        print("Cluster %d:" % i, end='')
        for ind in ranked_term_indices[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

    return centroids_in_term_space, ranked_term_indices


_print_cluster_report("K-Means", km)

print()
print()

# Keep the module-level names the original cell left behind (they held the
# mini-batch results, which were computed last).
original_space_centroids, order_centroids = _print_cluster_report(
    "Minibatch K-Means", mbkm)

K-Means Summary:
Homogeneity: 0.000
Completeness: 0.000
V-measure: 0.000
Adjusted Rand-Index: -0.007
Silhouette Coefficient: 0.105

Top terms per cluster:
Cluster 0: thou shall good thee ll thy come let love man
Cluster 1: sir lord good shall ay know did come ll pray


Minibatch K-Means Summary:
Homogeneity: 0.000
Completeness: 0.000
V-measure: 0.000
Adjusted Rand-Index: -0.002
Silhouette Coefficient: 0.110

Top terms per cluster:
Cluster 0: thou thy thee art st hast dost say know come
Cluster 1: lord sir good shall ll come let love know man
