# Comparing Writing Styles

## Using Word Embeddings and Dynamic Time Warping

In [25]:
%matplotlib inline

from gensim.models import Word2Vec
import numpy as np
import cPickle
import nltk.data
import pyprind
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("whitegrid")

from sklearn.decomposition import RandomizedPCA
from utils import book_data
from fastdtw import fastdtw
from sklearn import manifold
from scipy.signal import gaussian
from scipy.ndimage import filters
from sklearn.cluster import KMeans

import os
import sys
reload(sys)
sys.setdefaultencoding("utf8")

### Load word2vec model
The model must be trained beforehand using `train_word2vec.py` in `src\utils`.

In [10]:
# Load the previously saved Word2Vec model from disk; it is used below both
# for the vocabulary clustering and for building sentence vectors.
model = Word2Vec.load("data/word2vec_model")

### Find cluster centers

In [15]:
# Snapshot the vocabulary once so the names and the vectors stay aligned.
word_names = list(model.vocab)

# One embedding row per vocabulary word, in the same order as word_names.
vecs = np.array([model[word] for word in word_names])

# Partition the embedding space into four clusters; the centres are used
# later as reference directions for every sentence vector.
km = KMeans(n_clusters=4)
km.fit(vecs)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=4, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

### Start Reading books

In [16]:
# Find all books for comparison
root_path = "../gutenberg"

# Keep only the EPUB files, in the order os.listdir reports them (later
# cells refer to books by their position in this list).
all_books = [fname for fname in os.listdir(root_path) if fname.endswith(".epub")]

### Get Sentence vectors

In [21]:
data = []

# NLTK resource names are POSIX-style paths, so use forward slashes. The
# previous backslash form only worked because "\p" and "\e" happen not to be
# string escape sequences.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

# Build one entry per book, in the same order as all_books, so that
# data[i] always corresponds to all_books[i].
bar = pyprind.ProgBar(len(all_books))
for book in all_books:
    book_path = os.path.join(root_path, book)
    # presumably returns the book's sequence of sentence vectors -- verify
    # against utils.book_data.get_sentence_vector
    data.append(book_data.get_sentence_vector(book_path, model, tokenizer))
    bar.update()

### DTW distance

In [22]:
def dtw_distance(sig_one, sig_two, fft=False):
    """
    Return the dynamic time warping (DTW) distance between two signals.

    sig_one, sig_two -- array-like sequences; each element (row) is one
                        observation and is compared with the Euclidean norm.
    fft              -- when True, both signals are replaced by their
                        N-dimensional FFT (np.fft.fftn) before alignment.

    Returns the accumulated alignment cost reported by fastdtw as a
    non-negative NumPy float.
    """

    if fft:
        sig_one = np.fft.fftn(sig_one)
        sig_two = np.fft.fftn(sig_two)

    def euclidean(a, b):
        # np.linalg.norm uses |x|**2, so the point distance stays real and
        # non-negative even for the complex spectra produced when fft=True.
        # (sum((a - b) ** 2) ** 0.5 yields a complex number there, which the
        # DTW cost minimisation cannot order.) For real input both forms
        # give the same value.
        return np.linalg.norm(a - b)

    d = fastdtw(sig_one, sig_two, dist=euclidean)[0]

    # The cost is a sum of norms, hence already >= 0; the former
    # sqrt(abs(d ** 2)) was just |d|. Keep returning a NumPy scalar.
    return np.float64(d)

### Filters

In [26]:
def gauss(x, y, width=200, std=60):
    """
    Smooth ``y`` with a normalised Gaussian window.

    x     -- unused; kept so existing callers do not need to change.
    y     -- array to smooth; the convolution runs along its last axis with
             convolve1d's default boundary handling.
    width -- number of points in the Gaussian window (default 200).
    std   -- standard deviation of the window, in samples (default 60).

    Returns an array of the same shape as ``y``.
    """
    kernel = gaussian(width, std)
    # Normalise the window to unit sum so smoothing preserves the signal level.
    return filters.convolve1d(y, kernel / kernel.sum())

def cos_sim(a, b):
    """
    Return the cosine similarity between the vectors in ``a`` and ``b``.

    a -- a single vector (1-D) or a matrix whose rows are vectors.
    b -- a single vector (1-D) or a matrix whose rows are vectors.

    For two matrices the result has shape (rows of a, rows of b); for two
    single vectors it is a scalar. Each pair is normalised by the norms of
    the two vectors involved. A pair that contains a zero vector gets
    similarity 0 instead of NaN.
    """
    dots = np.dot(a, b.T)

    # Per-vector norms. np.linalg.norm without an axis would return the
    # Frobenius norm of the whole matrix, which only rescales the dot
    # products by one global constant instead of normalising each row.
    denom = np.multiply.outer(np.linalg.norm(a, axis=-1),
                              np.linalg.norm(b, axis=-1))

    with np.errstate(divide="ignore", invalid="ignore"):
        sims = dots / denom

    # [()] turns the 0-d result of the vector/vector case back into a scalar
    # and leaves real arrays untouched.
    return np.where(denom == 0, 0.0, sims)[()]

def clean(signal):
    """
    Project ``signal`` onto the k-means cluster centres and smooth the result.

    The signal is compared against ``km.cluster_centers_`` with cos_sim, and
    every column of that similarity matrix is then passed through gauss.
    Returns an array with the same shape as the similarity matrix.
    """
    positions = xrange(signal.shape[0])
    similarity = cos_sim(signal, km.cluster_centers_)

    # Smooth one cluster's similarity track at a time.
    smoothed = np.zeros(similarity.shape)
    n_tracks = similarity.shape[1]
    for track in xrange(n_tracks):
        smoothed[:, track] = gauss(positions, similarity[:, track])

    return smoothed

### Generating distance matrix

In [27]:
lim = len(all_books)

# Smooth every book's signal exactly once. The pairwise loop below would
# otherwise re-run clean() on both books for every pair it visits.
cleaned_signals = [clean(data[k]) for k in xrange(lim)]

# Pairwise distance matrix; -1 marks a pair that has not been computed yet.
distances = -1 * np.ones((lim, lim))

bar = pyprind.ProgBar(lim * lim)
for i in xrange(lim):
    for j in xrange(lim):
        if i == j:
            # A book is at distance zero from itself.
            distances[i, j] = 0
        elif distances[i, j] == -1:
            # The result is mirrored into both halves of the matrix, so each
            # unordered pair is only aligned once.
            dist = dtw_distance(cleaned_signals[i], cleaned_signals[j])
            distances[i, j] = dist
            distances[j, i] = dist
        bar.update()

In [None]:
plot_idx = 22 # index of book to get signal plots

# One stacked panel per cluster centre, all sharing the x axis.
f, axarr = plt.subplots(4, sharex=True, figsize=(15, 13))

# Raw and smoothed similarity tracks for the selected book.
raw_similarity = cos_sim(data[plot_idx], km.cluster_centers_)
cleaned = clean(data[plot_idx])

for idx, ax in enumerate(axarr):
    # Faint raw track underneath, smoothed track on top.
    ax.plot(raw_similarity[:, idx], alpha=0.3)
    ax.plot(cleaned[:, idx])
    ax.set_xlim([0, 2800])
    ax.set_yticks([])

plt.show()

### Multidimensional Scaling Plot

In [None]:
# Embed the books in 2-D from the precomputed pairwise distance matrix.
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                   random_state=1234, dissimilarity="precomputed", n_jobs=1)

pos = mds.fit_transform(distances)

# Rotate the data
# NOTE(review): RandomizedPCA is not available in newer scikit-learn
# releases -- confirm the installed version before upgrading.
clf = RandomizedPCA(n_components=2)

pos = clf.fit_transform(pos)


In [None]:
fig = plt.figure(figsize=(15, 15))

# One marker per book at its 2-D embedding position.
xs, ys = pos[:, 0], pos[:, 1]
plt.scatter(xs, ys, s=200)

# Label each marker with the file name minus its 5-character ".epub"
# suffix, nudged to the right so the text does not cover the marker.
for i, book in enumerate(all_books):
    label = book[:-5]
    plt.annotate(label, (pos[i, 0] + 2, pos[i, 1]))

plt.show()