## Imports

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
from IPython import display

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

import itertools
import pickle
import math
import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models import word2vec

from word2vec_as_MF import Word2vecMF
from functions import *

import time

## Read and preprocess enwik9

In [2]:
# Load enwik 9
# np.loadtxt reads the file as text (dtype=str) using '.' as the field
# delimiter; the result is iterated item by item in the parsing cell below.
# NOTE(review): "data/xaa" is presumably one chunk of the full corpus (the
# name matches the default output of `split`) — confirm.
# Full-corpus alternative, kept for reference:
# data = np.loadtxt("data/enwik9.txt", dtype=str, delimiter='.')
data = np.loadtxt("data/xaa", dtype=str, delimiter='.')

In [3]:
def wiki_to_wordlist(sentence, remove_stopwords=False):
    """Tokenise one sentence into a list of whitespace-separated words.

    Args:
        sentence: Text to tokenise. It is split with ``str.split()``, so any
            run of whitespace acts as a separator. Case is left untouched.
        remove_stopwords: When true, drop every token that exactly matches an
            entry of NLTK's English stop-word list (the comparison is
            case-sensitive). Off by default.

    Returns:
        The tokens as a list of strings, in their original order.
    """
    tokens = sentence.split()
    if not remove_stopwords:
        return tokens

    # Build the set once per call so each membership check is a hash lookup.
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

In [4]:
# Tokenise every raw item of `data`; `sentences` stays index-aligned with it.
print("Parsing sentences from training set")
sentences = [wiki_to_wordlist(sentence) for sentence in data]

# Positions (into `sentences`) of the items that produced at least one token.
indices = [i for i, sentence in enumerate(sentences) if sentence]

# Keep only the non-empty sentences, as a 1-D object array of token lists.
# The array is typed as object up front and filled element by element because
# np.array(sentences) raises ValueError on ragged input in NumPy >= 1.24, and
# silently builds a 2-D string array when every sentence has the same length.
real_sentences = np.empty(len(indices), dtype=object)
for slot, i in enumerate(indices):
    real_sentences[slot] = sentences[i]

Parsing sentences from training set


In [12]:
# Create word2vec as matrix factorization model
# Builds the model's matrices from the tokenised sentences. The same path is
# read back by load_matrices in the next cell, so this call presumably also
# saves the result to 'enwik-200/matrices.npz' — confirm in Word2vecMF.
# NOTE(review): the meaning of the positional arguments 200 and 5 is defined
# by Word2vecMF.data_to_matrices and is not visible here — confirm.
# NOTE(review): this cell and the loading cell below both rebind model_enwik;
# on a fresh run only one of the two is needed.
model_enwik = Word2vecMF()
model_enwik.data_to_matrices(real_sentences, 200, 5, 'enwik-200/matrices.npz')

In [6]:
# If the model has been already created, load it from file
# Rebinds model_enwik to a fresh Word2vecMF instance and restores its matrices
# from the same path the creation cell above passes to data_to_matrices.
model_enwik = Word2vecMF()
model_enwik.load_matrices(from_file='enwik-200/matrices.npz')

## Train ro_sgns model starting from SVD of SPPMI

In [7]:
# SVD initialization
# Shifted positive PMI matrix: element-wise max(log D - log B, 0).
# Any zero entry of D makes np.log return -inf, which the clamp maps to 0, so
# the resulting divide-by-zero warning is expected and is silenced here only.
# "invalid" warnings (NaN results) are deliberately left enabled.
with np.errstate(divide='ignore'):
    SPPMI = np.maximum(np.log(model_enwik.D) - np.log(model_enwik.B), 0)

# Rank-200 truncated SVD: SPPMI ~= u @ diag(s) @ vt.
u, s, vt = svds(SPPMI, k=200)

# Split the singular values evenly between both factors, so that
# C_svd.T @ W_svd == u @ diag(s) @ vt. Scaling by broadcasting gives the same
# values as multiplying by sqrt(diag(s)) without building the dense k x k
# diagonal matrix or paying for a full matrix product.
sqrt_s = np.sqrt(s)
C_svd = (u * sqrt_s).T
W_svd = sqrt_s[:, np.newaxis] * vt

  


In [13]:
# Install the SVD factors as the model's current C and W matrices.
model_enwik.C = C_svd
model_enwik.W = W_svd

# Persist the initial factors under the SVD initialisation path.
# NOTE(review): the trailing 0 is presumably the iteration index used to name
# the saved files — confirm against Word2vecMF.save_CW.
model_enwik.save_CW('enwik-200/initializations/SVD_dim200', 0)

In [21]:
# Train the model
# Runs opt_experiment (brought in by `from functions import *`) on the model,
# passing the SVD factors as the initial point, and reports wall-clock time.
# NOTE(review): the semantics of mode='PS', eta, from_iter, start_from and the
# leading True in `init` are defined in functions.py, not visible here.
# NOTE(review): d=200 matches the rank k=200 used for the SVD factors above;
# presumably the two must agree — confirm.
start_time = time.time()
opt_experiment(model_enwik,
               mode='PS', 
               d=200,
               eta = 5e-5,
               MAX_ITER=100,
               from_iter=0,
               start_from='SVD',
               init=(True, C_svd, W_svd), display=True)
# Elapsed wall-clock time for the whole training call, in seconds.
print("--- %s seconds ---" % (time.time() - start_time))

Iter #: 1 loss 14.5709831941
Iter #: 2 loss 14.5709831941
Iter #: 3 loss 14.5461437313
Iter #: 4 loss 14.5072837444
Iter #: 5 loss 14.395312055
Iter #: 6 loss 14.1393172118
Iter #: 7 loss 13.8474359115
Iter #: 8 loss 13.6249103605
Iter #: 9 loss 13.4567772505
Iter #: 10 loss 13.322318689
Iter #: 11 loss 13.2091472975
Iter #: 12 loss 13.1097102004
Iter #: 13 loss 13.0189718647
Iter #: 14 loss 12.9333205818
Iter #: 15 loss 12.8502085526
Iter #: 16 loss 12.7682466503
Iter #: 17 loss 12.687422071
Iter #: 18 loss 12.6089647989
Iter #: 19 loss 12.5346022735
Iter #: 20 loss 12.4655893821
Iter #: 21 loss 12.402235817
Iter #: 22 loss 12.3441287405
Iter #: 23 loss 12.2905916542
Iter #: 24 loss 12.2409690029
Iter #: 25 loss 12.1947096576
Iter #: 26 loss 12.1513643419
Iter #: 27 loss 12.1105655208
Iter #: 28 loss 12.0720092096
Iter #: 29 loss 12.0354412093
Iter #: 30 loss 12.000646967
Iter #: 31 loss 11.9674440342
Iter #: 32 loss 11.9356763366
Iter #: 33 loss 11.9052097134
Iter #: 34 loss 11.87592

In [15]:

# Reset the model's factors to the SVD initialisation before this run, so it
# does not continue from whatever an earlier training cell left in C and W.
model_enwik.C = C_svd
model_enwik.W = W_svd
start_time = time.time()
# Runs the model's bfgd optimiser from the same SVD starting point.
# NOTE(review): the meaning of eta, from_iter and the leading True in `init`
# is defined by Word2vecMF.bfgd, not visible here — confirm.
model_enwik.bfgd(d=200,from_iter=0, MAX_ITER=100, eta=8e-6, init=(True, C_svd, W_svd), display=True)
# Elapsed wall-clock time for the whole bfgd call, in seconds.
print("--- %s seconds ---" % (time.time() - start_time))

Itecr #: 1 loss 9.16735031547
Itecr #: 2 loss 9.70276310932
Itecr #: 3 loss 9.37616560721
Itecr #: 4 loss 9.86247403949
Itecr #: 5 loss 9.38488840726
Itecr #: 6 loss 9.8641219436
Itecr #: 7 loss 9.38807074629
Itecr #: 8 loss 9.81562115335
Itecr #: 9 loss 9.43808499171
Itecr #: 10 loss 9.9482745964
Itecr #: 11 loss 9.51271548951
Itecr #: 12 loss 9.97095877957
Itecr #: 13 loss 9.33314150213
Itecr #: 14 loss 9.70517339076
Itecr #: 15 loss 9.1011111511
Itecr #: 16 loss 9.42437144481
Itecr #: 17 loss 9.53643096015
Itecr #: 18 loss 10.3767739669
Itecr #: 19 loss 9.46319300524
Itecr #: 20 loss 9.55336214024
Itecr #: 21 loss 9.11218860316
Itecr #: 22 loss 9.57168072733
Itecr #: 23 loss 9.39077153572
Itecr #: 24 loss 10.0093064257
Itecr #: 25 loss 9.28570934782
Itecr #: 26 loss 9.58303543613
Itecr #: 27 loss 9.4999582266
Itecr #: 28 loss 10.2637774015
Itecr #: 29 loss 9.37109423071
Itecr #: 30 loss 9.47533672502
Itecr #: 31 loss 9.14071614627
Itecr #: 32 loss 9.75932922934
Itecr #: 33 loss 9.31