In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Math libraries
import numpy as np
from scipy.sparse.linalg import svds
from numpy.linalg import norm, svd

# Library for iterations
import itertools

# Basic library for math functions
import math

# Library for regular expressions
import re

# Libraries designed for this task - see /util
from util.word2vec_as_MF import Word2vecMF
from util.functions import *

# Library for timing
import time

Please run:
  python setup.py build_ext --inplace


In [2]:
# Hyper-parameters shared by the cells below
k = 10        # Negative sampling parameter
rho = 0.0     # Weight of the regularizer: rho * ||W W' - C C'||_F^2
d = 300       # Inner (low-rank) dimension of the factor matrices

In [3]:
# Instantiate the model wrapper that will hold the data matrices
model_enwik = Word2vecMF()

# Read the corpus at 'data/x1' into the model's matrices (D and B are used below).
# NOTE(review): the two *_to_file flags presumably disable writing the matrices
# and the vocabulary to disk -- confirm against Word2vecMF.data_to_matrices.
model_enwik.data_to_matrices(
    'data/x1',
    r=50,
    k=k,
    DB_to_file=False,
    vocab_to_file=False,
)

# Sanity checks: report when D and B contain no NaN entries ...
if not np.isnan(model_enwik.D).any():
    print('No NaNs in D')
if not np.isnan(model_enwik.B).any():
    print('No NaNs in B')

# ... and no infinite entries
if not np.isinf(model_enwik.D).any():
    print('No Infs in D.')
if not np.isinf(model_enwik.B).any():
    print('No Infs in B.')

# Positive part of log(D) - log(B), computed without (SPPMI2) and with (SPPMI)
# np.nan_to_num, to check whether the sanitising step changes anything.
SPPMI2 = np.maximum(np.log(model_enwik.D) - np.log(model_enwik.B), 0)
SPPMI = np.maximum(np.nan_to_num(np.log(model_enwik.D) - np.log(model_enwik.B)), 0)
if np.array_equal(SPPMI, SPPMI2):
    print('np.nan_to_num() is not used in SPPMI.')

Parsing sentences from training set
No NaNs in D
No NaNs in B
No Infs in D.
No Infs in B.
np.nan_to_num() is not used in SPPMI.
30772.52560507925




In [4]:
# Generate specific initializations

# SPPMI-based initialization. Its third return value gets its own name so it
# is not silently clobbered by the BFGD one below (the helper's log line
# reports it as None for this initializer).
C0_SPPMI, W0_SPPMI, step_size_SPPMI = SPPMI_init(model_enwik, dimension = d, negative = k)

print('Dimension of C0: ', C0_SPPMI.shape)
print('Dimension of W0: ', W0_SPPMI.shape)

# BFGD initialization; calculate_step = True also returns the step size that
# the helper logs as 'theoretical step size'.
C0_BFGD, W0_BFGD, step_size_BFGD = BFGD_init(model_enwik, dimension = d, reg = rho, calculate_step = True)

# The experiment cells below read `step_size`; bind it explicitly to the BFGD
# value (the same value it ended up with before, now without the hidden overwrite).
step_size = step_size_BFGD

  


0.027206866973066266
115.71293390302307
('Initial loss', 4473009.696893112, 'theoretical step size', None)
Dimension of C0:  (300, 823)
Dimension of W0:  (300, 823)
hello there
('Initial loss', 4666204.253151528, 'theoretical step size', 4.849237969563645e-07)


In [5]:
# Run BFGD - Our initialization, user-defined step size 1e-5, rho is zero
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time
# cannot be skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()

opt_experiment(
    model_enwik,                      # Model holding the data matrices
    mode='BFGD',                      # Optimization mode
    d=d,                              # Inner dimension
    eta=1e-5,                         # Step size (user-defined)
    reg=rho,                          # Regularization parameter
    MAX_ITER=1000,                    # Max. number of iterations
    from_iter=0,                      # Start from iteration 0 (i.e., don't load any unfinished jobs)
    start_from='test',                # Shows up as 'fromtest' in the printed run path -- TODO confirm exact meaning
    itv_print=100,                    # Loss is printed every 100 iterations (see the log below)
    itv_save=100000,                  # Presumably the save interval; larger than MAX_ITER -- TODO confirm
    init=(True, C0_BFGD, W0_BFGD),    # Initialization (BFGD); first flag presumably enables the custom init
    display=True,                     # Display results
    autostop=False,
)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

enwik-200/BFGDiter_fromtest_dim300_step1e-05_0.0
('Iter #:', 0, 'loss', 4666204.253151528)
('Iter #:', 100, 'loss', 2047860.6739421804)
('Iter #:', 200, 'loss', 1856487.4365483103)
('Iter #:', 300, 'loss', 1799501.1949033055)
('Iter #:', 400, 'loss', 1772826.5493207132)
('Iter #:', 500, 'loss', 1757552.191833937)
('Iter #:', 600, 'loss', 1747513.2203865629)
('Iter #:', 700, 'loss', 1739978.6701471142)
('Iter #:', 800, 'loss', 1733916.1004861847)
('Iter #:', 900, 'loss', 1730848.7153326385)
('Iter #:', 1000, 'loss', 1725910.3420607678)
--- 31.544533014297485 seconds ---


In [6]:
# Run BFGD - Our initialization, our step size, rho is zero
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time
# cannot be skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()

opt_experiment(
    model_enwik,                      # Model holding the data matrices
    mode='BFGD',                      # Optimization mode
    d=d,                              # Inner dimension
    eta=step_size,                    # Step size as left by the initialization cell (BFGD_init's return value)
    reg=rho,                          # Regularization parameter
    MAX_ITER=1000,                    # Max. number of iterations
    from_iter=0,                      # Start from iteration 0 (i.e., don't load any unfinished jobs)
    start_from='test',                # Shows up as 'fromtest' in the printed run path -- TODO confirm exact meaning
    itv_print=100,                    # Loss is printed every 100 iterations (see the log below)
    itv_save=100000,                  # Presumably the save interval; larger than MAX_ITER -- TODO confirm
    init=(True, C0_BFGD, W0_BFGD),    # Initialization (BFGD); first flag presumably enables the custom init
    display=True,                     # Display results
    autostop=False,
)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

enwik-200/BFGDiter_fromtest_dim300_step4.923674238056453e-09_0.0
('Iter #:', 0, 'loss', 3959916.1376499464)
('Iter #:', 100, 'loss', 3952781.1796068856)
('Iter #:', 200, 'loss', 3945761.245044749)
('Iter #:', 300, 'loss', 3938853.3110885546)
('Iter #:', 400, 'loss', 3932054.456434045)
('Iter #:', 500, 'loss', 3925361.8551674276)
('Iter #:', 600, 'loss', 3918772.771270943)
('Iter #:', 700, 'loss', 3912284.5537536344)
('Iter #:', 800, 'loss', 3905894.632345954)
('Iter #:', 900, 'loss', 3899600.5136977197)
('Iter #:', 1000, 'loss', 3893399.7780208085)
--- 32.98883676528931 seconds ---


In [7]:
# Run BFGD - SPPMI initialization, user-defined step size 1e-5, rho is zero
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time
# cannot be skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()

# NOTE(review): the printed run path depends only on mode/start_from/d/eta/reg,
# so it is identical for any run sharing those values regardless of `init` --
# confirm saved results cannot collide.
opt_experiment(
    model_enwik,                      # Model holding the data matrices
    mode='BFGD',                      # Optimization mode
    d=d,                              # Inner dimension
    eta=1e-5,                         # Step size (user-defined)
    reg=rho,                          # Regularization parameter
    MAX_ITER=1000,                    # Max. number of iterations
    from_iter=0,                      # Start from iteration 0 (i.e., don't load any unfinished jobs)
    start_from='test',                # Shows up as 'fromtest' in the printed run path -- TODO confirm exact meaning
    itv_print=100,                    # Loss is printed every 100 iterations (see the log below)
    itv_save=100000,                  # Presumably the save interval; larger than MAX_ITER -- TODO confirm
    init=(True, C0_SPPMI, W0_SPPMI),  # Initialization (SPPMI); first flag presumably enables the custom init
    display=True,                     # Display results
    autostop=False,
)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

enwik-200/BFGDiter_fromtest_dim300_step1e-05_0.0
('Iter #:', 0, 'loss', 4473009.696893112)
('Iter #:', 100, 'loss', 2113858.3720422033)
('Iter #:', 200, 'loss', 1910045.4327186688)
('Iter #:', 300, 'loss', 1833808.0433454078)
('Iter #:', 400, 'loss', 1792505.875537711)
('Iter #:', 500, 'loss', 1767575.777827814)
('Iter #:', 600, 'loss', 1751147.3684291001)
('Iter #:', 700, 'loss', 1739154.5176801584)
('Iter #:', 800, 'loss', 1730593.6998784845)
('Iter #:', 900, 'loss', 1722726.968755842)
('Iter #:', 1000, 'loss', 1716183.3658050634)
--- 33.359684228897095 seconds ---


In [35]:
# Run BFGD - SPPMI initialization, our step size, rho is zero
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time
# cannot be skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()

opt_experiment(
    model_enwik,                      # Model holding the data matrices
    mode='BFGD',                      # Optimization mode
    d=d,                              # Inner dimension
    eta=step_size,                    # Step size as left by the initialization cell (BFGD_init's return value)
    reg=rho,                          # Regularization parameter
    MAX_ITER=1000,                    # Max. number of iterations
    from_iter=0,                      # Start from iteration 0 (i.e., don't load any unfinished jobs)
    start_from='test',                # Shows up as 'fromtest' in the printed run path -- TODO confirm exact meaning
    itv_print=100,                    # Loss is printed every 100 iterations (see the log below)
    itv_save=100000,                  # Presumably the save interval; larger than MAX_ITER -- TODO confirm
    init=(True, C0_SPPMI, W0_SPPMI),  # Initialization (SPPMI); first flag presumably enables the custom init
    display=True,                     # Display results
    autostop=False,
)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

enwik-200/BFGDiter_fromtest_dim300_step4.6260178076223814e-07_0.0
('Iter #:', 0, 'loss', 4473009.696893112)
('Iter #:', 100, 'loss', 4440258.094397495)
('Iter #:', 200, 'loss', 3988010.134604834)
('Iter #:', 300, 'loss', 3549726.4725704193)
('Iter #:', 400, 'loss', 3274235.945186249)
('Iter #:', 500, 'loss', 3072919.244226388)
('Iter #:', 600, 'loss', 2917173.0964144054)
('Iter #:', 700, 'loss', 2791364.723934947)
('Iter #:', 800, 'loss', 2686535.1442358517)
('Iter #:', 900, 'loss', 2596590.560600534)
('Iter #:', 1000, 'loss', 2517627.1307845498)
--- 36.38679075241089 seconds ---
