diff --git a/.gitignore b/.gitignore index 4838572..5b5ff13 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ inst/extdata/ hum2vec.Rproj src/*.o src/*.so +cookbooks cookbooks.txt cookbooks.vectors cookbooks.zip +cookbooks* +etc diff --git a/DESCRIPTION b/DESCRIPTION index d7a09f2..4cbf41b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,18 +1,23 @@ Package: wordVectors Type: Package Title: Tools for creating and analyzing vector-space models of texts -Version: 1.0 +Version: 1.1 Date: 2015-09-10 Author: Ben Schmidt, Jian Li Maintainer: Ben Schmidt -Description: wordVectors wraps the word2vec code for creating vector-space models of texts, and defines a new class "VectorSpaceModel" extending matrix with a number of functions that make it easier to perform useful operations in a word-vector space. +Description: wordVectors wraps Google's word2vec code for creating vector-space + models of texts, and defines a new class "VectorSpaceModel" (extending the native matrix class) + with a number of functions that make it easier to perform useful operations in a + word-vector space. License: Apache License (== 2.0) -Depends: R (>= 2.14.0) +Depends: + R (>= 2.14.0) LazyData: TRUE Imports: - graphics, - methods, - utils + graphics, + methods, + utils Suggests: - stringi, - tsne + stringi, + tsne +RoxygenNote: 5.0.1 diff --git a/NAMESPACE b/NAMESPACE index fc08bb7..bf53fd9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.1.1): do not edit by hand +# Generated by roxygen2: do not edit by hand export(as.VectorSpaceModel) export(cosineDist) diff --git a/R/matrixFunctions.R b/R/matrixFunctions.R index a49cf7a..d88ef5a 100644 --- a/R/matrixFunctions.R +++ b/R/matrixFunctions.R @@ -18,14 +18,13 @@ setMethod("initialize", "VectorSpaceModel", callNextMethod(.Object, .cache=.cache, ...) 
}) -#' Cached Square Magnitudes +#' Square Magnitudes with caching #' -#' @param VectorSpaceModel +#' @param object A matrix or VectorSpaceModel object #' @description square_magnitudes Returns the square magnitudes and #' caches them if necessary #' @return A vector of the square magnitudes for each row #' @keywords internal -#' @examples square_magnitudes = function(object) { if (class(object)=="VectorSpaceModel") { if (.hasSlot(object, ".cache")) { @@ -47,10 +46,11 @@ square_magnitudes = function(object) { #' VectorSpaceModel indexing #' -# @description Reduce a VectorSpaceModel to a smaller one +#' @description Reduce a VectorSpaceModel to a smaller one #' @param x The vectorspace model to subset #' @param i The row numbers to extract #' @param j The column numbers to extract +#' @param ... Other arguments to extract (unlikely to be useful). #' @param drop Whether to drop columns. This parameter is ignored. #' @return A VectorSpaceModel #' @@ -451,7 +451,7 @@ cosineDist <- function(x,y) { #' of the same length as the VectorSpaceModel. #' #' @return A new matrix or VectorSpaceModel of the same dimensions as `matrix`, -#' each row of which is parallel to vector +#' each row of which is parallel to vector. #' #' If the input is a matrix, the output will be a matrix: if a VectorSpaceModel, #' it will be a VectorSpaceModel. 
diff --git a/data/demo_vectors.rda b/data/demo_vectors.rda index 44fa4fb..e489a0e 100644 Binary files a/data/demo_vectors.rda and b/data/demo_vectors.rda differ diff --git a/man/VectorSpaceModel-VectorSpaceModel-method.Rd b/man/VectorSpaceModel-VectorSpaceModel-method.Rd index 260e663..053896b 100644 --- a/man/VectorSpaceModel-VectorSpaceModel-method.Rd +++ b/man/VectorSpaceModel-VectorSpaceModel-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \docType{methods} \name{-,VectorSpaceModel,VectorSpaceModel-method} diff --git a/man/VectorSpaceModel-class.Rd b/man/VectorSpaceModel-class.Rd index 736424f..07c6d71 100644 --- a/man/VectorSpaceModel-class.Rd +++ b/man/VectorSpaceModel-class.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \docType{class} \name{VectorSpaceModel-class} @@ -13,4 +13,10 @@ The base object is simply a matrix with columns describing dimensions and unique as the names of vectors. This package gives a number of convenience functions for printing and, most importantly, accessing these objects. } +\section{Slots}{ + +\describe{ +\item{\code{magnitudes}}{The cached sum-of-squares for each row in the matrix. 
Can be cached to +speed up similarity calculations} +}} diff --git a/man/as.VectorSpaceModel.Rd b/man/as.VectorSpaceModel.Rd index 670d213..daa73ae 100644 --- a/man/as.VectorSpaceModel.Rd +++ b/man/as.VectorSpaceModel.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{as.VectorSpaceModel} \alias{as.VectorSpaceModel} diff --git a/man/cosineDist.Rd b/man/cosineDist.Rd index 9f62546..f857027 100644 --- a/man/cosineDist.Rd +++ b/man/cosineDist.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{cosineDist} \alias{cosineDist} diff --git a/man/cosineSimilarity.Rd b/man/cosineSimilarity.Rd index 5d8d2fc..512ed59 100644 --- a/man/cosineSimilarity.Rd +++ b/man/cosineSimilarity.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{cosineSimilarity} \alias{cosineSimilarity} @@ -22,6 +22,12 @@ Calculate the cosine similarity of two matrices or a matrix and a vector. 
} \examples{ subjects = demo_vectors[[c("history","literature","biology","math","stats"),average=FALSE]] -cosineSimilarity(subjects,subjects) +similarities = cosineSimilarity(subjects,subjects) + +subjects = demo_vectors[[c("history","literature","biology","math","stats"),average=TRUE]] +new_subject_list = nearest_to(demo_vectors,subjects,20) +new_subjects = demo_vectors[[names(new_subject_list),average=FALSE]] +plot(hclust(as.dist(cosineDist(new_subjects,new_subjects)))) + } diff --git a/man/demo_vectors.Rd b/man/demo_vectors.Rd index 22c9ca7..99efc68 100644 --- a/man/demo_vectors.Rd +++ b/man/demo_vectors.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{demo_vectors} diff --git a/man/filter_to_rownames.Rd b/man/filter_to_rownames.Rd index a1e6913..3b47cb1 100644 --- a/man/filter_to_rownames.Rd +++ b/man/filter_to_rownames.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{filter_to_rownames} \alias{filter_to_rownames} diff --git a/man/magnitudes.Rd b/man/magnitudes.Rd index 9d5f075..c8dafbc 100644 --- a/man/magnitudes.Rd +++ b/man/magnitudes.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{magnitudes} \alias{magnitudes} diff --git a/man/nearest_to.Rd b/man/nearest_to.Rd index 2772ea0..8eede2e 100644 --- a/man/nearest_to.Rd +++ b/man/nearest_to.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{nearest_to} \alias{nearest_to} @@ -22,6 +22,7 @@ in the parent VectorSpaceModel, of length n. Return the n closest words in a VectorSpaceModel to a given vector. 
} \examples{ + #Synonyms and similar words nearest_to(demo_vectors,demo_vectors[["good"]]) @@ -29,5 +30,6 @@ nearest_to(demo_vectors,demo_vectors[["good"]]) # What's the equivalent word for a female teacher that "guy" is for # a male one? nearest_to(demo_vectors,demo_vectors[["guy"]] - demo_vectors[["man"]] + demo_vectors[["woman"]]) + } diff --git a/man/normalize_lengths.Rd b/man/normalize_lengths.Rd index cd6473f..3a3482c 100644 --- a/man/normalize_lengths.Rd +++ b/man/normalize_lengths.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{normalize_lengths} \alias{normalize_lengths} diff --git a/man/plot-VectorSpaceModel-method.Rd b/man/plot-VectorSpaceModel-method.Rd index 32076e5..7016682 100644 --- a/man/plot-VectorSpaceModel-method.Rd +++ b/man/plot-VectorSpaceModel-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \docType{methods} \name{plot,VectorSpaceModel-method} diff --git a/man/prep_word2vec.Rd b/man/prep_word2vec.Rd index 8aad1f9..d9216c2 100644 --- a/man/prep_word2vec.Rd +++ b/man/prep_word2vec.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/word2vec.R \name{prep_word2vec} \alias{prep_word2vec} diff --git a/man/project.Rd b/man/project.Rd index 8b24d71..6e002bc 100644 --- a/man/project.Rd +++ b/man/project.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{project} \alias{project} @@ -14,7 +14,7 @@ of the same length as the VectorSpaceModel.} } \value{ A new matrix or VectorSpaceModel of the same dimensions as `matrix`, -each row of which is parallel to vector +each row of which 
is parallel to vector. If the input is a matrix, the output will be a matrix: if a VectorSpaceModel, it will be a VectorSpaceModel. diff --git a/man/read.binary.vectors.Rd b/man/read.binary.vectors.Rd index 3915769..85e4e8d 100644 --- a/man/read.binary.vectors.Rd +++ b/man/read.binary.vectors.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{read.binary.vectors} \alias{read.binary.vectors} \title{Read binary word2vec format files} \usage{ -read.binary.vectors(filename, nrows = Inf) +read.binary.vectors(filename, nrows = Inf, cols = "All") } \arguments{ \item{filename}{A file in the binary word2vec format to import.} @@ -13,9 +13,13 @@ read.binary.vectors(filename, nrows = Inf) Word2vec sorts by frequency, so limiting to the first 1000 rows will give the thousand most-common words; it can be useful not to load the whole matrix into memory} + +\item{cols}{The column numbers to read. 
Default is "All"; +if you are in a memory-limited environment, +you can limit the number of columns you read in by giving a vector of column integers} } \value{ -A word2vec object +A VectorSpaceModel object } \description{ Read binary word2vec format files diff --git a/man/read.vectors.Rd b/man/read.vectors.Rd index 0cdc869..5260cdd 100644 --- a/man/read.vectors.Rd +++ b/man/read.vectors.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{read.vectors} \alias{read.vectors} diff --git a/man/reject.Rd b/man/reject.Rd index b848635..bff9438 100644 --- a/man/reject.Rd +++ b/man/reject.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{reject} \alias{reject} @@ -28,5 +28,6 @@ Return a vector rejection for each element in a VectorSpaceModel nearest_to(demo_vectors,demo_vectors[["man"]]) genderless = reject(demo_vectors,demo_vectors[["he"]] - demo_vectors[["she"]]) nearest_to(genderless,genderless[["man"]]) + } diff --git a/man/square_magnitudes.Rd b/man/square_magnitudes.Rd new file mode 100644 index 0000000..36e08ed --- /dev/null +++ b/man/square_magnitudes.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrixFunctions.R +\name{square_magnitudes} +\alias{square_magnitudes} +\title{Square Magnitudes with caching} +\usage{ +square_magnitudes(object) +} +\arguments{ +\item{object}{A matrix or VectorSpaceModel object} +} +\value{ +A vector of the square magnitudes for each row +} +\description{ +square_magnitudes Returns the square magnitudes and +caches them if necessary +} +\keyword{internal} + diff --git a/man/sub-VectorSpaceModel-method.Rd b/man/sub-VectorSpaceModel-method.Rd index a2dafd1..257589d 100644 --- a/man/sub-VectorSpaceModel-method.Rd +++ 
b/man/sub-VectorSpaceModel-method.Rd @@ -1,11 +1,11 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \docType{methods} \name{[,VectorSpaceModel-method} \alias{[,VectorSpaceModel-method} \title{VectorSpaceModel indexing} \usage{ -\S4method{[}{VectorSpaceModel}(x, i, j) +\S4method{[}{VectorSpaceModel}(x, i, j, ..., drop = TRUE) } \arguments{ \item{x}{The vectorspace model to subset} @@ -13,11 +13,15 @@ \item{i}{The row numbers to extract} \item{j}{The column numbers to extract} + +\item{drop}{Whether to drop columns. This parameter is ignored.} + +\item{...}{Other arguments to extract (unlikely to be useful).} } \value{ A VectorSpaceModel } \description{ -VectorSpaceModel indexing +Reduce a VectorSpaceModel to a smaller one } diff --git a/man/sub-sub-VectorSpaceModel-method.Rd b/man/sub-sub-VectorSpaceModel-method.Rd index 3a4a7b9..79cbce7 100644 --- a/man/sub-sub-VectorSpaceModel-method.Rd +++ b/man/sub-sub-VectorSpaceModel-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \docType{methods} \name{[[,VectorSpaceModel-method} diff --git a/man/train_word2vec.Rd b/man/train_word2vec.Rd index 074a413..ecc111a 100644 --- a/man/train_word2vec.Rd +++ b/man/train_word2vec.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/word2vec.R \name{train_word2vec} \alias{train_word2vec} diff --git a/man/write.binary.word2vec.Rd b/man/write.binary.word2vec.Rd index 75e0e7b..a942065 100644 --- a/man/write.binary.word2vec.Rd +++ b/man/write.binary.word2vec.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrixFunctions.R \name{write.binary.word2vec} 
\alias{write.binary.word2vec}