From 93e7617da25d95753e9f93a0b37ba1abad3a98d6 Mon Sep 17 00:00:00 2001 From: "Alexandre T. S" Date: Tue, 26 Jul 2016 17:32:42 -0300 Subject: [PATCH] first release --- README.md | 57 +- demo.sh | 27 + demo_external_memory.sh | 28 + external_memory_lexvec.sh | 46 ++ lexvec.go | 1122 +++++++++++++++++++++++++++++++++++++ line_merge.py | 49 ++ merge_context_vectors.py | 68 +++ pairs_to_counts.sh | 46 ++ shuffle.py | 73 +++ word2vec | 66 +++ 10 files changed, 1581 insertions(+), 1 deletion(-) create mode 100755 demo.sh create mode 100755 demo_external_memory.sh create mode 100755 external_memory_lexvec.sh create mode 100644 lexvec.go create mode 100644 line_merge.py create mode 100644 merge_context_vectors.py create mode 100755 pairs_to_counts.sh create mode 100644 shuffle.py create mode 100755 word2vec diff --git a/README.md b/README.md index 466e0f3..9a44cbd 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,58 @@ # LexVec -Source code for the model will be published shortly before ACL 2016 which takes place between August 7-12. +This is an implementation of the **LexVec word embedding model** (similar to word2vec and GloVe) that achieves state of the art results in multiple NLP tasks, as described in [this paper](https://arxiv.org/pdf/1606.00819v2) and [this one](https://arxiv.org/pdf/1606.01283v1). + +## Installation + +### Binary + +The easiest way to get started with LexVec is to download the binary release. We only distribute amd64 binaries for Linux. + +**[Download binary](https://github.com/alexandres/lexvec/releases)** + +If you are using Windows, OS X, 32-bit Linux, or any other OS, follow the instructions below to build from source. + +### Building from source + +1. [Install the Go compiler](https://golang.org/doc/install) +2. Make sure your `$GOPATH` is set +3. 
Execute the following commands in your terminal: + + ```bash + go get github.com/alexandres/lexvec + cd $GOPATH/src/github.com/alexandres/lexvec + go build + ``` + +## Usage + +### In-memory (default, faster) + +To get started, run `$ ./demo.sh` which trains a model using the small [text8](http://mattmahoney.net/dc/text8.zip) corpus (100MB from Wikipedia). + +Basic usage of LexVec is: + +`$ ./lexvec -corpus somecorpus -output someoutputdirectory/vectors` + +Run `$ ./lexvec -h` for a full list of options. + +Additionally, we provide a `word2vec` script which implements the exact same interface as the [word2vec](https://code.google.com/archive/p/word2vec/) package should you want to test LexVec using existing scripts. + +### External Memory + +By default, LexVec stores the sparse matrix being factorized in-memory. This can be a problem if your training corpus is large and your system memory limited. We suggest you first try using the in-memory implementation. If you run into Out-Of-Memory issues, try this External Memory approximation. + + +`env OUTPUTDIR=output ./external_memory_lexvec.sh -corpus somecorpus -dim 300 ...exactsameoptionsasinmemory` + +Pre-processing can be accelerated by installing [nsort](http://www.ordinal.com/try.cgi/nsort-i386-3.4.54.rpm) and [pypy](http://pypy.org/) and editing `pairs_to_counts.sh`. + +## References + +Salle, A., Idiart, M., & Villavicencio, A. (2016). [Matrix Factorization using Window Sampling and Negative Sampling for Improved Word Representations](https://arxiv.org/pdf/1606.00819v2). arXiv preprint arXiv:1606.00819. + +Salle, A., Idiart, M., & Villavicencio, A. (2016). [Enhancing the LexVec Distributed Word Representation Model Using Positional Contexts and External Memory](https://arxiv.org/pdf/1606.01283v1). arXiv preprint arXiv:1606.01283. + +## License + +Copyright (c) 2016 Salle, Alexandre . All work in this package is distributed under the MIT License.
diff --git a/demo.sh b/demo.sh new file mode 100755 index 0000000..1ee781d --- /dev/null +++ b/demo.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -e + +# build lexvec binary +if [ ! -e lexvec ]; then + go build +fi + +if [ ! -e text8 ]; then + echo Downloading text8 corpus + if hash wget 2>/dev/null; then + wget http://mattmahoney.net/dc/text8.zip + else + curl -O http://mattmahoney.net/dc/text8.zip + fi + unzip text8.zip + rm text8.zip +fi + +OUTPUTDIR=output + +mkdir -p $OUTPUTDIR +# These settings are for small corpora such as text8. For larger corpora, stick to the default settings. +./lexvec -corpus text8 -output $OUTPUTDIR/vectors -dim 200 -iterations 15 -subsample 1e-4 -window 2 -model 2 -negative 25 -minfreq 5 -threads 12 -pos=false + +echo Trained vectors saved to file $OUTPUTDIR/vectors diff --git a/demo_external_memory.sh b/demo_external_memory.sh new file mode 100755 index 0000000..6f5df57 --- /dev/null +++ b/demo_external_memory.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +set -e + +# build lexvec binary +if [ ! -e lexvec ]; then + go build +fi + +if [ ! -e text8 ]; then + echo Downloading text8 corpus + if hash wget 2>/dev/null; then + wget http://mattmahoney.net/dc/text8.zip + else + curl -O http://mattmahoney.net/dc/text8.zip + fi + unzip text8.zip + rm text8.zip +fi + +export OUTPUTDIR=output +export MI=false +export MEMORY=1 + +# These settings are for small corpora such as text8. For larger corpora, stick to the default settings. 
+./external_memory_lexvec.sh -corpus text8 -dim 200 -iterations 15 -subsample 1e-4 -window 2 -model 2 -negative 25 -minfreq 5 -threads 12 -pos=false + +echo Trained vectors saved to file $OUTPUTDIR/vectors diff --git a/external_memory_lexvec.sh b/external_memory_lexvec.sh new file mode 100755 index 0000000..aa0f9ef --- /dev/null +++ b/external_memory_lexvec.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Copyright (c) 2016 Salle, Alexandre +# Author: Salle, Alexandre +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +set -e + +if [ -z "$OUTPUTDIR" ]; then echo "Need to set OUTPUTDIR" >&2; exit 1; fi # every step below needs OUTPUTDIR, so abort instead of continuing + +mkdir -p $OUTPUTDIR + +export TMPDIR=$OUTPUTDIR + +export MI=${MI:-false} +COOC=$OUTPUTDIR/coocs +COOC_TOTALS=$COOC.totals +VOCAB=$OUTPUTDIR/vocab + +CMD="./lexvec $@ -mi=$MI -cooctotalspath $COOC_TOTALS -externalmemory" + +echo identifying w,c pairs +eval $CMD -printcooc -coocpath $COOC -savevocab $VOCAB + +echo aggregating pairs +./pairs_to_counts.sh < $COOC > $COOC.ready +rm $COOC + +echo training model +eval $CMD -coocpath $COOC.ready -output $OUTPUTDIR/vectors -readvocab $VOCAB diff --git a/lexvec.go b/lexvec.go new file mode 100644 index 0000000..f0b2d68 --- /dev/null +++ b/lexvec.go @@ -0,0 +1,1122 @@ +/* + * Copyright (c) 2016 Salle, Alexandre + * Author: Salle, Alexandre + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */ + +package main + +import "flag" +import "io" +import "os" +import "bufio" +import "strings" +import "strconv" +import "sync" +import "time" + +import "math" +import "math/rand" +import "fmt" +import "sort" +import "unicode/utf8" + +const ( + ERROR = 0 + INFO = 1 + DEBUG = 2 + + PPMI_MATRIX = "ppmi" + PMI_MATRIX = "pmi" + COOC_MATRIX = "cooc" + LOG_COOC_MATRIX = "logcooc" + + CTXBREAK = "" + MAX_SENTENCE_LENGTH = 1000 + CONTEXT_SUFFIX = ".context" +) + +var verbose int +var dim, corpusSize uint64 +var mVec, mCtx, bVec, bCtx, mVecGrad, mCtxGrad, bVecGrad, bCtxGrad []float64 +var contextDistributionSmoothing, cdsTotal, postSubsample float64 +var useBias, adagrad, externalMemory, positionalContexts, periodIsWhitespace bool +var randng *rand.Rand +var matrix string +var window int +var ctxbreakw *Word + +var ctxbreakbytes []byte + +func check(e error) { + if e != nil { + panic(e) + } +} + +func logit(msg string, lineBreak bool, level int) { // writes msg to stderr when verbose >= level; lineBreak wraps it in newlines + if verbose < level { + return + } + if lineBreak { + fmt.Fprintf(os.Stderr, "\n") + } + fmt.Fprint(os.Stderr, msg) // Fprint, not Fprintf: msg is already formatted and must not be parsed as a format string + if lineBreak { + fmt.Fprintf(os.Stderr, "\n") + } + os.Stderr.Sync() +} + +type Word struct { + w string + i uint64 + freq uint64 + cooc map[uint64]float64 + totalCooc float64 +} + +func (w *Word) Ppmi(c *Word) float64 { + cooc, ok := w.cooc[c.i] + if !ok { + return 0.0 + } + return w.PpmiDirect(c, cooc) +} + +func (w *Word) PpmiDirect(c *Word, cooc float64) float64 { + ppmi := math.Log(cooc) - math.Log(w.totalCooc) - math.Log(math.Pow(c.totalCooc, contextDistributionSmoothing)) + math.Log(cdsTotal) + if ppmi < 0 { + return 0 + } + return ppmi +} + +func (w *Word) Pmi(c *Word) float64 { + cooc, ok := w.cooc[c.i] + if !ok { + return 0.0 + } + return w.PmiDirect(c, cooc) +} + +func (w *Word) PmiDirect(c *Word, cooc float64) float64 { + if cooc < 1 { + cooc = 1 + } + pmi := math.Log(cooc) - math.Log(w.totalCooc) - math.Log(math.Pow(c.totalCooc, contextDistributionSmoothing)) + math.Log(cdsTotal) + return pmi +} +
+func (w *Word) LogCooc(c *Word) float64 { + cooc, ok := w.cooc[c.i] + if !ok { + return 0.0 + } + return w.LogCoocDirect(c, cooc) +} + +func (w *Word) LogCoocDirect(c *Word, cooc float64) float64 { + if cooc < 1 { + cooc = 1 + } + return math.Log(cooc) +} + +func (w *Word) posW(pos int) string { + return fmt.Sprintf("%s_%d", w.w, pos) +} + +type ByFreq []*Word + +func (a ByFreq) Len() int { return len(a) } +func (a ByFreq) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a ByFreq) Less(i, j int) bool { return a[i].freq > a[j].freq } // descending by freq; strict ">" so equal frequencies never report Less in both directions + +func ShuffleVocab(a []*Word) { + n := len(a) + for i := n - 1; i > 0; i-- { + j := randng.Intn(i + 1) + a[i], a[j] = a[j], a[i] + } +} + +func createScanner(reader io.Reader) *bufio.Scanner { + var s = bufio.NewScanner(bufio.NewReader(reader)) + s.Split(scanWords) + return s +} + +func scanWords(data []byte, atEOF bool) (advance int, token []byte, err error) { + // Skip leading spaces. + start := 0 + for width := 0; start < len(data); start += width { + var r rune + r, width = utf8.DecodeRune(data[start:]) + if r == '\n' || r == '.' { + return start + width, ctxbreakbytes, nil + } + if !isSpace(r) { + break + } + } + // Scan until space, marking end of word. + for width, i := 0, start; i < len(data); i += width { + var r rune + r, width = utf8.DecodeRune(data[i:]) + if isSpace(r) { + if r == '\n' || r == '.' { + width = 0 + } + return i + width, data[start:i], nil + } + } + // If we're at EOF, we have a final, non-empty, non-terminated word. Return it. + if atEOF && len(data) > start { + return len(data), data[start:], nil + } + // Request more data. + return start, nil, nil +} + +func isSpace(r rune) bool { + if r <= '\u00FF' { + // Obvious ASCII ones: \t through \r plus space. Plus two Latin-1 oddballs. + if periodIsWhitespace && r == '.' { + return true + } + switch r { + case ' ', '\t', '\n', '\v', '\f', '\r': + return true + case '\u0085', '\u00A0': + return true + } + return false + } + // High-valued ones.
+ if '\u2000' <= r && r <= '\u200a' { + return true + } + switch r { + case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000': + return true + } + return false +} + +type Sampler interface { + Sample(r *rand.Rand) *Word +} + +type UnigramDist struct { + vocab []*Word + table []int +} + +func (w *Word) SubsampleP(t float64, corpusSize uint64) float64 { + if t == 0 { + return 0 + } + subsampleP := 1 - math.Sqrt(t/(float64(w.freq)/float64(corpusSize))) + if subsampleP < 0 { + subsampleP = 0 + } + return subsampleP +} + +func (w *Word) KeepP(t float64, corpusSize uint64) float64 { + return 1 - w.SubsampleP(t, corpusSize) +} + +func NewUnigramDist(vocab []*Word, table_size int, power float64) *UnigramDist { + // ported from w2v implementation: fills table with vocab indices in proportion to freq^power + var train_words_pow float64 + table := make([]int, table_size) + vocab_size := len(vocab) + for i := 0; i < vocab_size; i++ { + w := vocab[i] + train_words_pow += math.Pow(float64(w.freq), power) + } + var i int + d1 := math.Pow(float64(vocab[i].freq), power) / train_words_pow + for a := 0; a < table_size; a++ { + table[a] = i + if float64(a)/float64(table_size) > d1 && i+1 < vocab_size { // never step past the last word: vocab[i] below would panic in Go + i++ + d1 += math.Pow(float64(vocab[i].freq), power) / train_words_pow + } + if i >= vocab_size { + i = vocab_size - 1 + } + } + return &UnigramDist{vocab, table} +} + +func (d *UnigramDist) Sample(r *rand.Rand) *Word { + i := r.Intn(len(d.table)) + w := d.vocab[d.table[i]] + return w +} + +func init() { + ctxbreakbytes = []byte(CTXBREAK) +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +func max(a, b int) int { + if a > b { + return a + } + return b +} + +func learn(mapw, mapc *Word, noiseSampler Sampler, r *rand.Rand, noiseSamples int, deltaVec []float64, alpha float64, directCooc float64) float64 { + for j := uint64(0); j < uint64(dim); j++ { + deltaVec[j] = 0 + } + err := float64(0) + deltaVecBias := float64(0) + w := mapw.i + for k := 0; k < noiseSamples+1; k++ { + if k > 0 { + mapc = noiseSampler.Sample(r) + if mapw ==
mapc { + continue + } + for mapc == ctxbreakw { + mapc = noiseSampler.Sample(r) + } + } + c := mapc.i + dot := float64(0) + for j := uint64(0); j < uint64(dim); j++ { + dot += mVec[w*dim+j] * mCtx[c*dim+j] + } + g := float64(0) + if externalMemory { + g = dot - directCooc + if useBias { + g += bVec[w] + bCtx[c] + } + } else { + cooc, ok := mapw.cooc[mapc.i] + if !ok { + cooc = 0 + } + g = dot - cooc + if useBias { + g += bVec[w] + bCtx[c] + } + } + err += 0.5 * g * g + g *= alpha + for j := uint64(0); j < uint64(dim); j++ { + mVecG := g * mCtx[c*dim+j] + deltaVec[j] += mVecG + mCtxG := g * mVec[w*dim+j] + mCtxGAdj := mCtxG + if adagrad { + mCtxGAdj /= math.Sqrt(mCtxGrad[c*dim+j]) + mCtxGrad[c*dim+j] += mCtxG * mCtxG + } + mCtx[c*dim+j] -= mCtxGAdj + if math.IsNaN(deltaVec[j]) || math.IsNaN(mCtx[c*dim+j]) { + panic("nan") + } + } + if useBias { + bVecG := g + deltaVecBias += bVecG + bCtxG := g + bCtxGAdj := bCtxG + if adagrad { + bCtxGAdj /= math.Sqrt(bCtxGrad[c]) + bCtxGrad[c] += bCtxG * bCtxG + } + bCtx[c] -= bCtxGAdj + } + if math.IsNaN(deltaVecBias) || math.IsNaN(bCtx[c]) { + panic("nan") + } + } + for j := uint64(0); j < uint64(dim); j++ { + mVecGAdj := deltaVec[j] + if adagrad { + mVecGAdj /= math.Sqrt(mVecGrad[w*dim+j]) + mVecGrad[w*dim+j] += deltaVec[j] * deltaVec[j] + } + mVec[w*dim+j] -= mVecGAdj + } + if useBias { + bVecGAdj := deltaVecBias + if adagrad { + bVecGAdj /= math.Sqrt(bVecGrad[w]) + bVecGrad[w] += deltaVecBias * deltaVecBias + } + bVec[w] -= bVecGAdj + } + return err +} + +type RingBuffer struct { + arr []RingBufferNode + c, n, start, end int +} + +type RingBufferNode struct { + v interface{} +} + +// call with array +func NewRingBuffer(c int) *RingBuffer { + return &RingBuffer{make([]RingBufferNode, c), c, 0, 0, 0} +} + +func (r *RingBuffer) Len() int { + return r.n +} + +func (r *RingBuffer) Clear() { + r.start, r.end, r.n = 0, 0, 0 +} + +// push to end +func (r *RingBuffer) Push(v interface{}) { + if r.n == r.c { + r.Pop() + } + 
r.arr[r.end].v = v + r.end = (r.end + 1) % r.c + r.n++ +} + +// removes first in +func (r *RingBuffer) Pop() interface{} { + if r.n == 0 { + return nil + } + ret := r.arr[r.start].v + r.start = (r.start + 1) % r.c + r.n-- + return ret +} + +// get +func (r *RingBuffer) Get(i int) interface{} { + if i >= r.n { + return nil + } + return r.arr[(r.start+i)%r.c].v +} + +func main() { + randng = rand.New(rand.NewSource(1)) + var corpusPath = flag.String("corpus", "", "path to corpus") + var vocabPath = flag.String("savevocab", "", "path where to output vocab") + var readVocabPath = flag.String("readvocab", "", "path where to read vocab") + var initialAlpha = flag.Float64("alpha", 0.025, "learning rate") + var subsample = flag.Float64("subsample", 1e-5, "subsampling threshold") + flag.Float64Var(&contextDistributionSmoothing, "cds", 0.75, "context distribution smoothing") + var dimRaw = flag.Int("dim", 300, "number of dimensions of word vectors") + var iterations = flag.Int("iterations", 5, "how many times to process corpus") + flag.IntVar(&window, "window", 2, "symmetric window of (window, word, window)") + var postWindow = flag.Int("postwindow", 0, "post symmetric window of (window, word, window); if 0 it is set -window") + var minFreq = flag.Int("minfreq", 100, "remove from vocab words that occur less that this number of times") + var decayAlpha = flag.Bool("decay", true, "decaying learning rate") + var noise = flag.Int("negative", 5, "number of negative samples") + var sgNoise = flag.Bool("minibatch", false, "negative sampling per w,c pair rather than per window") + var unigramPower = flag.Float64("unigrampow", 0.75, "raise unigram dist to this power") + var weightedWindow = flag.Bool("weightwindow", false, "use randomized window size from uniform(1, window)") + var postWeightedWindow = flag.Bool("postweightwindow", false, "use randomized postwindow size from uniform(1, window)") + var model = flag.Int("model", 1, "0 = output W, C; 1 = output W; 2 = output W + C") + 
var numThreads = flag.Int("threads", 12, "number of threads to use") + flag.BoolVar(&externalMemory, "externalmemory", false, "use external memory") + flag.BoolVar(&useBias, "bias", false, "use bias") + flag.BoolVar(&adagrad, "adagrad", false, "use adagrad") + flag.Float64Var(&postSubsample, "postsubsample", 0, "subsampling during SGD; if 0 it is set to -subsample") + flag.StringVar(&matrix, "matrix", PPMI_MATRIX, "which matrix to factor ("+PPMI_MATRIX+","+PMI_MATRIX+","+LOG_COOC_MATRIX+","+COOC_MATRIX+") default = "+PPMI_MATRIX) + flag.IntVar(&verbose, "verbose", DEBUG, "verboseness (0 = errors only, 1 = info, 2 = debug) default = 1") + var printCooc = flag.Bool("printcooc", false, "print coocs for external memory use") + var coocTotalsPath = flag.String("cooctotalspath", "", "path to cooc totals for each word when using external memory") + var coocPath = flag.String("coocpath", "", "path to coocs when using external memory") + var mi = flag.Bool("mi", false, "use MI (multiple instance) rather than SI when using external memory") + flag.BoolVar(&positionalContexts, "pos", true, "use positional contexts") + var vectorOutputPath = flag.String("output", "", "where to save vectors") + flag.BoolVar(&periodIsWhitespace, "periodiswhitespace", false, "treat period as whitespace") + + flag.Usage = func() { + fmt.Printf("Usage: lexvec [options]\nOptions:\n") + flag.PrintDefaults() + } + flag.Parse() + if *postWindow == 0 { + *postWindow = window + } + if postSubsample == 0 { + postSubsample = *subsample + } + dim = uint64(*dimRaw) + var err error + var coocStream *os.File + var coocStreamFileSize int64 + if *printCooc || externalMemory { + if len(*coocPath) == 0 || len(*coocTotalsPath) == 0 { + logit("FATAL ERROR: coocpath and cooctotalspath are required arguments", true, ERROR) + os.Exit(1) + } + if *printCooc { + coocStream, err = os.Create(*coocPath) + check(err) + } else { + coocStream, err = os.Open(*coocPath) + check(err) + coocStat, err := os.Stat(*coocPath) + 
check(err) + coocStreamFileSize = coocStat.Size() + } + + } + var corpus *os.File + var corpusFileSize int64 + if *printCooc || !externalMemory { + if len(*corpusPath) == 0 { + logit("FATAL ERROR: corpus is a required argument", true, ERROR) + os.Exit(1) + } + corpus, err = os.Open(*corpusPath) + check(err) + corpusStat, err := os.Stat(*corpusPath) + check(err) + corpusFileSize = corpusStat.Size() + } + if !externalMemory || !*printCooc { + if len(*vectorOutputPath) == 0 { + logit("FATAL ERROR: output is a required argument", true, ERROR) + os.Exit(1) + } + } + vocab := make(map[string]*Word) + var vocabSize uint64 + var vocabList []*Word + iVocab := make(map[uint64]*Word) + ctxVocab := vocab + var ctxVocabSize uint64 + var ctxVocabList []*Word + iCtxVocab := iVocab + if positionalContexts { + ctxVocab = make(map[string]*Word) + iCtxVocab = make(map[uint64]*Word) + } + if len(*readVocabPath) > 0 { + logit("reading vocab", true, INFO) + vocabFile, err := os.Open(*readVocabPath) + check(err) + s := createScanner(vocabFile) + for s.Scan() { + w := s.Text() + s.Scan() + freq, err := strconv.ParseUint(s.Text(), 10, 64) + check(err) + vocab[w] = &Word{w, 0, freq, make(map[uint64]float64), 0} + s.Scan() // kill context-break + } + if positionalContexts { + var i uint64 + logit("reading context vocab", true, INFO) + vocabFile, err := os.Open(*readVocabPath + CONTEXT_SUFFIX) + check(err) + s := bufio.NewScanner(vocabFile) + s.Split(bufio.ScanLines) + for s.Scan() { + parts := strings.Split(s.Text(), " ") + w := parts[0] + coocsString := parts[1] + coocs, err := strconv.ParseFloat(coocsString, 64) + check(err) + mapw := &Word{w, i, uint64(coocs), make(map[uint64]float64), coocs} + ctxVocab[w] = mapw + iCtxVocab[i] = mapw + ctxVocabList = append(ctxVocabList, mapw) + i++ + } + ctxVocabSize = i + } + } else { + logit("build vocab", true, INFO) + s := createScanner(corpus) + for s.Scan() { + if vocabSize%1000 == 0 { + logit(fmt.Sprintf("%d\r", vocabSize), false, DEBUG) + } + 
tok := s.Text() + _, ok := vocab[tok] + if !ok { + vocab[tok] = &Word{tok, 0, 0, make(map[uint64]float64), 0} + vocabSize++ + } + vocab[tok].freq += 1 + } + } + var i = uint64(0) + var newVocabList []*Word + if _, ok := vocab[CTXBREAK]; !ok { + vocab[CTXBREAK] = &Word{CTXBREAK, 0, 0, make(map[uint64]float64), 0} + } + ctxbreakw = vocab[CTXBREAK] + for _, v := range vocab { + vocabList = append(vocabList, v) + } + sort.Sort(ByFreq(vocabList)) + for _, w := range vocabList { + if w.freq < uint64(*minFreq) { + delete(vocab, w.w) + continue + } + w.i = i + iVocab[i] = w + i++ + newVocabList = append(newVocabList, w) + corpusSize += w.freq + } + vocabList = newVocabList + if !positionalContexts { + ctxVocabList = vocabList + } else if len(ctxVocabList) == 0 { + var i uint64 + logit("creating positional vocab words", true, INFO) + for _, w := range vocabList { + if w == ctxbreakw { + continue + } + for j := -window; j <= window; j++ { + if j == 0 { + continue + } + posW := w.posW(j) + w := &Word{posW, i, 0, make(map[uint64]float64), 0} + ctxVocab[posW] = w + iCtxVocab[i] = w + i++ + ctxVocabList = append(ctxVocabList, w) + } + } + } + vocabSize = uint64(len(vocabList)) + ctxVocabSize = uint64(len(ctxVocabList)) + logit(fmt.Sprintf("vocab size: %d\ncontext vocab size: %d\ncorpus size: %d", vocabSize, ctxVocabSize, corpusSize), true, INFO) + var noiseSampler Sampler + if !externalMemory || *printCooc { + logit("identify coocurrence", true, INFO) + corpus.Seek(0, 0) + s := createScanner(corpus) + coocurrenceCounter := uint64(0) + buf := NewRingBuffer(MAX_SENTENCE_LENGTH) + var coocStreamWriter *bufio.Writer + if *printCooc { + coocStreamWriter = bufio.NewWriter(coocStream) + } + for s.Scan() { + coocurrenceCounter++ + if coocurrenceCounter%1000 == 0 { + logit(fmt.Sprintf("%d\r", coocurrenceCounter), false, DEBUG) + } + wordInStream := s.Text() + mapw, ok := vocab[wordInStream] + if !ok { + continue + } + if mapw != ctxbreakw && *subsample > 0 { + subsampleP := 
mapw.SubsampleP(*subsample, corpusSize) + if subsampleP > 0 { + bernoulliTrial := randng.Float64() // uniform dist 0.0 - 1.0 + if bernoulliTrial <= subsampleP { + continue + } + } + } + if mapw == ctxbreakw || buf.Len() == MAX_SENTENCE_LENGTH { + // process, clear buf, add token + for j := 0; j < buf.Len(); j++ { + target := buf.Get(j).(*Word) + win := window + if *weightedWindow { + win = 1 + randng.Intn(window) + } + start := max(0, j-win) + end := min(buf.Len(), j+win+1) + for i := start; i < end; i++ { + if i == j { + continue + } + mapc := buf.Get(i).(*Word) + if positionalContexts { + posW := mapc.posW(i - j) + mapc, _ = ctxVocab[posW] + } + inc := float64(1) + target.totalCooc += inc + if positionalContexts { + mapc.totalCooc += inc + } + if *printCooc { + coocStreamWriter.WriteString(target.w) + coocStreamWriter.WriteString(" ") + coocStreamWriter.WriteString(mapc.w) + coocStreamWriter.WriteString("\n") + } else { + cooc, ok := target.cooc[mapc.i] + if !ok { + cooc = 0 + } + target.cooc[mapc.i] = cooc + inc + } + } + } + buf.Clear() + } + if mapw != ctxbreakw { + buf.Push(mapw) + } + } + if positionalContexts { + for _, w := range ctxVocabList { + w.freq = uint64(w.totalCooc) + } + } + if *printCooc { + logit("creating vocab sampling distribution", true, INFO) + noiseSampler = NewUnigramDist(ctxVocabList, 1e8, *unigramPower) + // now add the negative samples + logit("adding negative samples", true, INFO) + coocurrenceCounter = 0 + for _, mapw := range vocabList { + if mapw == ctxbreakw { + continue + } + numSamples := uint64(math.Ceil(float64(mapw.freq) * mapw.KeepP(*subsample, corpusSize) * float64(*noise))) + for i := uint64(0); i < numSamples; i++ { + coocurrenceCounter++ + if coocurrenceCounter%1000 == 0 { + logit(fmt.Sprintf("%d\r", coocurrenceCounter), false, DEBUG) + } + mapc := noiseSampler.Sample(randng) + if mapw == mapc { + continue + } + for mapc == ctxbreakw { + mapc = noiseSampler.Sample(randng) + + } + coocStreamWriter.WriteString(mapw.w) + 
coocStreamWriter.WriteString(" ") + coocStreamWriter.WriteString(mapc.w) + coocStreamWriter.WriteString(" *") + coocStreamWriter.WriteString("\n") + } + } + coocStreamWriter.Flush() + } + } + if len(*vocabPath) > 0 { + logit("saving vocab", true, INFO) + vocabOutput, err := os.Create(*vocabPath) + if err != nil { + panic("unable to open vocab path") + } + for _, w := range vocabList { + fmt.Fprintf(vocabOutput, "%s %d\n", w.w, w.freq) + } + vocabOutput.Close() + if positionalContexts { + vocabOutput, err := os.Create(*vocabPath + CONTEXT_SUFFIX) + if err != nil { + panic("unable to open context vocab path") + } + for _, w := range ctxVocabList { + fmt.Fprintf(vocabOutput, "%s %f\n", w.w, w.totalCooc) + } + vocabOutput.Close() + } + } + if externalMemory && *printCooc && len(*coocTotalsPath) > 0 { + coocTotalsOutput, err := os.Create(*coocTotalsPath) + check(err) + logit("writing cooc totals", true, INFO) + for _, w := range vocabList { + fmt.Fprintf(coocTotalsOutput, "%s %f\n", w.w, w.totalCooc) + } + coocTotalsOutput.Close() + } + if *printCooc { + logit("done", true, INFO) + os.Exit(0) + } + sort.Sort(ByFreq(ctxVocabList)) + noiseSampler = NewUnigramDist(ctxVocabList, 1e8, *unigramPower) + if externalMemory && !*printCooc { + logit("reading cooc totals", true, INFO) + coocTotalsStream, err := os.Open(*coocTotalsPath) + check(err) + s := bufio.NewScanner(coocTotalsStream) + s.Split(bufio.ScanLines) + for s.Scan() { + parts := strings.Split(s.Text(), " ") + w := parts[0] + coocsString := parts[1] + coocs, err := strconv.ParseFloat(coocsString, 64) + check(err) + vocab[w].totalCooc = coocs + } + coocTotalsStream.Close() + } + for _, w := range ctxVocabList { + cdsTotal += math.Pow(w.totalCooc, contextDistributionSmoothing) + } + logit(fmt.Sprintf("cds total: %f", cdsTotal), true, INFO) + if !externalMemory { + logit("calculating "+matrix+" matrix", true, INFO) + for _, w := range vocabList { + ppmiTotal := float64(0) + for c, p := range w.cooc { + switch matrix { + 
case PPMI_MATRIX: + p = w.Ppmi(iCtxVocab[c]) + case PMI_MATRIX: + p = w.Pmi(iCtxVocab[c]) + case LOG_COOC_MATRIX: + p = w.LogCooc(iCtxVocab[c]) + // case COOC_MATRIX not needed as is exactly p + } + w.cooc[c] = p + ppmiTotal += p * p + } + } + } + mVec = make([]float64, vocabSize*dim) + mCtx = make([]float64, ctxVocabSize*dim) + bVec = make([]float64, vocabSize) + bCtx = make([]float64, ctxVocabSize) + if adagrad { + mVecGrad = make([]float64, vocabSize*dim) + bVecGrad = make([]float64, vocabSize) + mCtxGrad = make([]float64, ctxVocabSize*dim) + bCtxGrad = make([]float64, ctxVocabSize) + } + logit("create vectors", true, INFO) + for j := uint64(0); j < vocabSize; j++ { + for k := uint64(0); k < dim; k++ { + mVec[j*dim+k] = (randng.Float64() - 0.5) / float64(dim) + if adagrad { + mVecGrad[j*dim+k] = 1.0 + } + } + bVec[j] = (randng.Float64() - 0.5) / float64(dim) + if adagrad { + bVecGrad[j] = 1.0 + } + } + for j := uint64(0); j < ctxVocabSize; j++ { + for k := uint64(0); k < dim; k++ { + mCtx[j*dim+k] = (randng.Float64() - 0.5) / float64(dim) + if adagrad { + mCtxGrad[j*dim+k] = 1.0 + } + } + bCtx[j] = (randng.Float64() - 0.5) / float64(dim) + if adagrad { + bCtxGrad[j] = 1.0 + } + } + logit("running lexvec", true, INFO) + processed := uint64(0) + bytesRead := uint64(0) + var wg sync.WaitGroup + avgError := float64(0) + avgErrorNum := uint64(0) + refAlpha := *initialAlpha + for threadId := 0; threadId < *numThreads; threadId++ { + wg.Add(1) + go func(threadId int) { + alpha := *initialAlpha + randn := rand.New(rand.NewSource(int64(threadId))) + deltaVec := make([]float64, dim) + if externalMemory { + coocStream, err := os.Open(*coocPath) + check(err) + coocStreamOffsetStart := (coocStreamFileSize / int64(*numThreads)) * int64(threadId) + coocStreamOffsetEnd := (coocStreamFileSize / int64(*numThreads)) * int64(threadId+1) + if coocStreamOffsetEnd >= coocStreamFileSize { + coocStreamOffsetEnd = coocStreamFileSize + } + for iter := 0; iter < *iterations; iter++ { + _, 
err := coocStream.Seek(coocStreamOffsetStart, 0) + check(err) + + s := bufio.NewScanner(coocStream) + s.Split(bufio.ScanLines) + // eat first line to make sure no junk + if !s.Scan() { + logit("got nothing, exiting", true, INFO) + wg.Done() + return + } + for s.Scan() { + curPos, aerr := coocStream.Seek(0, 1) + check(aerr) + if curPos > coocStreamOffsetEnd { + break + } + bytesRead += uint64(len(s.Text())) + processed++ + if processed%1000 == 0 { + if *decayAlpha && !adagrad { + alpha = *initialAlpha * (float64(1) - (float64(bytesRead) / (float64(coocStreamFileSize) * float64(*iterations)))) + if alpha < *initialAlpha*0.0001 { + alpha = *initialAlpha * 0.0001 + } + } + if threadId == 0 { + refAlpha = alpha + } + } + parts := strings.Split(s.Text(), " ") + w := parts[0] + mapw, ok1 := vocab[w] + c := parts[1] + mapc, ok2 := ctxVocab[c] + pText := parts[2] + p, nok3 := strconv.ParseFloat(pText, 64) + if !ok1 || !ok2 || nok3 != nil { + panic("problem parsing") + } + isNoise := len(parts) > 3 && parts[3] == "*" + learningIterations := uint64(math.Ceil(p)) + if isNoise && len(parts) == 5 { + // negative sample with non-zero ppmi, cooc is stored in last part + isNoise = false + p, nok3 = strconv.ParseFloat(parts[4], 64) + if nok3 != nil { + panic("problem parsing2") + } + } + y := float64(0) + if !isNoise { + switch matrix { + case PPMI_MATRIX: + y = mapw.PpmiDirect(mapc, p) + case PMI_MATRIX: + y = mapw.PmiDirect(mapc, p) + case LOG_COOC_MATRIX: + y = mapw.LogCoocDirect(mapc, p) + // case COOC_MATRIX not needed as is exactly p + } + } + if !*mi { + learningIterations = 1 + } + for i := uint64(0); i < learningIterations; i++ { + err := learn(mapw, mapc, noiseSampler, randn, 0, deltaVec, alpha, y) + avgError += err + avgErrorNum++ + } + } + if threadId == 0 { + avgError /= float64(avgErrorNum) + logit(fmt.Sprintf("iteration %d MSE = %f", iter+1, avgError), true, INFO) + avgError = 0 + avgErrorNum = 0 + } + } + } else { + corpus, err := os.Open(*corpusPath) + check(err) + 
corpusOffsetStart := (corpusFileSize / int64(*numThreads)) * int64(threadId) + corpusOffsetEnd := (corpusFileSize / int64(*numThreads)) * int64(threadId+1) + if corpusOffsetEnd >= corpusFileSize { + corpusOffsetEnd = corpusFileSize + } + buf := NewRingBuffer(MAX_SENTENCE_LENGTH) + for iter := 0; iter < *iterations; iter++ { + _, err := corpus.Seek(corpusOffsetStart, 0) + check(err) + + s := createScanner(corpus) + if !s.Scan() { + // consume first word in case it's partial + logit("got nothing, exiting", true, INFO) + wg.Done() + return + } + buf.Clear() + for s.Scan() { + curPos, err := corpus.Seek(0, 1) + check(err) + if curPos > corpusOffsetEnd { + break + } + processed++ + if processed%1000 == 0 { + if *decayAlpha && !adagrad { + alpha = *initialAlpha * (float64(1) - (float64(processed) / (float64(corpusSize) * float64(*iterations)))) + if alpha < *initialAlpha*0.0001 { + alpha = *initialAlpha * 0.0001 + } + } + if threadId == 0 { + refAlpha = alpha + } + } + wordInStream := s.Text() + mapw, ok := vocab[wordInStream] + if !ok { + continue + } + if mapw != ctxbreakw && postSubsample > 0 { + subsampleP := mapw.SubsampleP(postSubsample, corpusSize) + if subsampleP > 0 { + bernoulliTrial := randn.Float64() // uniform dist 0.0 - 1.0 + if bernoulliTrial <= subsampleP { + continue + } + } + } + if mapw == ctxbreakw || buf.Len() == MAX_SENTENCE_LENGTH { + // process, clear buf, add token + for j := 0; j < buf.Len(); j++ { + target := buf.Get(j).(*Word) + win := *postWindow + if *postWeightedWindow { + win = 1 + randn.Intn(*postWindow) + } + start := max(0, j-win) + end := min(buf.Len(), j+win+1) + for i := start; i < end; i++ { + if i == j { + continue + } + mapc := buf.Get(i).(*Word) + if positionalContexts { + posW := mapc.posW(i - j) + mapc, _ = ctxVocab[posW] + } + wNoise := 0 + if *sgNoise { + wNoise = *noise + } + err := learn(target, mapc, noiseSampler, randn, wNoise, deltaVec, alpha, 0) + avgError += err + avgErrorNum++ + } + if !*sgNoise { + for i := 0; i < 
*noise; i++ { + mapc := noiseSampler.Sample(randn) + if target == mapc { + continue + } + for mapc == ctxbreakw { + mapc = noiseSampler.Sample(randn) + } + err := learn(target, mapc, noiseSampler, randn, 0, deltaVec, alpha, 0) + avgError += err + avgErrorNum++ + } + } + } + buf.Clear() + } + if mapw != ctxbreakw { + buf.Push(mapw) + } + } + if threadId == 0 { + avgError /= float64(avgErrorNum) + logit(fmt.Sprintf("iteration %d MSE = %f", iter+1, avgError), true, INFO) + avgError = 0 + avgErrorNum = 0 + } + } + } + wg.Done() + }(threadId) + } + var done = false + go func() { + time.Sleep(time.Second) + previousProcessed := processed + previousTime := time.Now() + for !done { + speed := float64(processed-previousProcessed) / float64(*numThreads) / float64(1000) / time.Since(previousTime).Seconds() + previousProcessed = processed + previousTime = time.Now() + logit(fmt.Sprintf("%d alpha %f speed %.1fk words/thread/s\r", processed, refAlpha, speed), false, DEBUG) + time.Sleep(time.Second) + } + }() + wg.Wait() + done = true + logit("outputting vectors", true, INFO) + vectorOutput, err := os.Create(*vectorOutputPath) + check(err) + outputStream := bufio.NewWriter(vectorOutput) + outputStream.WriteString(fmt.Sprintf("%d %d\n", len(vocabList), dim)) + for _, w := range vocabList { + outputStream.WriteString(w.w) + for j := uint64(0); j < dim; j++ { + v := mVec[w.i*dim+j] + if *model == 2 { + if !positionalContexts { + v += mCtx[w.i*dim+j] + } else { + for k := -window; k <= window; k++ { + if k == 0 { + continue + } + posC := w.posW(k) + c, _ := ctxVocab[posC] + v += mCtx[c.i*dim+j] + } + } + } + fmt.Fprintf(outputStream, " %f", v) + } + outputStream.WriteString("\n") + } + outputStream.Flush() + if *model == 0 { + ctxOutput, err := os.Create(*vectorOutputPath + CONTEXT_SUFFIX) + check(err) + ctxOutputStream := bufio.NewWriter(ctxOutput) + fmt.Fprintf(ctxOutputStream, "%d %d\n", len(ctxVocabList), dim) + for _, w := range ctxVocabList { + ctxOutputStream.WriteString(w.w) 
+ for j := uint64(0); j < dim; j++ { + fmt.Fprintf(ctxOutputStream, " %f", mCtx[w.i*dim+j]) + } + ctxOutputStream.WriteString("\n") + } + ctxOutputStream.Flush() + } + logit("finished!", true, INFO) +} diff --git a/line_merge.py b/line_merge.py new file mode 100644 index 0000000..6d79af5 --- /dev/null +++ b/line_merge.py @@ -0,0 +1,49 @@ +# Copyright (c) 2016 Salle, Alexandre +# Author: Salle, Alexandre +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
"""Post-process sorted ``word context count [*]`` lines for external memory.

Reads the lines produced by the ``sort | uniq -c | awk`` stage of
``pairs_to_counts.sh`` on stdin and writes the merged lines to stdout.

The LexVec training loop that consumes this output treats a fourth field
of ``*`` as a noise (negative) sample, and a fifth field, when present, as
the co-occurrence count of that pair.
"""

import itertools
import math
import os
import sys


def merge_lines(lines, mi=False):
    """Yield the output lines derived from sorted count ``lines``.

    Each non-blank input line is split on whitespace into at least three
    fields (``word context count``), optionally followed by ``*``.

    * A line ending in ``*`` whose first two fields equal those of the
      line immediately before it gets that previous line's count appended
      as an extra trailing field.
    * When ``mi`` is false each line is emitted ``ceil(count)`` times;
      when ``mi`` is true each line is emitted exactly once.

    Every yielded string ends with a newline.

    Raises:
        ValueError: if a line has fewer than three fields or its third
            field is not a number.
    """
    previous = None
    for raw in lines:
        text = raw.strip()
        if not text:
            # A stray blank line must not silently discard the rest of
            # the stream, so skip it instead of stopping.
            continue
        fields = text.split()
        if len(fields) < 3:
            raise ValueError(
                "expected 'word context count [*]', got %r" % (text,))
        count = float(fields[2])

        out = text
        if (fields[-1] == "*" and previous is not None
                and fields[:2] == previous[:2]):
            # Same (word, context) pair as the preceding line: carry its
            # count along as the trailing field.
            out += " " + previous[2]
        out += "\n"

        times = 1 if mi else int(math.ceil(count))
        # Yield one copy at a time so a huge count never builds a huge
        # string; a non-positive count yields nothing.
        for _ in itertools.repeat(None, times):
            yield out
        previous = fields


def main():
    """Filter stdin to stdout; ``MI=true`` in the environment disables
    the per-count repetition of lines."""
    mi = os.environ.get('MI', 'false') == 'true'
    write = sys.stdout.write
    for out in merge_lines(sys.stdin, mi):
        write(out)


if __name__ == "__main__":
    main()
"""Add context vectors to word vectors.

usage: merge_context_vectors.py VECTORS

Reads ``VECTORS`` and ``VECTORS.context`` (text format: a ``<count> <dim>``
header line followed by ``token v1 ... v<dim>`` rows) and writes
``VECTORS.merged`` in the same format, where each word vector has the sum
of its context vectors added to it.
"""

import argparse
import re
import sys

# Positional context tokens look like ``word_-2``; group 1 is the bare word.
_POSITIONAL = re.compile(r"(.+)_([-0-9]+)$")


def _read_header(stream):
    """Parse the leading ``<count> <dim>`` line of a vector file."""
    fields = stream.readline().split()
    if len(fields) != 2:
        raise ValueError("expected '<count> <dim>' header, got %r" % (fields,))
    return int(fields[0]), int(fields[1])


def _parse_row(line, dim):
    """Return ``(token, values)`` for a vector row, or None for a blank line.

    Only the first ``dim`` values are used; a row with fewer than ``dim``
    values raises ValueError.
    """
    fields = line.split()
    if not fields:
        return None
    values = [float(x) for x in fields[1:1 + dim]]
    if len(values) != dim:
        raise ValueError(
            "row for %r has %d values, expected %d"
            % (fields[0], len(values), dim))
    return fields[0], values


def _sum_context_vectors(ctx, dim, positional):
    """Return ``{word: summed context vector}`` for the rows in ``ctx``.

    With ``positional`` set, a trailing ``_<offset>`` is stripped from each
    token so that all positions of a word accumulate under the bare word.
    A token without such a suffix is kept unchanged.
    """
    totals = {}
    for line in ctx:
        row = _parse_row(line, dim)
        if row is None:
            continue
        token, values = row
        if positional:
            found = _POSITIONAL.match(token)
            if found:
                token = found.group(1)
        known = totals.get(token)
        if known is None:
            totals[token] = values
        else:
            for i, value in enumerate(values):
                known[i] += value
    return totals


def merge(vec, ctx, merged):
    """Write word vectors plus their context vectors to ``merged``.

    Args:
        vec: readable text stream of word vectors (header line first).
        ctx: readable text stream of context vectors (header line first).
        merged: writable text stream receiving the merged vectors.

    The context file is treated as positional when its header count
    differs from the word file's count.  Words with no context vector are
    written unchanged.

    Returns:
        The number of words for which no context vector was found.

    Raises:
        ValueError: on a malformed header or row, or when the two files
            declare different dimensions.
    """
    num, dim = _read_header(vec)
    num_ctx, ctx_dim = _read_header(ctx)
    if dim != ctx_dim:
        raise ValueError(
            "dimension mismatch: vectors have %d, contexts have %d"
            % (dim, ctx_dim))
    merged.write("%d %d\n" % (num, dim))

    totals = _sum_context_vectors(ctx, dim, positional=(num != num_ctx))

    not_found = 0
    for line in vec:
        row = _parse_row(line, dim)
        if row is None:
            continue
        word, values = row
        extra = totals.get(word)
        if extra is None:
            not_found += 1
        else:
            values = [a + b for a, b in zip(values, extra)]
        merged.write(
            "%s %s\n" % (word, ' '.join("%.6f" % (v,) for v in values)))
    return not_found


def main(argv=None):
    """Command-line entry point; ``argv`` defaults to ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("vectors", type=str)
    vectors = parser.parse_args(argv).vectors
    with open(vectors) as vec:
        with open(vectors + '.context') as ctx:
            with open(vectors + '.merged', 'w') as merged:
                not_found = merge(vec, ctx, merged)
    sys.stdout.write("not found was %d\n" % (not_found,))


if __name__ == "__main__":
    main()
# This script can be sped up significantly by using nsort and pypy. Uncomment lines (and comment corresponding lines) below if you have these installed.
#
# Reads whitespace-separated pair lines on stdin, counts identical lines,
# rewrites each as "<field1> <field2> <count> <field3>", runs line_merge.py
# over the result and writes the shuffled lines to stdout.
#
# Environment:
#   MI      exported for line_merge.py (default: false)
#   MEMORY  sort buffer size in GB, also exported for shuffle.py (default: 4)

set -e
# Without pipefail a failing sort/uniq would be masked by awk's exit status.
set -o pipefail

export MI=${MI:-false}
export MEMORY=${MEMORY:-4}
# Resolve the helper scripts next to this file so the script also works
# when invoked from another working directory.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
TMPFILE=$(mktemp)
TMPFILE2=$(mktemp)
# Remove the intermediates even when a stage fails under `set -e`.
trap 'rm -f "$TMPFILE" "$TMPFILE2"' EXIT

echo "sorting to $TMPFILE" 1>&2
# LC_ALL=C makes sort and uniq compare raw bytes, so grouping of identical
# lines does not depend on the caller's locale collation rules.
LC_ALL=C sort -k1,2 -S "${MEMORY}G" - | LC_ALL=C uniq -c | awk '{print $2 " " $3 " " $1 " " $4}' > "$TMPFILE"
# ./nsort -T"$TMPDIR" - | uniq -c | awk '{print $2 " " $3 " " $1 " " $4}' > "$TMPFILE"

echo 'removing lines ending in * if they match previous line' 1>&2
python "$SCRIPT_DIR/line_merge.py" < "$TMPFILE" > "$TMPFILE2"
# pypy "$SCRIPT_DIR/line_merge.py" < "$TMPFILE" > "$TMPFILE2"
# Free the disk space before the shuffle writes its own temporary files.
rm -f "$TMPFILE"

echo "shuffling $TMPFILE2" 1>&2
python "$SCRIPT_DIR/shuffle.py" < "$TMPFILE2"
# pypy "$SCRIPT_DIR/shuffle.py" < "$TMPFILE2"
"""Shuffle the lines of stdin to stdout using a bounded amount of memory.

Input is split into chunks of roughly ``memory`` characters; each chunk is
shuffled and spilled to a temporary file.  The chunks are then read back
in rounds, a slice from every chunk per round, and each round is shuffled
again before being written out.
"""

import os
import random
import sys
import tempfile


def _memory_budget():
    """Return the chunk budget in characters from ``$MEMORY`` (in GB)."""
    # need better way to estimate this given overhead for str. setting to
    # 0.5GB uses roughly 4GB of ram.
    return int(float(os.environ.get('MEMORY', 4.0)) / 8 * 1024**3)


def shuffle_and_close(buf, f):
    """Shuffle ``buf`` in place, write its lines to ``f`` and close ``f``."""
    random.shuffle(buf)
    f.writelines(buf)
    f.close()


def _spill(chunk, paths):
    """Write shuffled ``chunk`` to a new temp file, recording it in ``paths``.

    The path is recorded before writing so the caller can still remove the
    file if the write fails.
    """
    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
    paths.append(tmp.name)
    shuffle_and_close(chunk, tmp)


def shuffle_stream(lines, out, memory):
    """Write the lines of ``lines`` to ``out`` in shuffled order.

    Args:
        lines: iterable of text lines.
        out: object with a ``writelines`` method.
        memory: approximate number of characters to hold in memory.

    Every input line appears exactly once in the output, newline
    terminated.  Empty input produces no output.  All temporary files are
    removed before returning, including when an error occurs.
    """
    paths = []
    readers = []
    try:
        chunk = []
        used = 0
        for line in lines:
            if not line.endswith("\n"):
                # An unterminated last line would otherwise fuse with
                # whichever line follows it after shuffling.
                line += "\n"
            chunk.append(line)
            used += len(line)
            if used >= memory:
                _spill(chunk, paths)
                chunk = []
                used = 0
        if chunk:
            _spill(chunk, paths)

        for path in paths:
            readers.append(open(path))
        while readers:
            # readlines() takes a size hint in characters, not a line
            # count, and a hint <= 0 means "read everything", so give each
            # file an equal share of the budget and never less than 1.
            hint = max(1, int(memory // len(readers)))
            batch = []
            remaining = []
            for reader in readers:
                part = reader.readlines(hint)
                if part:
                    batch.extend(part)
                    remaining.append(reader)
                else:
                    reader.close()
            readers = remaining
            random.shuffle(batch)
            out.writelines(batch)
    finally:
        for reader in readers:
            reader.close()
        for path in paths:
            try:
                os.unlink(path)
            except OSError:
                # Best-effort cleanup: never mask the original error.
                pass


def main():
    """Shuffle stdin to stdout with the budget taken from ``$MEMORY``."""
    shuffle_stream(sys.stdin, sys.stdout, _memory_budget())


if __name__ == "__main__":
    main()
Copyright (c) 2016 Salle, Alexandre +# Author: Salle, Alexandre +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
"""word2vec-compatible command-line front end for the lexvec binary.

Accepts word2vec's single-dash options, translates the supported ones to
lexvec flags, silently accepts and drops the unsupported ones, and runs
lexvec with its standard output redirected to the ``-output`` file.
"""

import argparse
import os
import subprocess
import sys

# (word2vec option, lexvec flag, default, required, help)
arg_map = [
    ('train', 'corpus', '', True, 'tokenized corpus used for training'),
    ('output', 'output', '', True, 'file where to output vectors'),
    ('size', 'dim', '100', False, 'dimensions of vectors'),
    ('window', 'window', '2', False, 'window to each side of target word'),
    ('sample', 'subsample', '1e-3', False, 'subsampling corpus'),
    ('negative', 'negative', '5', False, '# of negative samples'),
    ('threads', 'threads', '12', False, '# of threads'),
    ('iter', 'iterations', '5', False, '# of iterations'),
    ('min-count', 'minfreq', '5', False, 'remove from vocabulary words that appear less than # times'),
    ('alpha', 'alpha', '0.025', False, 'learning rate'),
    ('debug', 'verbose', '2', False, 'debug info (0 for none, 1 for some, 2 for lots)'),
    ('save-vocab', 'savevocab', None, False, 'save vocab to file'),
    ('read-vocab', 'readvocab', None, False, 'read vocab from file'),
]

# Path of the lexvec binary.  Defaults to a ``lexvec`` file next to this
# script instead of a path that only exists on one machine; override with
# the LEXVEC environment variable.
LEXVEC = os.environ.get(
    'LEXVEC',
    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lexvec'))

# word2vec options that are accepted but never forwarded to lexvec.
ignore = ['hs', 'classes', 'binary', 'cbow']


def _build_parser():
    """Return the argparse parser mirroring word2vec's options."""
    parser = argparse.ArgumentParser(
        description="word2vec interface to lexvec",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    for name, dest, default, required, help_text in arg_map:
        parser.add_argument('-' + name, dest=dest, default=default,
                            required=required, help=help_text)
    for name in ignore:
        parser.add_argument('-' + name, dest=name, default=None)
    return parser


def build_command(argv=None):
    """Translate word2vec-style ``argv`` into a lexvec invocation.

    Args:
        argv: argument list; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``(cmd, output)`` where ``cmd`` is the argument list starting with
        ``LEXVEC`` and ``output`` is the value given to ``-output``.  Flags
        follow the order of ``arg_map``; options with an empty or missing
        value, and the ``-output`` option itself, are left out of ``cmd``.
    """
    options = vars(_build_parser().parse_args(argv))
    output = options.pop('output')
    cmd = [LEXVEC]
    for _, dest, _, _, _ in arg_map:
        value = options.get(dest)
        if value:
            cmd.extend(['-' + dest, value])
    return cmd, output


def main(argv=None):
    """Run lexvec and return its exit status."""
    cmd, output = build_command(argv)
    sys.stdout.write(' '.join(cmd) + '\n')
    # Flush so the echoed command is not reordered after the child's output.
    sys.stdout.flush()
    # NOTE(review): -output is not forwarded to lexvec; the file receives
    # whatever lexvec prints on stdout - confirm this is still intended.
    with open(output, 'w') as out:
        return subprocess.call(cmd, stdout=out, stderr=sys.stderr)


if __name__ == '__main__':
    # Propagate lexvec's exit status instead of always exiting with 0.
    sys.exit(main())