Skip to content

Commit

Permalink
Merge commit 'd787bf8e5e3d01672974b1214bab6c33e763b313' into laser
Browse files Browse the repository at this point in the history
  • Loading branch information
ceshine committed Jun 8, 2019
2 parents 64d01b4 + d787bf8 commit 7c4b87c
Show file tree
Hide file tree
Showing 28 changed files with 833 additions and 883 deletions.
3 changes: 3 additions & 0 deletions LASER_PROJECT/.dockerignore
@@ -0,0 +1,3 @@
models/
tools-external/
notebooks/cache/
9 changes: 9 additions & 0 deletions LASER_PROJECT/.gitignore
Expand Up @@ -3,3 +3,12 @@ source/lib/__pycache__
models
tools-external
tasks/mldoc/MLDoc
embed
tasks/bucc/downloaded
tasks/similarity/dev/
tasks/xnli/XNLI-1.0*
tasks/xnli/multinli_1.0*
.??*swp
.mypy_cache
notebooks/cache
notebooks/.ipynb_checkpoints/
49 changes: 49 additions & 0 deletions LASER_PROJECT/Dockerfile.cpu
@@ -0,0 +1,49 @@
# CPU environment
#
# Builds a CPU-only LASER image on top of Ubuntu 18.04:
#   1. installs basic build/download utilities via apt,
#   2. installs Miniconda into $CONDA_DIR,
#   3. creates an unprivileged user with passwordless sudo,
#   4. installs the Python dependencies with conda and pip,
#   5. copies the project to /src and runs the model / external-tool installers.
#
# Build arguments:
#   CONDA_PYTHON_VERSION  Major Python version of the Miniconda installer.
#   USERNAME / USERID     Name and uid of the unprivileged user.
#   CONDA_DIR             Miniconda installation prefix.
FROM ubuntu:18.04

LABEL maintainer="ceshine@ceshine.net"

ARG CONDA_PYTHON_VERSION=3
ARG USERNAME=docker
ARG USERID=1000
ARG CONDA_DIR=/opt/conda

# Install basic utilities
RUN apt-get update && \
    apt-get install -y --no-install-recommends git wget unzip bzip2 sudo build-essential ca-certificates && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install miniconda
ENV PATH=$CONDA_DIR/bin:$PATH
# The profile snippet is written with double quotes so that $CONDA_DIR is
# expanded now, at build time: build ARGs are not present in the runtime
# environment, so a literal "$CONDA_DIR" would expand to nothing in login
# shells. \$PATH stays literal so it is resolved when the snippet is sourced.
RUN wget --quiet https://repo.continuum.io/miniconda/Miniconda$CONDA_PYTHON_VERSION-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \
    echo "export PATH=$CONDA_DIR/bin:\$PATH" > /etc/profile.d/conda.sh && \
    /bin/bash /tmp/miniconda.sh -b -p $CONDA_DIR && \
    rm -rf /tmp/*

# Create the user
# The user owns the conda prefix (so packages can be installed without root)
# and is granted passwordless sudo.
RUN useradd --create-home -s /bin/bash --no-user-group -u $USERID $USERNAME && \
    chown $USERNAME $CONDA_DIR -R && \
    adduser $USERNAME sudo && \
    echo "$USERNAME ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

USER $USERNAME
WORKDIR /home/$USERNAME

# -y: the build is non-interactive, so never wait on a confirmation prompt.
# "conda clean -tipy": the -s (source cache) flag of the former "-tipsy" is
# dropped; newer conda releases are believed to reject it (the installer
# above is "-latest") — TODO confirm.
RUN conda install -y pytorch-cpu torchvision-cpu faiss-cpu -c pytorch && \
    conda install -y cython jupyter && \
    conda clean -tipy
RUN pip install --no-cache-dir jieba beautifulsoup4

# The install scripts below locate the project through $LASER.
ENV LASER=/src
COPY . /src
# COPY creates root-owned files; hand them over to the unprivileged user.
RUN sudo chown -R $USERNAME /src
WORKDIR /src

# RUN conda install -y cython jupyter && \
# conda clean -tipsy
RUN bash ./install_models.sh
RUN bash ./install_external_tools.sh

# Activate the base conda environment in interactive shells.
RUN echo ". $CONDA_DIR/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate base" >> ~/.bashrc
415 changes: 23 additions & 392 deletions LASER_PROJECT/LICENSE

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions LASER_PROJECT/README.md
Expand Up @@ -3,7 +3,8 @@
LASER is a library to calculate and use multilingual sentence embeddings.

**NEWS**
* The code to perform bitext mining is [**now available**](tasks/bucc)
* 2019/03/18 switch to BSD license
* 2019/02/13 The code to perform bitext mining is [**now available**](tasks/bucc)

**CURRENT VERSION:**
* We now provide an encoder which was trained on [**93 languages**](#supported-languages), written in 23 different alphabets [6].
Expand All @@ -29,6 +30,7 @@ be found in [6], together with an extensive experimental evaluation.
* Python 3.6
* [PyTorch 1.0](http://pytorch.org/)
* [NumPy](http://www.numpy.org/), tested with 1.15.4
* [Cython](https://pypi.org/project/Cython/), needed by the Python wrapper of FastBPE, tested with 0.29.6
* [Faiss](https://github.com/facebookresearch/faiss), for fast similarity search and bitext mining
* [transliterate 1.10.2](https://pypi.org/project/transliterate), only used for Greek (`pip install transliterate`)
* [jieba 0.39](https://pypi.org/project/jieba/), Chinese segmenter (`pip install jieba`)
Expand Down Expand Up @@ -62,7 +64,7 @@ with code to reproduce our results (in the directory "tasks").

## License

This source code is licensed under the license found in the [`LICENSE`](LICENSE) file in the root directory of this source tree.
LASER is BSD-licensed, as found in the [`LICENSE`](LICENSE) file in the root directory of this source tree.

## Supported languages

Expand Down
8 changes: 4 additions & 4 deletions LASER_PROJECT/install_external_tools.sh
@@ -1,8 +1,8 @@
#!/bin/bash
# Copyright (c) 2017-present, Facebook, Inc.
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
Expand Down Expand Up @@ -106,10 +106,11 @@ InstallFastBPE () {
mv fastBPE-master fastBPE
cd fastBPE
echo " - compiling"
g++ -std=c++11 -pthread -O3 fast.cc -o fast
g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
if [ $? -eq 1 ] ; then
echo "ERROR: compilation failed, please install manually"; exit
fi
python setup.py install
fi
}

Expand Down Expand Up @@ -150,7 +151,6 @@ InstallMecab () {
}



###################################################################
#
# main
Expand Down
17 changes: 9 additions & 8 deletions LASER_PROJECT/install_models.sh
@@ -1,21 +1,21 @@
#!/bin/bash
# Copyright (c) 2017-present, Facebook, Inc.
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
#
#-------------------------------------------------------
#
# This bash script installs sentence encoders from Amazon S3
#

if [ -z ${LASER} ] ; then
if [ -z ${LASER} ] ; then
echo "Please set the environment variable 'LASER'"
exit
fi
Expand All @@ -24,10 +24,11 @@ mdir="${LASER}/models"

# available encoders
s3="https://dl.fbaipublicfiles.com/laser/models"
networks=("bilstm.eparl21.2018-11-19.pt" \
"eparl21.fcodes" "eparl21.fvocab" \
"bilstm.93langs.2018-12-26.pt" \
"93langs.fcodes" "93langs.fvocab")
networks=(
# "bilstm.eparl21.2018-11-19.pt" \
# "eparl21.fcodes" "eparl21.fvocab" \
"bilstm.93langs.2018-12-26.pt" \
"93langs.fcodes" "93langs.fvocab")


echo "Downloading networks"
Expand Down

0 comments on commit 7c4b87c

Please sign in to comment.