Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add Multi-instance translate chapter in README and its script example #361

Merged
merged 17 commits into from
Jun 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tutorials/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ introduce different concepts and parameters used for training and translation.
1. [Image captioning](image_captioning)
1. [Domain adaptation of NMT models](adapt)
1. [Decoding with lexical constraints](constraints)
1. [Process per core translation](process_per_core_translation)
13 changes: 13 additions & 0 deletions tutorials/process_per_core_translation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# CPU process per core translation
On a multi-core processor, running one translation process per core can speed up translation, because some operations cannot be parallelized within a single process.
With this method, the translations on the individual cores run in parallel.

One python script example is given and you can run it as follows:
```bash
> python cpu_process_per_core_translation.py -m model -i input_file_name -o output_file_name -bs batch_size -t true
```

-t true: each core translates the whole input file.

-t false: each core translates (input file lines / number of cores) lines, and the translated pieces are then merged into one complete output file.

Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
# is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

# \Author: {zhiyuan.huang, rong.a.zhang}@intel.com


# Description: This script runs one translation process per core, which can greatly speed up translation performance.

import argparse
import subprocess
import threading
import tempfile
import logging
import string
import os
import time
import mxnet as mx

def add_args(parser):
    """Register the command-line arguments used by this script.

    :param parser: argparse.ArgumentParser to extend in place.
    """
    parser.add_argument("--module", '-m', help='the pretrained module', default="wmt_model")
    parser.add_argument("--input_file", '-i', help='the file input to translate', default='newstest2016.tc.BPE.de')
    parser.add_argument("--output_file", '-o', help='the file saved the result of translation', default='newstest2016.tc.BPE.en')
    # BUG FIX: without type=int a command-line value arrives as a string,
    # which breaks the "%d" formatting of the sockeye command later on.
    parser.add_argument("--batch_size", '-bs', type=int, help='the batch size of process', default=32)
    parser.add_argument("--data_type", '-t', help='if uses run benchmark or not', default="benchmark")

def task(args):
    """Thread target: run *args* as a shell command, ignoring its exit status."""
    subprocess.call(args, shell=True)

def benchmark(cores, args):
    """
    benchmark is used for Processing per core translation. Each core translates the whole input file.
    Return after all translations done.

    Each thread pins its sockeye process to one core via ``taskset``.
    NOTE(review): all threads pass the same ``-o`` output file, so in this
    mode the output file contents are racy — it is only meaningful as a
    throughput benchmark, not for collecting translations.

    :param cores: the number of cores used for translation, each core will launch a thread to translate
    :param args: input parameters (module, input_file, output_file, batch_size)
    """
    model = args.module
    fileInput = args.input_file
    fileOutput = args.output_file
    # Defensive cast: argparse may deliver batch_size as a string, and the
    # "%d" placeholder below requires an int.
    batchsize = int(args.batch_size)

    threads = []
    for core in range(cores):
        command = "taskset -c %d-%d python3 -m sockeye.translate -m %s -i %s -o %s --batch-size %d --output-type benchmark --use-cpu > /dev/null 2>&1 " % (core, core, model, fileInput, fileOutput, batchsize)
        t = threading.Thread(target=task, args=(command,))
        threads.append(t)
        t.start()

    # Block until every per-core translation has finished.
    for t in threads:
        t.join()

def split_file(splitNum, fileInput, lines):
    """
    split_file is used to split fileInput into splitNum small pieces file.
    For example, when splitNum is 56, a 112 lines file will be split into 56 files and each file has 2 lines.

    The first (lines % splitNum) pieces get one extra line so that all of
    the input is distributed.

    :param splitNum: split into splitNum files
    :param fileInput: file to be split
    :param lines: lines of fileInput
    :return: list of tempfile.NamedTemporaryFile objects holding the pieces
             (caller is responsible for closing them)
    """
    # Single pass over the input instead of shelling out to `head | tail`
    # once per piece, which re-read the file splitNum times and broke on
    # paths containing shell metacharacters.
    quot, rema = divmod(lines, splitNum)
    files = []
    with open(fileInput, "rb") as src:
        for i in range(splitNum):
            read_line = quot + 1 if i < rema else quot
            temp = tempfile.NamedTemporaryFile()
            for _ in range(read_line):
                temp.write(src.readline())
            temp.flush()  # make contents visible to the sockeye subprocess
            files.append(temp)

    return files

def translate(cores, files, args):
    """
    translate is used for Processing per core translation. cores[i] will translate files[i]

    Each thread pins its sockeye process to one core via ``taskset`` and
    writes its translation to a fresh temporary file.

    :param cores: the number of cores used for translation, each core will launch a thread to translate
    :param files: file list to be translated (one open temp file per core)
    :param args: input parameters (module, batch_size)
    :return: list of translated file
    """
    model = args.module
    # Defensive cast: argparse may deliver batch_size as a string, and the
    # "%d" placeholder below requires an int.
    batchsize = int(args.batch_size)

    out_files = []
    threads = []
    for i in range(cores):
        files[i].seek(0)
        temp = tempfile.NamedTemporaryFile()
        command = "taskset -c %d-%d python3 -m sockeye.translate -m %s -i %s -o %s --batch-size %d --output-type benchmark --use-cpu > /dev/null 2>&1 " % (i, i, model, files[i].name, temp.name, batchsize)
        out_files.append(temp)

        t = threading.Thread(target=task, args=(command,))
        threads.append(t)
        t.start()

    # wait for all translations to finish
    for t in threads:
        t.join()

    return out_files

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='MXnet Sockeye cpu_process_per_core_translation.py -m model-name -i file_to_translate -o result_to_save --batch-size 32')
    add_args(parser)
    args = parser.parse_args()
    fileInput = args.input_file
    fileOutput = args.output_file

    # Topology: sockets * physical cores per socket (Linux /proc/cpuinfo).
    socket = int(os.popen("grep -w 'physical id' /proc/cpuinfo | sort -u | wc -l").read().strip())
    cores = int(os.popen("grep -w 'core id' /proc/cpuinfo | sort -u | wc -l").read().strip())
    total_cores = socket * cores
    print(total_cores)

    # BUG FIX: os.system("export ...") only sets the variable in a throwaway
    # subshell and has no effect on this process or its children. Setting
    # os.environ makes the affinity/threading settings visible to the sockeye
    # subprocesses launched later, which inherit this environment.
    os.environ["KMP_AFFINITY"] = "granularity=fine,noduplicates,compact,1,0"
    os.environ["OMP_NUM_THREADS"] = str(total_cores)

    # the total lines of input file will be translated
    with open(fileInput) as f:
        lines = sum(1 for _ in f)
    print('Translating...')
    # clear file for output (os.remove instead of an unquoted shell `rm`)
    if os.path.exists(fileOutput):
        os.remove(fileOutput)
    if args.data_type == "benchmark":
        # every core translates the whole file, so the processed-line count
        # scales with the number of instances
        lines = lines * total_cores
        start = time.time()
        benchmark(total_cores, args)  # returns None; nothing to keep
        end = time.time()
    else:
        splited_files = split_file(total_cores, fileInput, lines)
        start = time.time()
        translated_files = translate(total_cores, splited_files, args)
        end = time.time()
        # merge the per-core pieces, in core order, into the final output
        with open(fileOutput, "ab") as merged:
            for i in range(total_cores):
                with open(translated_files[i].name, "rb") as part:
                    merged.write(part.read())
                splited_files[i].close()
                translated_files[i].close()

    total_time = end - start

    print("Instance nums: %d" % total_cores)
    print("Total Processed lines: %d" % lines)
    print("Total time(s): %.3f s" % total_time)
    print("Speed: %.3f sent/sec" % (lines / total_time))