Skip to content

Commit

Permalink
Started auto data download
Browse files Browse the repository at this point in the history
  • Loading branch information
PJ-Finlay committed Apr 3, 2021
1 parent 6657d5e commit 7d1de98
Show file tree
Hide file tree
Showing 6 changed files with 143 additions and 0 deletions.
38 changes: 38 additions & 0 deletions data/2
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
[
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "de",
"size": "1573438",
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019.",
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_de.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "fr",
"size": 2757884,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019."
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_de.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "es",
"size": 3377912,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019."
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_de.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "ru",
"size": 1661909,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019."
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_de.argosdata"
}
]
19 changes: 19 additions & 0 deletions data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Auto data downloader

### Metadata example
```
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "de",
"size": "1000000",
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019."
}
```

### Overview
Packages are a zip archive with the .argosdata extension. `metadata.json` is in the root like for packages. The data is in two text files `source` and `target`. Each parallel line in each file is a data point.

Optionally data packages can contain a `README` and `LICENSE` file.

2 changes: 2 additions & 0 deletions data/cache/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
### Downloaded data packages are stored here

45 changes: 45 additions & 0 deletions data/download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python3

from pathlib import Path
import json

import requests

class Dataset:
    """A downloadable parallel-corpus data package described by an index entry.

    Instances are populated from one JSON object of ``index.json`` via
    load_metadata_from_json(), then fetched to the local cache with download().
    """

    # Directory where downloaded .argosdata archives are cached.
    CACHE_PATH = Path('cache')

    def load_metadata_from_json(self, metadata):
        """Loads package metadata from a JSON object.

        Args:
            metadata: A dict as produced by json.load for one index entry.
        """
        self.name = metadata.get('name')
        self.type = metadata.get('type')
        self.from_code = metadata.get('from_code')
        self.to_code = metadata.get('to_code')
        self.size = metadata.get('size')
        links = metadata.get('links')
        # index.json stores "links" as a single URL string; normalize to a
        # list so download() can safely index links[0].
        if isinstance(links, str):
            links = [links]
        self.links = links

    def __str__(self):
        # Identifier used as the cache filename stem.
        # Fix: the original joined str(self.name) twice (copy-paste defect).
        return '-'.join([
            str(self.name),
            str(self.type),
            str(self.from_code),
            str(self.to_code)])

    def download(self):
        """Downloads the package if not already cached and returns its path.

        Returns:
            pathlib.Path to the cached .argosdata archive.

        Raises:
            requests.HTTPError: if the server responds with an error status.
        """
        url = self.links[0]
        filename = str(self) + '.argosdata'
        # Fix: CACHE_PATH is a class attribute; the bare name raised NameError.
        filepath = self.CACHE_PATH / filename
        if not filepath.exists():
            r = requests.get(url, allow_redirects=True)
            # Fail loudly instead of caching an HTML error page.
            r.raise_for_status()
            # Context manager guarantees the handle is closed.
            with open(filepath, 'wb') as f:
                f.write(r.content)
        return filepath


# Path to the dataset index consumed by this script.
DATA_INDEX = Path('index.json')

if __name__ == '__main__':
    # Guard so importing this module does not trigger file I/O; the index is
    # only read when the file is executed as a script.
    with open(DATA_INDEX, encoding='utf-8') as data_index:
        index = json.load(data_index)
    print(index)
38 changes: 38 additions & 0 deletions data/index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
[
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "de",
"size": "1573438",
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019.",
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_de.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "fr",
"size": 2757884,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019.",
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_fr.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "es",
"size": 3377912,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019.",
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_es.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "ru",
"size": 1661909,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019.",
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_ru.argosdata"
}
]
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
ctranslate2==1.14.0
emoji==1.2.0
stanza==1.1.1
requests==2.25.1

0 comments on commit 7d1de98

Please sign in to comment.