Skip to content

Commit

Permalink
Started auto data download
Browse files Browse the repository at this point in the history
  • Loading branch information
PJ-Finlay committed Apr 3, 2021
1 parent 6657d5e commit 7d1de98
Show file tree
Hide file tree
Showing 6 changed files with 143 additions and 0 deletions.
38 changes: 38 additions & 0 deletions data/2
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
[
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "de",
"size": "1573438",
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019.",
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_de.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "fr",
"size": 2757884,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019."
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_de.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "es",
"size": 3377912,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019."
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_de.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "ru",
"size": 1661909,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019."
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_de.argosdata"
}
]
19 changes: 19 additions & 0 deletions data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Auto data downloader

### Metadata example
```
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "de",
"size": "1000000",
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019."
}
```

### Overview
Packages are a zip archive with the .argosdata extension. `metadata.json` is in the root like for packages. The data is in two text files `source` and `target`. Each parallel line in each file is a data point.

Optionally data packages can contain a `README` and `LICENSE` file.

2 changes: 2 additions & 0 deletions data/cache/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
### Downloaded data packages are stored here

45 changes: 45 additions & 0 deletions data/download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python3

from pathlib import Path
import json

import requests

class Dataset:
    """A downloadable parallel-corpus data package described by an index entry.

    Instances are populated from one JSON object of ``index.json`` via
    load_metadata_from_json(), then fetched to the local cache with download().
    """

    # Directory where downloaded .argosdata archives are cached.
    CACHE_PATH = Path('cache')

    def load_metadata_from_json(self, metadata):
        """Loads package metadata from a JSON object.

        Args:
            metadata: A dict as produced by json.load for one index entry.
        """
        self.name = metadata.get('name')
        self.type = metadata.get('type')
        self.from_code = metadata.get('from_code')
        self.to_code = metadata.get('to_code')
        self.size = metadata.get('size')
        links = metadata.get('links')
        # index.json stores "links" as a single URL string; normalize to a
        # list so download() can safely index links[0].
        if isinstance(links, str):
            links = [links]
        self.links = links

    def __str__(self):
        # Identifier used as the cache filename stem.
        # Fix: the original joined str(self.name) twice (copy-paste defect).
        return '-'.join([
            str(self.name),
            str(self.type),
            str(self.from_code),
            str(self.to_code)])

    def download(self):
        """Downloads the package if not already cached and returns its path.

        Returns:
            pathlib.Path to the cached .argosdata archive.

        Raises:
            requests.HTTPError: if the server responds with an error status.
        """
        url = self.links[0]
        filename = str(self) + '.argosdata'
        # Fix: CACHE_PATH is a class attribute; the bare name raised NameError.
        filepath = self.CACHE_PATH / filename
        if not filepath.exists():
            r = requests.get(url, allow_redirects=True)
            # Fail loudly instead of caching an HTML error page.
            r.raise_for_status()
            # Context manager guarantees the handle is closed.
            with open(filepath, 'wb') as f:
                f.write(r.content)
        return filepath


# Path to the dataset index consumed by this script.
DATA_INDEX = Path('index.json')

if __name__ == '__main__':
    # Guard so importing this module does not trigger file I/O; the index is
    # only read when the file is executed as a script.
    with open(DATA_INDEX, encoding='utf-8') as data_index:
        index = json.load(data_index)
    print(index)
38 changes: 38 additions & 0 deletions data/index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
[
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "de",
"size": "1573438",
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019.",
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_de.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "fr",
"size": 2757884,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019.",
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_fr.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "es",
"size": 3377912,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019.",
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_es.argosdata"
},
{
"name": "WikiMatrix",
"type": "parallel_corpus",
"from_code": "en",
"to_code": "ru",
"size": 1661909,
"reference": "Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia, arXiv, July 11 2019.",
"links": "https://storage.googleapis.com/argosopentech-data/wikimatrix-en_ru.argosdata"
}
]
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
ctranslate2==1.14.0
emoji==1.2.0
stanza==1.1.1
requests==2.25.1

0 comments on commit 7d1de98

Please sign in to comment.