
Commit

Merge pull request #272 from centre-for-humanities-computing/fix_issues_from_validator

modify timestamps: ongoing. I'm still generating new datasets.
TTTTao725 committed May 24, 2024
2 parents c923e09 + 4bcbedb commit 270180b
Showing 30 changed files with 193 additions and 256 deletions.
117 changes: 91 additions & 26 deletions data-processing/scripts/convert_dagw_to_jsonlgz.py
@@ -14,6 +14,65 @@
from datasets import Dataset, DatasetDict, load_dataset # type: ignore
from typing import Generator
import os
from datetime import datetime
from dateutil.parser import parse


def replace_swedish(timestamp_str):
# Mapping of Swedish day and month names to English
swedish_to_english = {
'mån': 'Mon',
'tis': 'Tue',
'ons': 'Wed',
'tors': 'Thu',
'fre': 'Fri',
'lör': 'Sat',
'sön': 'Sun',
'jan': 'Jan',
'feb': 'Feb',
'mar': 'Mar',
'apr': 'Apr',
'maj': 'May',
'jun': 'Jun',
'jul': 'Jul',
'aug': 'Aug',
'sep': 'Sep',
'okt': 'Oct',
'nov': 'Nov',
'dec': 'Dec'
}
for swedish, english in swedish_to_english.items():
timestamp_str = timestamp_str.replace(swedish, english)
return timestamp_str

def parse_added(timestamp_str):
# Handling NA
if timestamp_str == "NA":
return "2024-05-16" # Set a default value

timestamp_str = replace_swedish(timestamp_str)

formats = [
"%a %b %d %H:%M:%S %Y %z", # Without timezone name, with UTC offset
"%a %b %d %H:%M:%S %Y %Z %z", # With timezone name and UTC offset
"%a %d %b %Y %H:%M:%S %Z %z", # With day first, e.g., ons 13 nov 2019 12:42:34 CET +0100, Swedish abbreviations
"%a %d %b %Y %H:%M:%S %z" # Without timezone name, with UTC offset, day first
]

for fmt in formats:
try:
dt = datetime.strptime(timestamp_str, fmt)
return dt.strftime("%Y-%m-%d")
except ValueError:
continue

# Fallback using dateutil.parser.parse
try:
dt = parse(timestamp_str)
return dt.strftime("%Y-%m-%d")
except ValueError:
raise ValueError(f"Timestamp '{timestamp_str}' does not match any known format.")



def reformat_and_clean_dataset(ds: Dataset, num_proc: int) -> Dataset:
@@ -25,6 +84,11 @@ def reformat_and_clean_dataset(ds: Dataset, num_proc: int) -> Dataset:
ds = ds.rename_column("date_built", "added")
## source --> sub-source
#ds = ds.rename_column("source", "sub-source")
# Reformat 'added' timestamps (e.g. "%a %b %d %H:%M:%S %Y %Z %z") to "%Y-%m-%d"
ds = ds.map(
lambda x: {"added": parse_added(x["added"])}, # type: ignore
num_proc=num_proc,
)

source2domain = {
"retsinformationdk": "Legal",
@@ -59,31 +123,32 @@ def reformat_and_clean_dataset(ds: Dataset, num_proc: int) -> Dataset:
num_proc=num_proc, # type: ignore
)

source2time = {
"retsinformationdk": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"skat": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"retspraksis": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"hest": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"cc": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"adl": "1700-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"botxt": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"danavis": "1999-01-01T00:00:00.000Z, 2004-01-01T00:00:00.000Z",
"dannet": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"depbank": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"ep": "2004-01-01T00:00:00.000Z, 2009-01-01T00:00:00.000Z",
"ft": "2009-01-01T00:00:00.000Z, 2019-01-01T00:00:00.000Z",
"gutenberg": "1700-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"jvj": "1873-01-01T00:00:00.000Z, 1951-01-01T00:00:00.000Z",
"naat": "1930-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"opensub": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"relig": "NA",
"spont": "2019-01-01T00:00:00.000Z, 2020-01-01T00:00:00.000Z",
"synne": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"tv2r": "2015-01-01T00:00:00.000Z, 2020-01-01T00:00:00.000Z",
"wiki": "2019-01-01T00:00:00.000Z, 2021-01-01T00:00:00.000Z",
"wikibooks": "2019-01-01T00:00:00.000Z, 2021-01-01T00:00:00.000Z",
"wikisource": "1700-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"twfv19": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z", # not present in this version of the dataset
source2time = {
"retsinformationdk": "2000-01-01, 2022-01-01",
"skat": "2000-01-01, 2022-01-01",
"retspraksis": "2000-01-01, 2022-01-01",
"hest": "2000-01-01, 2022-01-01",
"cc": "2000-01-01, 2022-01-01",
"adl": "1700-01-01, 2022-01-01",
"botxt": "2000-01-01, 2022-01-01",
"danavis": "1999-01-01, 2004-01-01",
"dannet": "2000-01-01, 2022-01-01",
"depbank": "2000-01-01, 2022-01-01",
"ep": "2004-01-01, 2009-01-01",
"ft": "2009-01-01, 2019-01-01",
"gutenberg": "1700-01-01, 2022-01-01",
"jvj": "1873-01-01, 1951-01-01",
"naat": "1930-01-01, 2022-01-01",
"opensub": "2000-01-01, 2022-01-01",
# "relig": "NA",
"relig" : "1700-01-01, 2022-01-01", # Take a guess instead
"spont": "2019-01-01, 2020-01-01",
"synne": "2000-01-01, 2022-01-01",
"tv2r": "2015-01-01, 2020-01-01",
"wiki": "2019-01-01, 2021-01-01",
"wikibooks": "2019-01-01, 2021-01-01",
"wikisource": "1700-01-01, 2022-01-01",
"twfv19": "2000-01-01, 2022-01-01" # not present in this version of the dataset
}

# add created
@@ -215,7 +280,7 @@ def make_markdown(ds: Dataset, directory: str) -> None:
num_records = ds.num_rows
num_records_category = determine_size_category(num_records)
sample = ds[0]
text = sample["text"][:50].replace("'", "\\'") # Escaping single quotes in the YAML-like example
text = sample["text"].strip()[:50].replace("'", "\\'") # Escaping single quotes in the YAML-like example and strip leading/trailing whitespace
source = sample["source"]
id = sample["id"]
added = sample["added"]
@@ -1,5 +1,6 @@
'''
Converting infomedia dataset.
Filtering out infomedia records whose 'source' field is 'Information', 'Inormation', or 'Information (Papermill)'.
ndjson -> jsonl.gz:
@@ -12,7 +13,7 @@
"metadata": { # OPTIONAL: source-specific metadata
"sub-source": "...", # OPTIONAL: E.g. "newspaper_ocr"
...
}
}
}
'''
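# Illustrative transformed record (a sketch, not part of the committed file;
# all field values below are hypothetical):
#
#     {
#         "id": "article_0001",
#         "text": "…",
#         "source": "danew2.0",
#         "added": "2024-05-16",
#         "created": "2020-03-01, 2021-03-01",
#         "metadata": {"sub-source": "Berlingske"}   # plus any remaining original fields
#     }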

@@ -30,7 +31,7 @@ def format_created_range(publish_date, delay_days=365):
"""Create a formatted string representing a time range starting from `publish_date`."""
start_date = datetime.strptime(publish_date, "%Y-%m-%dT%H:%M:%SZ")
end_date = start_date + timedelta(days=delay_days)
return f"{start_date.strftime('%Y-%m-%dT%H:%M:%SZ')}, {end_date.strftime('%Y-%m-%dT%H:%M:%SZ')}"
return f"{start_date.strftime('%Y-%m-%d')}, {end_date.strftime('%Y-%m-%d')}"

def remove_html_tags(text: str) -> str:
"""Remove HTML tags from a string."""
@@ -44,7 +45,7 @@ def remove_whitespace(text: str) -> str:
clean_text = re.sub(pat_ws, " ", text)
return clean_text

def process_file(filepath):
def process_file(filepath, filter_source=['Information', 'Inormation', 'Information (Papermill)']):
"""Process a single file and write its processed contents to a temporary file."""
# Note: writing to the same file from all workers might cause problems.
articles = []
@@ -55,7 +56,7 @@ def process_file(filepath):
with open(filepath, 'r', encoding='utf-8') as file, open(temp_filename, 'w', encoding='utf-8') as temp_file:
for line in file:
original = json.loads(line)
added = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')
added = datetime.now().strftime('%Y-%m-%d')
# Extract the fields
heading = original.get("Heading", "")
sub_heading = original.get("SubHeading", "")
@@ -77,15 +78,27 @@ def process_file(filepath):
text = remove_html_tags(text)
# Remove excess whitespace
text = remove_whitespace(text)

sub_source = original.get("Source", "")
# Skip records whose sub-source is in the filter list
if sub_source in filter_source:
continue

transformed = {
"id": original.get("ArticleId", ""),
"text": text,
"source": original.get("Source", ""),
"added": added,
"created": format_created_range(original.get("PublishDate", "2000-01-01T00:00:00Z")),
"metadata": {key: value for key, value in original.items() if key not in ["ArticleId", "BodyText", "Source", "PublishDate", "Heading", "SubHeading", "Lead", "Paragraph"]}
}
"id": original.get("ArticleId", ""),
"text": text,
"source": "danew2.0", # Fixed source value
"added": added,
"created": format_created_range(original.get("PublishDate", "2000-01-01T00:00:00Z")),
"metadata": {
"sub-source": sub_source, # Moving original source to metadata
}
}

# Add remaining metadata fields excluding specific ones already extracted
for key, value in original.items():
if key not in ["ArticleId", "BodyText", "Source", "PublishDate", "Heading", "SubHeading", "Lead", "Paragraph"]:
transformed["metadata"][key] = value
json.dump(transformed, temp_file)
# Line break
temp_file.write('\n')
@@ -136,5 +149,5 @@ def main(directory, output_jsonl_gz):

if __name__ == '__main__':
directory = '/work/github/infomedia'
output_jsonl_gz = '/work/dfm-data/pre-training/danews2.0/articles.jsonl.gz'
output_jsonl_gz = '/work/dfm-data/pre-training/danews2.0/documents/danews2.0.jsonl.gz'
main(directory, output_jsonl_gz)
@@ -8,8 +8,8 @@
from tqdm import tqdm
import datetime
import uuid
import nltk
from nltk.tokenize import word_tokenize
import nltk # type: ignore
from nltk.tokenize import word_tokenize # type: ignore

# Ensure nltk punkt tokenizer models are downloaded
nltk.download('punkt', quiet=True)
@@ -33,9 +33,9 @@ def default_converter(o):

def format_created_range(year):
"""Creates a date range string for the document's creation time."""
start_date = f"{year}-01-01T00:00:00.000Z"
start_date = f"{year}-01-01"
end_year = int(year) + 100
end_date = f"{end_year}-01-01T00:00:00.000Z"
end_date = f"{end_year}-01-01"
return f"{start_date}, {end_date}"

def convert_txt_and_metadata(metadata_path, txt_folder_path, output_path):
@@ -59,12 +59,12 @@ def convert_txt_and_metadata(metadata_path, txt_folder_path, output_path):

file_id = str(row.get('file_id')) if pd.notna(row.get('file_id')) else str(uuid.uuid4())
metadata = {key: value for key, value in row.to_dict().items() if key not in ['file_id', 'source']}

metadata["sub-source"] = source_value # Moving original source to metadata
document = {
"id": file_id,
"text": text,
"source": source_value,
"added": datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z'),
"source": "memo", # Fixed source value
"added": datetime.datetime.now().strftime('%Y-%m-%d'),
"created": format_created_range(row.get('year', '1870')),
"metadata": clean_nested_dict(metadata)
}
@@ -82,7 +82,7 @@ def convert_txt_and_metadata(metadata_path, txt_folder_path, output_path):
def main():
metadata_path = '/work/github/Corpus-v1.1/MeMo-corpus-metadata-v1.1-2023-06-20.csv'
txt_folder_path = '/work/github/Corpus-v1.1/normalized'
output_path = '/work/dfm-data/pre-training/memo/normalized_memo.jsonl.gz'
output_path = '/work/dfm-data/pre-training/memo/documents/memo.jsonl.gz'
convert_txt_and_metadata(metadata_path, txt_folder_path, output_path)

if __name__ == "__main__":
11 changes: 6 additions & 5 deletions data-processing/scripts/convert_nordjyllandnews_to_jsonlgz.py
@@ -20,7 +20,7 @@ def format_created_range(created_date, delay_days=365):
"""Create a formatted string representing a time range starting from `publish_date`."""
start_date = datetime.strptime(created_date, "%Y-%m-%dT%H:%M:%S.000Z")
end_date = start_date + timedelta(days=delay_days)
return f"{start_date.strftime('%Y-%m-%dT%H:%M:%S.000Z')}, {end_date.strftime('%Y-%m-%dT%H:%M:%S.000Z')}"
return f"{start_date.strftime('%Y-%m-%d')}, {end_date.strftime('%Y-%m-%d')}"

def parquet_to_jsonlgz(input_path, output_path):
"""Convert .parquet to newline-delimited json.gz."""
@@ -30,21 +30,22 @@ def parquet_to_jsonlgz(input_path, output_path):
transformed = {
"id": f'nordjylland-news{idx}',
"text": row.get("text", ""),
"source": "TV2 Nord",
"added": datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z'),
"source": "nordjylland_news",
"added": datetime.now().strftime('%Y-%m-%d'),
"created": format_created_range(datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')),
"metadata": {
"summary": row.get("summary", ""),
"text_len": row.get("text_len", ""),
"summary_len": row.get("summary_len", "")
"summary_len": row.get("summary_len", ""),
"sub-source": "TV2 Nord"
}
}
json_str = json.dumps(transformed)
out_file.write(json_str + '\n')

def main():
parquet_path = "/work/github/nordjylland-news-summarization/data/train-00000-of-00001-4fb110c0f6314175.parquet"
converted_path = "/work/github/nordjylland-news-summarization/data/converted_train.jsonl.gz"
converted_path = "/work/dfm-data/pre-training/nordjylland_news/documents/nordjylland_news.jsonl.gz"

parquet_to_jsonlgz(parquet_path, converted_path)

@@ -11,20 +11,22 @@

def process_file(file_path):
try:
added_timestamp = datetime.utcfromtimestamp(os.path.getctime(file_path)).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
# added_timestamp = datetime.utcfromtimestamp(os.path.getctime(file_path)).strftime('%Y-%m-%d')
added_timestamp = datetime.now().strftime('%Y-%m-%d')
with open(file_path, 'r', encoding='utf-8') as file:
text_content = file.read()

json_object = {
"id": f"doc_hovedstaden_{os.path.splitext(os.path.basename(file_path))[0]}",
"text": text_content,
"source": "University of Southern Denmark (SDU) & Capital Region",
"source": "scrape_hovedstaden",
"added": added_timestamp,
"created": "2023-11-16T13:44:00+01:00, 2024-04-04T09:09:00+02:00",
"created": "2023-11-16, 2024-04-04",
"metadata": {
"subject": "health",
"language": "danish",
"organization": "The Danish Agency for Digitalisation",
"source-pretty": "University of Southern Denmark (SDU) & Capital Region",
"URL": "https://sprogteknologi.dk/dataset/1076892a-14ee-4f14-a9db-32efb03c40c9"
}
}
@@ -58,7 +60,7 @@ def convert_txt_to_jsonl_gz(directory, output_file):

def main():
directory = '/work/github/capital_region/korpus/renset'
output_file = 'scrape_hovedstaden.jsonl.gz'
output_file = '/work/dfm-data/pre-training/scrape_hovedstaden/documents/scrape_hovedstaden.jsonl.gz'
convert_txt_to_jsonl_gz(directory, output_file)

if __name__ == '__main__':
@@ -72,15 +72,15 @@ def process_file(bz2_file_path):
year = elem.get('year')
datefrom_raw = elem.get('datefrom') if elem.get('datefrom') else f"{year}0101"
dateto_raw = elem.get('dateto') if elem.get('dateto') else f"{year}1231"
datefrom = datetime.strptime(datefrom_raw, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S.000Z")
dateto = datetime.strptime(dateto_raw, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S.000Z")
datefrom = datetime.strptime(datefrom_raw, "%Y%m%d").strftime("%Y-%m-%d")
dateto = datetime.strptime(dateto_raw, "%Y%m%d").strftime("%Y-%m-%d")
created_range = f"{datefrom}, {dateto}"
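# For example (illustrative values), datefrom_raw "19500101" and dateto_raw "19501231"
# yield created_range "1950-01-01, 1950-12-31"; the real values come from the corpus XML.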
added_timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.000Z")
added_timestamp = datetime.now().strftime("%Y-%m-%d")

json_obj = {
"id": corpus_id,
"text": document_text,
"source": "Swedish gigaword",
"source": "swedish_gigaword",
"added": added_timestamp,
"created": created_range,
"metadata": {
@@ -113,7 +113,7 @@ def process_files_in_parallel(directories, output_file_path):

def main():
directories = ['/work/github/swedish_gigaword/gigaword/1950', '/work/github/swedish_gigaword/gigaword/1960', '/work/github/swedish_gigaword/gigaword/1970', '/work/github/swedish_gigaword/gigaword/1980', '/work/github/swedish_gigaword/gigaword/1990', '/work/github/swedish_gigaword/gigaword/2000', '/work/github/swedish_gigaword/gigaword/2010']
output_file_path = '/work/github/swedish_gigaword/gigaword/swedish_gigaword_new.jsonl.gz'
output_file_path = '/work/github/swedish_gigaword/gigaword/swedish_gigaword.jsonl.gz'
process_files_in_parallel(directories, output_file_path)

if __name__ == '__main__':
(The remaining changed files in this commit are not shown here.)
