
Commit

Merge pull request #272 from centre-for-humanities-computing/fix_issues_from_validator

modify timestamps: ongoing. I'm still generating new datasets.
TTTTao725 committed May 24, 2024
2 parents c923e09 + 4bcbedb commit 270180b
Showing 30 changed files with 193 additions and 256 deletions.
117 changes: 91 additions & 26 deletions data-processing/scripts/convert_dagw_to_jsonlgz.py
@@ -14,6 +14,65 @@
from datasets import Dataset, DatasetDict, load_dataset # type: ignore
from typing import Generator
import os
from datetime import datetime
from dateutil.parser import parse


def replace_swedish(timestamp_str):
# Mapping of Swedish day and month names to English
swedish_to_english = {
'mån': 'Mon',
'tis': 'Tue',
'ons': 'Wed',
'tors': 'Thu',
'fre': 'Fri',
'lör': 'Sat',
'sön': 'Sun',
'jan': 'Jan',
'feb': 'Feb',
'mar': 'Mar',
'apr': 'Apr',
'maj': 'May',
'jun': 'Jun',
'jul': 'Jul',
'aug': 'Aug',
'sep': 'Sep',
'okt': 'Oct',
'nov': 'Nov',
'dec': 'Dec'
}
for swedish, english in swedish_to_english.items():
timestamp_str = timestamp_str.replace(swedish, english)
return timestamp_str

def parse_added(timestamp_str):
# Handling NA
if timestamp_str == "NA":
return "2024-05-16" # Set a default value

timestamp_str = replace_swedish(timestamp_str)

formats = [
"%a %b %d %H:%M:%S %Y %z", # Without timezone name, with UTC offset
"%a %b %d %H:%M:%S %Y %Z %z", # With timezone name and UTC offset
"%a %d %b %Y %H:%M:%S %Z %z", # With day first, e.g., ons 13 nov 2019 12:42:34 CET +0100, Swedish abbreviations
"%a %d %b %Y %H:%M:%S %z" # Without timezone name, with UTC offset, day first
]

for fmt in formats:
try:
dt = datetime.strptime(timestamp_str, fmt)
return dt.strftime("%Y-%m-%d")
except ValueError:
continue

# Fallback using dateutil.parser.parse
try:
dt = parse(timestamp_str)
return dt.strftime("%Y-%m-%d")
except ValueError:
raise ValueError(f"Timestamp '{timestamp_str}' does not match any known format.")



def reformat_and_clean_dataset(ds: Dataset, num_proc: int) -> Dataset:
@@ -25,6 +84,11 @@ def reformat_and_clean_dataset(ds: Dataset, num_proc: int) -> Dataset:
ds = ds.rename_column("date_built", "added")
## source --> sub-source
#ds = ds.rename_column("source", "sub-source")
# Reformat 'added' timestamps (e.g. "%a %b %d %H:%M:%S %Y %Z %z") to "%Y-%m-%d"
ds = ds.map(
lambda x: {"added": parse_added(x["added"])}, # type: ignore
num_proc=num_proc,
)

source2domain = {
"retsinformationdk": "Legal",
@@ -59,31 +123,32 @@ def reformat_and_clean_dataset(ds: Dataset, num_proc: int) -> Dataset:
num_proc=num_proc, # type: ignore
)

source2time = {
"retsinformationdk": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"skat": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"retspraksis": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"hest": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"cc": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"adl": "1700-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"botxt": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"danavis": "1999-01-01T00:00:00.000Z, 2004-01-01T00:00:00.000Z",
"dannet": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"depbank": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"ep": "2004-01-01T00:00:00.000Z, 2009-01-01T00:00:00.000Z",
"ft": "2009-01-01T00:00:00.000Z, 2019-01-01T00:00:00.000Z",
"gutenberg": "1700-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"jvj": "1873-01-01T00:00:00.000Z, 1951-01-01T00:00:00.000Z",
"naat": "1930-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"opensub": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"relig": "NA",
"spont": "2019-01-01T00:00:00.000Z, 2020-01-01T00:00:00.000Z",
"synne": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"tv2r": "2015-01-01T00:00:00.000Z, 2020-01-01T00:00:00.000Z",
"wiki": "2019-01-01T00:00:00.000Z, 2021-01-01T00:00:00.000Z",
"wikibooks": "2019-01-01T00:00:00.000Z, 2021-01-01T00:00:00.000Z",
"wikisource": "1700-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z",
"twfv19": "2000-01-01T00:00:00.000Z, 2022-01-01T00:00:00.000Z", # not present in this version of the dataset
source2time = {
"retsinformationdk": "2000-01-01, 2022-01-01",
"skat": "2000-01-01, 2022-01-01",
"retspraksis": "2000-01-01, 2022-01-01",
"hest": "2000-01-01, 2022-01-01",
"cc": "2000-01-01, 2022-01-01",
"adl": "1700-01-01, 2022-01-01",
"botxt": "2000-01-01, 2022-01-01",
"danavis": "1999-01-01, 2004-01-01",
"dannet": "2000-01-01, 2022-01-01",
"depbank": "2000-01-01, 2022-01-01",
"ep": "2004-01-01, 2009-01-01",
"ft": "2009-01-01, 2019-01-01",
"gutenberg": "1700-01-01, 2022-01-01",
"jvj": "1873-01-01, 1951-01-01",
"naat": "1930-01-01, 2022-01-01",
"opensub": "2000-01-01, 2022-01-01",
# "relig": "NA",
"relig" : "1700-01-01, 2022-01-01", # Take a guess instead
"spont": "2019-01-01, 2020-01-01",
"synne": "2000-01-01, 2022-01-01",
"tv2r": "2015-01-01, 2020-01-01",
"wiki": "2019-01-01, 2021-01-01",
"wikibooks": "2019-01-01, 2021-01-01",
"wikisource": "1700-01-01, 2022-01-01",
"twfv19": "2000-01-01, 2022-01-01" # not present in this version of the dataset
}

# add created
@@ -215,7 +280,7 @@ def make_markdown(ds: Dataset, directory: str) -> None:
num_records = ds.num_rows
num_records_category = determine_size_category(num_records)
sample = ds[0]
text = sample["text"][:50].replace("'", "\\'") # Escaping single quotes in the YAML-like example
text = sample["text"].strip()[:50].replace("'", "\\'") # Escaping single quotes in the YAML-like example and strip leading/trailing whitespace
source = sample["source"]
id = sample["id"]
added = sample["added"]
@@ -1,5 +1,6 @@
'''
Converting infomedia dataset.
Filtering out infomedia records whose 'source' field is 'Information', 'Inormation', or 'Information (Papermill)'.
ndjson -> jsonl.gz:
@@ -12,7 +13,7 @@
"metadata": { # OPTIONAL: source-specific metadata
"sub-source": "...", # OPTIONAL: E.g. "newspaper_ocr"
...
}
}
}
'''
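# Illustrative transformed record (a sketch, not part of the committed file;
# all field values below are hypothetical):
#
#     {
#         "id": "article_0001",
#         "text": "…",
#         "source": "danew2.0",
#         "added": "2024-05-16",
#         "created": "2020-03-01, 2021-03-01",
#         "metadata": {"sub-source": "Berlingske"}   # plus any remaining original fields
#     }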

@@ -30,7 +31,7 @@ def format_created_range(publish_date, delay_days=365):
"""Create a formatted string representing a time range starting from `publish_date`."""
start_date = datetime.strptime(publish_date, "%Y-%m-%dT%H:%M:%SZ")
end_date = start_date + timedelta(days=delay_days)
return f"{start_date.strftime('%Y-%m-%dT%H:%M:%SZ')}, {end_date.strftime('%Y-%m-%dT%H:%M:%SZ')}"
return f"{start_date.strftime('%Y-%m-%d')}, {end_date.strftime('%Y-%m-%d')}"

def remove_html_tags(text: str) -> str:
"""Remove HTML tags from a string."""
@@ -44,7 +45,7 @@ def remove_whitespace(text: str) -> str:
clean_text = re.sub(pat_ws, " ", text)
return clean_text

def process_file(filepath):
def process_file(filepath, filter_source=['Information', 'Inormation', 'Information (Papermill)']):
"""Process a single file and write its processed contents to a temporary file."""
# Note: writing to the same file from all workers might cause problems.
articles = []
@@ -55,7 +56,7 @@ def process_file(filepath):
with open(filepath, 'r', encoding='utf-8') as file, open(temp_filename, 'w', encoding='utf-8') as temp_file:
for line in file:
original = json.loads(line)
added = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')
added = datetime.now().strftime('%Y-%m-%d')
# Extract the fields
heading = original.get("Heading", "")
sub_heading = original.get("SubHeading", "")
@@ -77,15 +78,27 @@ def process_file(filepath):
text = remove_html_tags(text)
# Remove excess whitespace
text = remove_whitespace(text)

sub_source = original.get("Source", "")
# Skip records whose sub-source is in the filter list
if sub_source in filter_source:
continue

transformed = {
"id": original.get("ArticleId", ""),
"text": text,
"source": original.get("Source", ""),
"added": added,
"created": format_created_range(original.get("PublishDate", "2000-01-01T00:00:00Z")),
"metadata": {key: value for key, value in original.items() if key not in ["ArticleId", "BodyText", "Source", "PublishDate", "Heading", "SubHeading", "Lead", "Paragraph"]}
}
"id": original.get("ArticleId", ""),
"text": text,
"source": "danew2.0", # Fixed source value
"added": added,
"created": format_created_range(original.get("PublishDate", "2000-01-01T00:00:00Z")),
"metadata": {
"sub-source": sub_source, # Moving original source to metadata
}
}

# Add remaining metadata fields excluding specific ones already extracted
for key, value in original.items():
if key not in ["ArticleId", "BodyText", "Source", "PublishDate", "Heading", "SubHeading", "Lead", "Paragraph"]:
transformed["metadata"][key] = value
json.dump(transformed, temp_file)
# Line break
temp_file.write('\n')
@@ -136,5 +149,5 @@ def main(directory, output_jsonl_gz):

if __name__ == '__main__':
directory = '/work/github/infomedia'
output_jsonl_gz = '/work/dfm-data/pre-training/danews2.0/articles.jsonl.gz'
output_jsonl_gz = '/work/dfm-data/pre-training/danews2.0/documents/danews2.0.jsonl.gz'
main(directory, output_jsonl_gz)
@@ -8,8 +8,8 @@
from tqdm import tqdm
import datetime
import uuid
import nltk
from nltk.tokenize import word_tokenize
import nltk # type: ignore
from nltk.tokenize import word_tokenize # type: ignore

# Ensure nltk punkt tokenizer models are downloaded
nltk.download('punkt', quiet=True)
@@ -33,9 +33,9 @@ def default_converter(o):

def format_created_range(year):
"""Creates a date range string for the document's creation time."""
start_date = f"{year}-01-01T00:00:00.000Z"
start_date = f"{year}-01-01"
end_year = int(year) + 100
end_date = f"{end_year}-01-01T00:00:00.000Z"
end_date = f"{end_year}-01-01"
return f"{start_date}, {end_date}"

def convert_txt_and_metadata(metadata_path, txt_folder_path, output_path):
@@ -59,12 +59,12 @@ def convert_txt_and_metadata(metadata_path, txt_folder_path, output_path):

file_id = str(row.get('file_id')) if pd.notna(row.get('file_id')) else str(uuid.uuid4())
metadata = {key: value for key, value in row.to_dict().items() if key not in ['file_id', 'source']}

metadata["sub-source"] = source_value # Moving original source to metadata
document = {
"id": file_id,
"text": text,
"source": source_value,
"added": datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z'),
"source": "memo", # Fixed source value
"added": datetime.datetime.now().strftime('%Y-%m-%d'),
"created": format_created_range(row.get('year', '1870')),
"metadata": clean_nested_dict(metadata)
}
@@ -82,7 +82,7 @@ def convert_txt_and_metadata(metadata_path, txt_folder_path, output_path):
def main():
metadata_path = '/work/github/Corpus-v1.1/MeMo-corpus-metadata-v1.1-2023-06-20.csv'
txt_folder_path = '/work/github/Corpus-v1.1/normalized'
output_path = '/work/dfm-data/pre-training/memo/normalized_memo.jsonl.gz'
output_path = '/work/dfm-data/pre-training/memo/documents/memo.jsonl.gz'
convert_txt_and_metadata(metadata_path, txt_folder_path, output_path)

if __name__ == "__main__":
11 changes: 6 additions & 5 deletions data-processing/scripts/convert_nordjyllandnews_to_jsonlgz.py
@@ -20,7 +20,7 @@ def format_created_range(created_date, delay_days=365):
"""Create a formatted string representing a time range starting from `publish_date`."""
start_date = datetime.strptime(created_date, "%Y-%m-%dT%H:%M:%S.000Z")
end_date = start_date + timedelta(days=delay_days)
return f"{start_date.strftime('%Y-%m-%dT%H:%M:%S.000Z')}, {end_date.strftime('%Y-%m-%dT%H:%M:%S.000Z')}"
return f"{start_date.strftime('%Y-%m-%d')}, {end_date.strftime('%Y-%m-%d')}"

def parquet_to_jsonlgz(input_path, output_path):
"""Convert .parquet to newline-delimited json.gz."""
@@ -30,21 +30,22 @@ def parquet_to_jsonlgz(input_path, output_path):
transformed = {
"id": f'nordjylland-news{idx}',
"text": row.get("text", ""),
"source": "TV2 Nord",
"added": datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z'),
"source": "nordjylland_news",
"added": datetime.now().strftime('%Y-%m-%d'),
"created": format_created_range(datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')),
"metadata": {
"summary": row.get("summary", ""),
"text_len": row.get("text_len", ""),
"summary_len": row.get("summary_len", "")
"summary_len": row.get("summary_len", ""),
"sub-source": "TV2 Nord"
}
}
json_str = json.dumps(transformed)
out_file.write(json_str + '\n')

def main():
parquet_path = "/work/github/nordjylland-news-summarization/data/train-00000-of-00001-4fb110c0f6314175.parquet"
converted_path = "/work/github/nordjylland-news-summarization/data/converted_train.jsonl.gz"
converted_path = "/work/dfm-data/pre-training/nordjylland_news/documents/nordjylland_news.jsonl.gz"

parquet_to_jsonlgz(parquet_path, converted_path)

@@ -11,20 +11,22 @@

def process_file(file_path):
try:
added_timestamp = datetime.utcfromtimestamp(os.path.getctime(file_path)).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
# added_timestamp = datetime.utcfromtimestamp(os.path.getctime(file_path)).strftime('%Y-%m-%d')
added_timestamp = datetime.now().strftime('%Y-%m-%d')
with open(file_path, 'r', encoding='utf-8') as file:
text_content = file.read()

json_object = {
"id": f"doc_hovedstaden_{os.path.splitext(os.path.basename(file_path))[0]}",
"text": text_content,
"source": "University of Southern Denmark (SDU) & Capital Region",
"source": "scrape_hovedstaden",
"added": added_timestamp,
"created": "2023-11-16T13:44:00+01:00, 2024-04-04T09:09:00+02:00",
"created": "2023-11-16, 2024-04-04",
"metadata": {
"subject": "health",
"language": "danish",
"organization": "The Danish Agency for Digitalisation",
"source-pretty": "University of Southern Denmark (SDU) & Capital Region",
"URL": "https://sprogteknologi.dk/dataset/1076892a-14ee-4f14-a9db-32efb03c40c9"
}
}
@@ -58,7 +60,7 @@ def convert_txt_to_jsonl_gz(directory, output_file):

def main():
directory = '/work/github/capital_region/korpus/renset'
output_file = 'scrape_hovedstaden.jsonl.gz'
output_file = '/work/dfm-data/pre-training/scrape_hovedstaden/documents/scrape_hovedstaden.jsonl.gz'
convert_txt_to_jsonl_gz(directory, output_file)

if __name__ == '__main__':
@@ -72,15 +72,15 @@ def process_file(bz2_file_path):
year = elem.get('year')
datefrom_raw = elem.get('datefrom') if elem.get('datefrom') else f"{year}0101"
dateto_raw = elem.get('dateto') if elem.get('dateto') else f"{year}1231"
datefrom = datetime.strptime(datefrom_raw, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S.000Z")
dateto = datetime.strptime(dateto_raw, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S.000Z")
datefrom = datetime.strptime(datefrom_raw, "%Y%m%d").strftime("%Y-%m-%d")
dateto = datetime.strptime(dateto_raw, "%Y%m%d").strftime("%Y-%m-%d")
created_range = f"{datefrom}, {dateto}"
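# For example (illustrative values), datefrom_raw "19500101" and dateto_raw "19501231"
# yield created_range "1950-01-01, 1950-12-31"; the real values come from the corpus XML.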
added_timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.000Z")
added_timestamp = datetime.now().strftime("%Y-%m-%d")

json_obj = {
"id": corpus_id,
"text": document_text,
"source": "Swedish gigaword",
"source": "swedish_gigaword",
"added": added_timestamp,
"created": created_range,
"metadata": {
@@ -113,7 +113,7 @@ def process_files_in_parallel(directories, output_file_path):

def main():
directories = ['/work/github/swedish_gigaword/gigaword/1950', '/work/github/swedish_gigaword/gigaword/1960', '/work/github/swedish_gigaword/gigaword/1970', '/work/github/swedish_gigaword/gigaword/1980', '/work/github/swedish_gigaword/gigaword/1990', '/work/github/swedish_gigaword/gigaword/2000', '/work/github/swedish_gigaword/gigaword/2010']
output_file_path = '/work/github/swedish_gigaword/gigaword/swedish_gigaword_new.jsonl.gz'
output_file_path = '/work/github/swedish_gigaword/gigaword/swedish_gigaword.jsonl.gz'
process_files_in_parallel(directories, output_file_path)

if __name__ == '__main__':
(The remaining changed files in this commit are not shown here.)
