In [56]:
import json
import os
import pandas as pd
import hashlib
from sklearn.model_selection import train_test_split

In [78]:
# Where the input JSONL file is read from, and where processed files are written
INPUT_FILE = "./data/raw/amharic-news_dataset/amharic_news_classification_dataset.jsonl"
HOME_DIR = "./dataset/processed/msmarco-amharic-news_dataset"

# Create the output directory up front; an already existing directory is fine
os.makedirs(name=HOME_DIR, exist_ok=True)

# Load dataset
def load_dataset(input_file):
    """Read a JSON Lines file into a DataFrame, one row per JSON record.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped rather than handed to ``json.loads``, which
    would raise ``json.JSONDecodeError`` on them.

    Args:
        input_file: Path of a UTF-8 encoded file holding one JSON value per line.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        # `line.strip()` is empty (falsy) for blank lines, so they are ignored.
        records = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(records)
    print(f"Loaded {len(df)} documents.")
    return df

# Parse the configured input file into the working DataFrame
df = load_dataset(input_file=INPUT_FILE)

Loaded 49971 documents.


In [79]:
# Show which columns the loaded frame provides
available_columns = df.columns
print(available_columns)

Index(['headline', 'category', 'date', 'views', 'article', 'link', 'word_len',
       'label'],
      dtype='object')


In [80]:
# Fail fast if any column used by the later cells is absent
required_columns = {
    "article": "Missing column: 'article'",
    "headline": "Missing column: 'headline'",
    "label": "Missing column: 'label'",
}
for column_name, failure_message in required_columns.items():
    assert column_name in df.columns, failure_message

In [81]:
# Count the rows whose headline is missing
missing_headlines = df["headline"].isna().sum()
print(missing_headlines)

0


In [82]:
# Normalise article text: cast to str, trim both ends, then collapse every
# run of whitespace into a single space
trimmed_articles = df["article"].astype(str).str.strip()
df["article"] = trimmed_articles.replace(r"\s+", " ", regex=True)

In [83]:

# MD5 fingerprint helper
def generate_md5(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

# De-duplicate on the MD5 digest of the cleaned article text, keeping the
# first occurrence of each digest
article_digests = df["article"].apply(generate_md5)
is_repeat = article_digests.duplicated(keep="first")
df = df.loc[~is_repeat]

print(f"Removed duplicates. Remaining documents: {len(df)}")


Removed duplicates. Remaining documents: 49839


In [84]:
# Keep only rows whose article and headline lengths, counted in
# whitespace-separated words, fall inside the allowed ranges
article_words = df["article"].str.split().str.len()
headline_words = df["headline"].str.split().str.len()

article_long_enough = article_words > 32    # strictly more than 32 words
article_short_enough = article_words < 256  # strictly fewer than 256 words
headline_short_enough = headline_words < 64  # strictly fewer than 64 words

df = df[article_long_enough & article_short_enough & headline_short_enough]


In [85]:
# Report how many documents survived the length filter
remaining_docs = len(df)
print(f" After filtering, {remaining_docs} documents remain.")


 After filtering, 31148 documents remain.


In [86]:

# Drop rows whose label occurs only once, since a stratified split needs at
# least two rows per label
label_totals = df["label"].value_counts()
rows_label_total = df["label"].map(label_totals)
df = df[rows_label_total > 1]
print(len(df))

31148


In [87]:
# Number the passages 0..N-1 in current row order and build an
# article-text -> passage-id lookup
n_passages = len(df)
df["passage_id"] = list(range(n_passages))
passage_dict = {
    article: pid for article, pid in zip(df["article"], df["passage_id"])
}
print(f"Number of unique passages: {df['passage_id'].nunique()}")
print(df.columns)


Number of unique passages: 31148
Index(['headline', 'category', 'date', 'views', 'article', 'link', 'word_len',
       'label', 'passage_id'],
      dtype='object')


In [88]:
# Peek at the first remaining row, including its new passage_id
first_row = df.head(n=1)
print(first_row)

                                            headline category  \
6  በውድድር ወቅት በወረርሽኙ መከላከያ መመሪያዎች አተገባበር ላይ ስጋት እን...     ስፖርት   

                date views                                            article  \
6  December 30, 2020     3  ብርሃን ፈይሳ አዲስ አበባ፡- ስፖርታዊ እንቅስቃሴና ውድድሮች በሚከናወኑበ...   

                                link  word_len  label  passage_id  
6  https://www.press.et/Ama/?p=38515       233      2           0  


In [89]:
# Save Passage Collection
# Each passage is written twice: as a JSON object {"pid", "text"} per line in
# collection.jsonl, and as "pid<TAB>text" per line in collection.tsv.
collection_jsonl = os.path.join(HOME_DIR, "collection.jsonl")
collection_tsv = os.path.join(HOME_DIR, "collection.tsv")
i = 0  # running count of passages written

# Read the two needed columns directly instead of using DataFrame.iterrows():
# iterrows builds a Series per row and coerces every value in the row to one
# common dtype, whereas tolist() yields plain Python scalars per column.
passage_ids = df["passage_id"].tolist()
passage_texts = df["article"].tolist()

with open(collection_jsonl, "w", encoding="utf-8") as f_jsonl, \
        open(collection_tsv, "w", encoding="utf-8") as f_tsv:
    for pid, text in zip(passage_ids, passage_texts):
        # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes
        json.dump({"pid": pid, "text": text}, f_jsonl, ensure_ascii=False)
        f_jsonl.write("\n")
        f_tsv.write(f"{pid}\t{text}\n")
        i += 1

print("number of unique passages: ", i)
print(f"Collection saved!,({len(df)} passages)")


number of unique passages:  31148
Collection saved!,(31148 passages)


In [90]:
# Number of documents per label before splitting
label_counts = df["label"].value_counts()
print(label_counts)

label
0    12845
2     5584
5     5431
4     5264
3     1949
1       75
Name: count, dtype: int64


In [92]:
# Hold out 10% of the rows as the dev set, stratified on the label and
# seeded so the split is repeatable
split_parts = train_test_split(
    df,
    test_size=0.1,
    stratify=df["label"],
    random_state=42,
)
train_df, dev_df = split_parts
print(f"Train: {len(train_df)} | Dev: {len(dev_df)}")

Train: 28033 | Dev: 3115


In [93]:
# Give each split its own 0-based row index, detached from the pre-split frame
train_df = train_df.reset_index(drop=True).copy()
dev_df = dev_df.reset_index(drop=True).copy()

# Query ids run 0..n_train-1 for train and continue from n_train for dev,
# so the two ranges never overlap
n_train, n_dev = len(train_df), len(dev_df)
train_df["query_id"] = range(0, n_train)
dev_df["query_id"] = range(n_train, n_train + n_dev)
for split_frame in (train_df, dev_df):
    print(split_frame.columns)

Index(['headline', 'category', 'date', 'views', 'article', 'link', 'word_len',
       'label', 'passage_id', 'query_id'],
      dtype='object')
Index(['headline', 'category', 'date', 'views', 'article', 'link', 'word_len',
       'label', 'passage_id', 'query_id'],
      dtype='object')


In [94]:
# Per-label document counts, train first and then dev
for split_frame in (train_df, dev_df):
    print(split_frame["label"].value_counts())

label
0    11560
2     5026
5     4888
4     4738
3     1754
1       67
Name: count, dtype: int64
label
0    1285
2     558
5     543
4     526
3     195
1       8
Name: count, dtype: int64


In [95]:
# Per-label proportions, to compare the class balance of the two splits
labelled_splits = (
    ("Train Label Distribution:\n", train_df),
    ("Dev Label Distribution:\n", dev_df),
)
for heading, split_frame in labelled_splits:
    print(heading, split_frame["label"].value_counts(normalize=True))


Train Label Distribution:
 label
0    0.412371
2    0.179289
5    0.174366
4    0.169015
3    0.062569
1    0.002390
Name: proportion, dtype: float64
Dev Label Distribution:
 label
0    0.412520
2    0.179133
5    0.174318
4    0.168860
3    0.062600
1    0.002568
Name: proportion, dtype: float64


In [96]:
def clean_query(text):
    """Normalise a headline into a single-line, single-spaced query string.

    Every run of whitespace (spaces, tabs, ``\\n``, ``\\r`` and other
    whitespace characters) is collapsed to one space and the ends are
    trimmed. The result therefore never contains a tab or line break.

    Args:
        text: The raw headline. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The normalised headline, or the placeholder ``"UNKNOWN_QUERY"`` when
        ``text`` is not a string or contains only whitespace.
    """
    if not isinstance(text, str):
        return "UNKNOWN_QUERY"
    # str.split() with no argument splits on any whitespace run and discards
    # leading/trailing whitespace, so the join both collapses and trims.
    cleaned = " ".join(text.split())
    return cleaned or "UNKNOWN_QUERY"

In [97]:
def save_queries_and_qrels(dataframe, split_name):
    """Write the queries and relevance judgements of one split into HOME_DIR.

    Three files are produced, each with one line per row of ``dataframe``:

    * ``queries_{split_name}.jsonl``: ``{"query_id": ..., "headline": ...}``
    * ``queries_{split_name}.tsv``: ``query_id<TAB>headline``
    * ``qrels_{split_name}.tsv``: ``query_id<TAB>0<TAB>passage_id<TAB>1``

    The headline is passed through ``clean_query`` before it is written. The
    ``0`` and ``1`` qrels fields are constants.
    NOTE(review): this looks like TREC-style qrels (iteration, relevance);
    confirm against whatever consumes these files.

    Args:
        dataframe: Frame with ``query_id``, ``passage_id`` and ``headline``
            columns.
        split_name: Suffix used in the three output file names.

    Returns:
        None. A confirmation line is printed once the files are written.
    """
    queries_jsonl = os.path.join(HOME_DIR, f"queries_{split_name}.jsonl")
    queries_tsv = os.path.join(HOME_DIR, f"queries_{split_name}.tsv")
    qrels_tsv = os.path.join(HOME_DIR, f"qrels_{split_name}.tsv")

    # Read the needed columns directly instead of using DataFrame.iterrows():
    # iterrows builds a Series per row and coerces every value in the row to
    # one common dtype, whereas tolist() yields plain Python scalars that
    # json.dump and the f-strings below render per column.
    query_ids = dataframe["query_id"].tolist()
    passage_ids = dataframe["passage_id"].tolist()
    headlines = dataframe["headline"].tolist()

    with open(queries_jsonl, "w", encoding="utf-8") as f_jsonl, \
            open(queries_tsv, "w", encoding="utf-8") as f_tsv, \
            open(qrels_tsv, "w", encoding="utf-8") as f_qrels:
        for query_id, passage_id, headline in zip(query_ids, passage_ids, headlines):
            cleaned_query = clean_query(headline)
            # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes
            json.dump({"query_id": query_id, "headline": cleaned_query}, f_jsonl, ensure_ascii=False)
            f_jsonl.write("\n")
            f_tsv.write(f"{query_id}\t{cleaned_query}\n")
            f_qrels.write(f"{query_id}\t0\t{passage_id}\t1\n")

    print(f"Saved {split_name} queries and qrels.")

In [98]:
# Each id column must hold as many distinct values as its frame has rows
uniqueness_checks = (
    (train_df, "query_id", " Duplicate queries in train set!"),
    (dev_df, "query_id", " Duplicate queries in dev set!"),
    (df, "passage_id", " Duplicate passages in dataset!"),
)
for frame, id_column, failure_message in uniqueness_checks:
    assert frame[id_column].nunique() == len(frame), failure_message

print("No duplicate queries or passages found!")


No duplicate queries or passages found!


In [99]:
# Write the query and qrels files for each split, train first
for split_name, split_frame in (("train", train_df), ("dev", dev_df)):
    save_queries_and_qrels(split_frame, split_name)


Saved train queries and qrels.
Saved dev queries and qrels.
