The original dataset was published as a compressed tarball with several files per language. Each line of those files contains a JSON object. That isn't an issue for data scientists, but it makes the dataset hard to use for other purposes, such as benchmarks for string-processing libraries. This script joins, cleans, shuffles, and converts the dataset into a single CSV file.

In [None]:
import os
import pandas as pd

# Folder that is scanned for the `.jsonl` input files.
directory = './XLSum_complete_v1.0/'
# Path of the single combined CSV file that gets written at the end.
output_file = './xlsum.csv'

def extract_language(filename: str) -> str:
    """Return the language part of a dataset file name.

    File names are assumed to look like ``<language>_<split>.jsonl``
    (for example ``english_train.jsonl``) -- TODO confirm against the
    actual archive contents. Only the trailing ``_<split>...`` piece is
    removed, so languages whose own name contains an underscore, such as
    ``chinese_simplified`` or ``serbian_cyrillic``, keep their full name
    instead of being truncated to their first word.

    A name that contains no underscore is returned unchanged.
    """
    # Split once from the right: everything before the last underscore is
    # the language, everything after it is the split name plus extension.
    return filename.rsplit('_', 1)[0]

In [None]:
# Accumulates one DataFrame per input file
dataframes = []

# Walk the directory entries, ignoring anything that is not a JSONL file
for filename in os.listdir(directory):
    if not filename.endswith('.jsonl'):
        continue
    # Parse the file as line-delimited JSON (one record per line)
    df = pd.read_json(os.path.join(directory, filename), lines=True)
    # Tag every row with the language derived from the file name
    df['language'] = extract_language(filename)
    dataframes.append(df)

In [None]:
# Preview the first rows of every loaded file
[frame.head() for frame in dataframes]

In [None]:
# Concatenate all dataframes. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each file's own row numbers are kept and the
# combined index contains duplicate labels.
combined_df = pd.concat(dataframes, ignore_index=True)
# Preview up to 100 random rows. Capping at the row count avoids the
# ValueError that sample() raises when asked for more rows than exist.
combined_df.sample(min(100, len(combined_df)))

In [None]:

# Randomise the row order: frac=1 draws every row, and the index is then
# rebuilt (the old one is dropped) so it runs 0..n-1 again.
combined_df = (
    combined_df
    .sample(frac=1)
    .reset_index(drop=True)
)

# Save the shuffled table as CSV, leaving the index out of the file
combined_df.to_csv(output_file, index=False)

print(f"Data written to {output_file}")