In [None]:
import pandas as pd
from pathlib import Path

# Point to your folder of MADAR .tsv files
data_dir = Path("/path/to/MADAR_Corpus")

# Canonical names given to the first four columns of every file, in file order.
MADAR_COLUMNS = ["sentID.BTEC", "split", "lang", "sent"]

# Sort the matches: glob() yields files in an arbitrary, filesystem-dependent
# order, which would make the row order of madar_long differ between runs
# and machines.
tsv_paths = sorted(data_dir.glob("*.tsv"))
if not tsv_paths:
    # Fail early with the folder name instead of letting pd.concat([]) raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(f"No .tsv files found in {data_dir}")

# Read all TSVs into one dataframe
dfs = []
for tsv_path in tsv_paths:
    # dtype=str + keep_default_na=False keep every cell as literal text, so
    # strings such as "NA" or "null" are not converted to NaN. Empty fields
    # therefore stay as "" (not NaN) as well.
    # NOTE(review): read_csv applies its default quote handling here; if
    # sentences can contain a literal double quote, consider passing
    # quoting=csv.QUOTE_NONE -- confirm against the corpus files.
    frame = pd.read_csv(tsv_path, sep="\t", dtype=str, keep_default_na=False)

    if frame.shape[1] < len(MADAR_COLUMNS):
        raise ValueError(
            f"{tsv_path} has {frame.shape[1]} column(s); "
            f"expected at least {len(MADAR_COLUMNS)}"
        )

    # Normalize column names: the first four columns are renamed by position,
    # whatever header the file used; any further columns are left untouched.
    frame = frame.rename(
        columns=dict(zip(frame.columns[:len(MADAR_COLUMNS)], MADAR_COLUMNS))
    )
    dfs.append(frame)

# Stack all files into one long table with a fresh 0..n-1 index.
madar_long = pd.concat(dfs, ignore_index=True)

# Show a peek
madar_long.head()

In [None]:
# Number of rows (non-null sentence IDs) recorded for each dialect/language,
# shown from the largest group to the smallest.
rows_by_lang = madar_long.groupby("lang")["sentID.BTEC"].count()
rows_by_lang.sort_values(ascending=False)

In [None]:
# Which splits exist, and how many distinct sentence IDs does each one hold?
ids_by_split = madar_long.groupby("split")["sentID.BTEC"]
ids_by_split.nunique()

In [None]:
# Long -> wide: one row per (sentID.BTEC, split) pair, one column per lang,
# with the sentence text as the cell value.
row_keys = ["sentID.BTEC", "split"]

# aggfunc="first" resolves repeated (sentID.BTEC, split, lang) entries by
# keeping a single sentence instead of raising.
sent_by_lang = madar_long.pivot_table(
    index=row_keys,
    columns="lang",
    values="sent",
    aggfunc="first",
)

# Turn the two key levels back into ordinary columns.
madar_wide = sent_by_lang.reset_index()

madar_wide.head()


In [None]:
# Keep only the rows in which no column is NaN, i.e. the sentence is present
# for every language column.
madar_complete = madar_wide.dropna(axis=0, how="any")

# Report how many rows survive the filter.
n_total = len(madar_wide)
n_complete = len(madar_complete)
print(n_total, "total rows")
print(n_complete, "rows with all languages present")

madar_complete.head()

In [None]:
# Column order: the two key columns, then EN and FR (when present), then every
# remaining column in alphabetical order.
cols = ["sentID.BTEC", "split"] + [
    lang for lang in ("EN", "FR") if lang in madar_wide.columns
]

# Everything not placed yet, still in the order it appears in madar_wide.
other_cols = [c for c in madar_wide.columns if c not in cols]
cols = cols + sorted(other_cols)

# Work on a copy so madar_wide itself is left untouched, then apply the order.
madar_reordered = madar_wide.copy()[cols]

# Reordered table restricted to rows that have no missing value.
madar_complete = madar_reordered.dropna()

# Show every column in Jupyter output instead of truncating wide frames.
pd.set_option("display.max_columns", None)

madar_complete.head()


In [None]:
# Write the complete table to disk as a UTF-8, tab-separated file; the
# DataFrame index is not written.
madar_complete.to_csv(
    "madar_complete.tsv",
    sep="\t",
    index=False,
    encoding="utf-8",
)

n_saved = len(madar_complete)
print("Saved", n_saved, "rows to madar_complete.tsv")