# raw to text data

**Note: This notebook is used in automation**

This notebook pre-processes the raw mbox files produced by `gz_to_raw.ipynb` and converts them to csv files that contain only the body text, date, and message ID of each email.

When run directly, it will convert all mbox files in `raw/fedora-devel-list/` and save them as csv files in `interim/text/`.

When run as part of the automation workflow, it will only pre-process the last full month's worth of data and upload it to remote storage for later use.


In [2]:
import mailbox
import os
import re
import datetime
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import sys

load_dotenv("../../.env")
sys.path.append("../..")
from src import utils  # noqa

In [3]:
# Build the list of mbox file names to process: every mbox in the raw
# dataset directory, or only last month's file when running in automation.

BASE_PATH = os.getenv("LOCAL_DATA_PATH", "../../data")
path = Path(BASE_PATH) / "raw/fedora-devel-list"

# Default to every mbox; automation narrows the pattern to a single month.
mbox_pattern = "*.mbox"

if os.getenv("RUN_IN_AUTOMATION"):
    # Stepping one day back from the first of the current month always
    # lands on the last day of the previous month.
    first_of_this_month = datetime.datetime.now().replace(day=1)
    LAST_MONTH_DATE = first_of_this_month - datetime.timedelta(days=1)
    y, m = LAST_MONTH_DATE.year, LAST_MONTH_DATE.month
    mbox_pattern = f"*{y}-{m}.mbox"

mboxes = [mbox_file.name for mbox_file in path.glob(mbox_pattern)]

In [4]:
# Define a function to convert mbox data into row,column format for analysis
# using pandas


def mbox_to_text(mbox):
    """Convert an iterable of email messages into a DataFrame.

    Parameters
    ----------
    mbox : iterable of email.message.Message
        Typically a ``mailbox.mbox``; any iterable of message objects works.

    Returns
    -------
    pandas.DataFrame
        One row per message with the columns ``Message-ID``, ``Date`` and
        ``Body``. ``Body`` is a list holding the payload of each top-level
        part of the message (a single-element list for non-multipart
        messages, an empty list when the message has no payload).
    """
    rows = []
    for msg in mbox:
        if msg.is_multipart():
            # Multipart: get_payload() is a list of sub-messages.
            body = [part.get_payload() for part in msg.get_payload()]
        else:
            # Non-multipart: get_payload() is a plain string. Iterating over
            # it would yield single characters, which have no get_payload().
            payload = msg.get_payload()
            body = [payload] if payload else []

        rows.append((msg["Message-ID"], msg["Date"], body))

    return pd.DataFrame(rows, columns=["Message-ID", "Date", "Body"])

In [9]:
# Clean the data before storing it

# A line is discarded when it starts with any of these markers: quoted text
# (">" and "|"), repeated mail headers, PGP armor fields and a few
# attribution lead-ins.
_DROPPED_PREFIXES = (
    ">",
    "|",
    "Re:",
    "Subject",
    "From:",
    "Date:",
    "Hash:",
    "Version: G",
    "Am Mit,",
    "Am Don,",
    "Am Mon,",
    "Quoting",
    "Em Quinta,",
)

# A line is discarded when it contains any of these fragments anywhere.
_DROPPED_FRAGMENTS = (
    "BEGIN PGP SIGNED MESSAGE",
    "wrote:",
    "wrote :",
    "writes:",
    "said:",
)

# A line is discarded when it ends with any of these fragments.
_DROPPED_SUFFIXES = ("said: ", "babbled:", "wrot=e:", "A9crit :")

# A line is discarded when one of these patterns matches from its start;
# they target dated attribution lines such as "On Mon, 01 Feb 2021 ...".
_DATED_ATTRIBUTIONS = tuple(
    re.compile(pattern)
    for pattern in (
        ".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), .. (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec) 20..*",
        (
            ".*n (Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday) .."
            " (January|February|March|April|May|June|July|August|September|October|November|December) 20..*"
        ),
        ".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec) .., 20..*",
        r".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), 20[\d]{2}-[\d]{2}-[\d]{2} at.*",
    )
)


def _is_thread_noise(line):
    """Return True when a non-empty line matches any of the discard rules."""
    if line.startswith(_DROPPED_PREFIXES) or line.endswith(_DROPPED_SUFFIXES):
        return True
    if any(fragment in line for fragment in _DROPPED_FRAGMENTS):
        return True
    return any(regex.match(line) for regex in _DATED_ATTRIBUTIONS)


def strip_thread(text):
    """Remove quoted replies, header-like lines and attributions from a body.

    ``text`` is an iterable of strings (for example the list of payloads of
    one message); its items are concatenated, carriage returns are removed
    and the result is split into lines. Empty lines and lines matching any
    discard rule are dropped; the remaining lines are returned joined by
    newlines.
    """
    flattened = "".join(text).replace("\r", "")
    kept = [
        line
        for line in flattened.split("\n")
        if line and not _is_thread_noise(line)
    ]
    return "\n".join(kept)


# format for CSV, clean special characters, and remove extraneous emails

# Ordered (patterns, replacement) passes applied to the "Body" column. Every
# pattern is treated as a regular expression and the passes run in sequence.
_BODY_SUBSTITUTIONS = (
    # newlines become single spaces
    ([r"\n", "\n"], " "),
    # drop quotes, angle brackets, "= ", hyphens and URLs
    ([r"\'", "'", ">", "<", "= ", "-", r"http\S+"], ""),
    # drop a backslash followed by whitespace or by "s" characters, and "="
    ([r"\\\s+", r"\\s+", "="], ""),
    # collapse runs of two or three spaces
    (["   ", "  "], " "),
    # drop underscores and "3D" (presumably left over from "=3D" sequences)
    (["_", "3D"], ""),
    # collapse again: the removals above can create new runs of spaces
    (["   ", "  "], " "),
    (["   ", "  "], " "),
)


def pandas_clean(emails):
    """Clean the ``Body`` column and drop emails whose body ends up empty.

    Parameters
    ----------
    emails : pandas.DataFrame
        Frame with a string ``Body`` column. It is modified in place: the
        ``Body`` column is rewritten and rows with a missing, empty or
        single-space body are removed.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows with a fresh 0..n-1 index.
    """
    # Missing bodies are removed first; the string clean-up below would fail
    # on them.
    emails.dropna(subset=["Body"], inplace=True)

    # Work on the column and assign the result back. Calling
    # ``emails["Body"].replace(..., inplace=True)`` acts on a temporary
    # Series and does not update the frame under pandas Copy-on-Write.
    body = emails["Body"]
    for patterns, replacement in _BODY_SUBSTITUTIONS:
        body = body.replace(
            to_replace=list(patterns), value=replacement, regex=True
        )

    # Trim surrounding whitespace, then remove literal backslash-n sequences.
    emails["Body"] = body.apply(lambda text: text.strip().replace(r"\n", ""))

    # Removing a literal backslash-n after stripping can leave a lone space,
    # so both "" and " " count as an empty body.
    emails.drop(emails.index[emails["Body"].isin(["", " "])], inplace=True)

    return emails.reset_index(drop=True)

In [10]:
# Ensure dataset location exists
dataset_base_path = Path(f"{BASE_PATH}/interim/text")
dataset_base_path.mkdir(parents=True, exist_ok=True)


# Register all created dataset slices for later upload
new_files = []

# Save each dataset into its own monthly csv
for mbox in mboxes:
    output_path = dataset_base_path.joinpath(f"{mbox}.csv")
    monthly_mbox = mailbox.mbox(path.joinpath(mbox))
    try:
        df = mbox_to_text(monthly_mbox)
    finally:
        # mailbox.mbox keeps its underlying file open; close it explicitly so
        # that handles do not pile up while looping over many months.
        monthly_mbox.close()
    df["Body"] = df["Body"].apply(strip_thread)
    df = pandas_clean(df)
    df.to_csv(output_path)
    new_files.append(output_path)
    print(f"{output_path} saved")

../../data/interim/text/fedora-devel-2020-12.mbox.csv saved
../../data/interim/text/fedora-devel-2021-2.mbox.csv saved
../../data/interim/text/fedora-devel-2021-1.mbox.csv saved


In [6]:
# Push all the newly written csv files to ceph (automation runs only)

if os.getenv("RUN_IN_AUTOMATION"):
    # Each entry pairs the local csv path with its key under interim/text/.
    upload_pairs = (
        (csv_file.as_posix(), f"interim/text/{Path(csv_file).name}")
        for csv_file in new_files
    )
    utils.upload_files(upload_pairs)