In [1]:
%load_ext lab_black

In [2]:
import os

# Work from the parent of the directory the notebook was launched in, so the
# relative "resources/data" paths used below resolve from there.
# The launch directory is remembered on first execution: a bare
# os.chdir("..") would climb one more level every time this cell is re-run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [16]:
import json
import pandas as pd
import pickle as pkl
from functools import reduce

## Remarks

In the original data, indexing starts from 1, whereas in this notebook and the corresponding pipeline indexing starts from 0. This makes it possible to join with `pandas.merge` using only the `index` column and the `id` column, e.g.:

In [4]:
# vocab['index'] = vocab.index
# merged_df = pd.merge(vocab, docword, left_on='index', right_on='vocab_index')

In [5]:
# Datasets handled by this notebook; "nytimes" and "pubmed" are currently
# commented out.
dataset_names = [
    "enron",
    "kos",
    "nips",
    # "nytimes",
    # "pubmed"
]

# Gzipped docword file of every dataset, in the same order as dataset_names.
docword_paths = [
    os.path.join("resources", "data", "docword." + dataset + ".txt.gz")
    for dataset in dataset_names
]

# Vocabulary file of every dataset, in the same order as dataset_names.
vocab_paths = [
    os.path.join("resources", "data", "vocab." + dataset + ".txt")
    for dataset in dataset_names
]

In [6]:
def rename_vocab_df(df):
    """Label a vocabulary frame in place and return it.

    Column ``0`` is renamed to ``"vocab"`` and an ``"index"`` column holding
    the frame's own index values is added. The frame passed in is modified
    and the very same object is returned.
    """
    df.rename(columns={0: "vocab"}, inplace=True)
    df["index"] = df.index
    return df


def update_docword(df: pd.DataFrame, mapping: dict):
    """Add a ``"vocab_index"`` column to a docword frame, in place.

    Columns ``0``, ``1`` and ``2`` are renamed to ``"article_id"``,
    ``"old index"`` and ``"count"``. The values of the second column are then
    looked up in ``mapping`` and stored as ``"vocab_index"``; values missing
    from ``mapping`` become NaN. The same (modified) frame is returned.
    """
    column_labels = {0: "article_id", 1: "old index", 2: "count"}
    df.rename(columns=column_labels, inplace=True)

    # The lookup key is whatever sits in the second position after renaming.
    second_column = df.columns[1]
    df["vocab_index"] = df[second_column].map(mapping)
    return df

In [34]:
# NOTE: works only if the vocab paths and the names are passed in the same order

def unify_vocab(
    vocab_paths,
    names,
    vocab_out="vocab_unified.csv",
):
    """Merge several vocabulary files into one and save per-dataset index maps.

    The tokens of all vocabulary files are concatenated in the given order and
    de-duplicated (first occurrence wins). The result is written to
    ``resources/data/<vocab_out>`` without header or index column, so the
    unified index of a token is its 0-based row number in that file.

    For every dataset a dict ``{1-based original index: 0-based unified index}``
    is pickled to ``resources/data/<name>_map.pkl``.

    Parameters
    ----------
    vocab_paths : iterable of path-like
        Vocabulary files, one token per line.
    names : iterable of str
        Dataset names, in the same order as ``vocab_paths``.
    vocab_out : str, default "vocab_unified.csv"
        File name of the unified vocabulary inside ``resources/data``.

    Raises
    ------
    ValueError
        If ``vocab_paths`` is empty or its length differs from ``names``.
    """
    vocab_paths = list(vocab_paths)
    names = list(names)
    if not vocab_paths:
        raise ValueError("vocab_paths must contain at least one path")
    if len(vocab_paths) != len(names):
        raise ValueError(
            f"got {len(vocab_paths)} vocab paths but {len(names)} names; "
            "they must match one-to-one"
        )

    data_dir = os.path.join("resources", "data")

    # read and prepare dfs
    # dtype=str / na_filter=False: tokens such as "null", "nan" or "NA" are
    # ordinary words here and must not be parsed as missing values.
    vocab_frames = []
    for path in vocab_paths:
        frame = pd.read_csv(path, header=None, dtype=str, na_filter=False).rename(
            columns={0: "vocab"}
        )
        frame["index"] = frame.index
        vocab_frames.append(frame)

    # create and prepare full vocab df
    # ignore_index + reset_index make "index" the row position in the saved
    # file; keeping each file's own labels would yield duplicated indexes.
    unified = (
        pd.concat([frame[["vocab"]] for frame in vocab_frames], ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    unified["index"] = unified.index

    unified[["vocab"]].to_csv(
        path_or_buf=os.path.join(data_dir, f"{vocab_out}"),
        header=False,
        index=False,
    )

    # create and save the index mapping of every dataset
    for frame, name in zip(vocab_frames, names):
        pairs = pd.merge(
            unified,
            frame,
            how="inner",
            on="vocab",
            suffixes=("_merged", "_original"),
        )
        # NOTE: adding 1 to the original index is necessary to match
        # numeration from 0 (pandas) and 1 (source files)
        mapping = {
            int(original) + 1: int(merged)
            for merged, original in zip(pairs["index_merged"], pairs["index_original"])
        }
        with open(os.path.join(data_dir, f"{name}_map.pkl"), "wb") as f:
            pkl.dump(mapping, f)

In [32]:
# NOTE: works only if the docword paths and the names are passed in the same order

def update_docwords(docword_paths, names, docword_out_suffix="_unified"):
    """Re-index docword files with the saved per-dataset index mappings.

    Every gzipped, space-separated docword file is loaded (its first three
    lines are skipped), the mapping stored in ``resources/data/<name>_map.pkl``
    is applied through ``update_docword``, and the columns ``article_id``,
    ``vocab_index`` and ``count`` are written to
    ``resources/data/<name><docword_out_suffix>.csv`` with a header row.

    ``docword_paths`` and ``names`` must be given in the same order.
    """
    data_dir = os.path.join("resources", "data")

    # load all docword files before any of them is updated
    frames = []
    for path in docword_paths:
        frames.append(
            pd.read_csv(path, sep=" ", header=None, skiprows=3, compression="gzip")
        )

    # load the index mapping of every dataset
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # mapping files produced by this notebook.
    mappings = []
    for name in names:
        mapping_file = os.path.join(data_dir, f"{name}_map.pkl")
        with open(mapping_file, "rb") as handle:
            mappings.append(pkl.load(handle))

    remapped = [
        update_docword(frame, mapping) for frame, mapping in zip(frames, mappings)
    ]

    # save updated docwords
    output_columns = ["article_id", "vocab_index", "count"]
    for frame, name in zip(remapped, names):
        target = os.path.join(data_dir, f"{name}{docword_out_suffix}.csv")
        frame.to_csv(
            path_or_buf=target,
            columns=output_columns,
            header=True,
            index=False,
        )

In [35]:
# Write the unified vocabulary and one <name>_map.pkl index mapping per dataset
# into resources/data.
unify_vocab(vocab_paths, dataset_names)

In [36]:
# Write a <name>_unified.csv per dataset with the remapped vocabulary indexes;
# reads the <name>_map.pkl files that unify_vocab writes to resources/data.
update_docwords(docword_paths, dataset_names)