# Backblaze csv to smartctl json

In [None]:
import os
import datetime as dt
from glob import glob as gg

import pandas as pd

# from joblib import parallel_backend, Parallel, delayed

In [None]:
# path to the directory holding the backblaze csv's
ROOT_DIR = "/home/kachauha/Downloads/drive_stats_2019_Q1/"

# file paths of all relevant csv's
# sorted because glob returns matches in arbitrary order and later cells
# slice this list, so the selection should be reproducible
fnames = sorted(gg(os.path.join(ROOT_DIR, "*.csv")))

# where to save the json's
SAVE_DIR = "/home/kachauha/Downloads/drive_stats_2019_Q1_jsons"

# create dir if it doesn't exist
# exist_ok avoids the race between an existence check and the creation
os.makedirs(SAVE_DIR, exist_ok=True)

## Method 1

In [None]:
# helper functions
def _smart_attr_id(col):
    """
    Returns the SMART attribute id encoded in a column name, or None.

    Recognised names look like ``smart_<id>_raw`` or ``smart_<id>_normalized``.
    """
    if not isinstance(col, str) or not col.startswith("smart_"):
        return None
    for suffix in ("_raw", "_normalized"):
        if col.endswith(suffix):
            digits = col[len("smart_") : -len(suffix)]
            return int(digits) if digits.isdigit() else None
    return None


def smartctl_reformat(row):
    """
    Converts one row of a one-column-per-smart-val formatted dataframe to the
    smartctl json ``ata_smart_attributes`` dict.

    Parameters
    ----------
    row : pd.Series
        Indexed by column name. Only entries named ``smart_<id>_raw`` and
        ``smart_<id>_normalized`` are used; anything else is ignored.

    Returns
    -------
    dict
        ``{"table": [...]}`` with one entry per attribute id that has at least
        one non-missing value. Each entry has the form
        ``{"id": int, "value": int, "raw": {"value": int, "string": str}}``.
        A missing normalized value gives ``"value": None``; a missing raw value
        gives ``None`` for both fields of ``"raw"``.
    """
    # attribute ids in order of first appearance in the row. raw and normalized
    # values are paired by id rather than by position, so a row in which only
    # one of the two columns exists for some attribute cannot get misaligned
    candidate_ids = (_smart_attr_id(col) for col in row.index)
    attr_ids = list(dict.fromkeys(i for i in candidate_ids if i is not None))

    missing = float("nan")
    table = []
    for attr_id in attr_ids:
        rawval = row.get(f"smart_{attr_id}_raw", missing)
        normval = row.get(f"smart_{attr_id}_normalized", missing)
        raw_missing = pd.isna(rawval)
        norm_missing = pd.isna(normval)

        # attribute not reported at all for this device
        if raw_missing and norm_missing:
            continue

        # int(nan) raises ValueError, so a half-missing pair is emitted with
        # None in place of the missing number
        table.append(
            {
                "id": attr_id,
                "value": None if norm_missing else int(normval),
                "raw": {
                    "value": None if raw_missing else int(rawval),
                    "string": None if raw_missing else str(int(rawval)),
                },
            }
        )
    return {"table": table}

In [None]:
# convert each csv with dataframe-level operations, writing one output file
# per input csv. NOTE: only the first two csv's are converted here
for fname in fnames[:2]:
    # the file stem is parsed below as a %Y-%m-%d date. it must come from the
    # file being processed, otherwise every file gets the first file's date
    # and overwrites the same output
    date = os.path.split(fname)[1].split(".")[0]

    # read in raw csv
    df = pd.read_csv(fname)

    # drop columns which are all nans
    df = df.dropna(axis=1, how="all")

    # drop date column
    df = df.drop("date", axis=1)

    # make column names same as smartctl entries
    df = df.rename(
        {
            "model": "model_name",
            "capacity_bytes": "user_capacity",
            "failure": "backblaze_failure_label",
        },
        axis=1,
    )

    # add timestamp. NOTE: this doesnt actually come from backblaze, but is something found in the json files
    # the parsed datetime is naive, so timestamp() interprets it in local time
    df = df.assign(
        backblaze_ts=dt.datetime.timestamp(dt.datetime.strptime(date, "%Y-%m-%d"))
    )

    # if we're converting backblaze data, this is always true
    df = df.assign(is_backblaze=True)

    # add model family column - TODO: verify if this is true outside of backblaze as well
    df = df.assign(model_family=df.model_name)

    # define col types
    fail_cols = ["is_backblaze", "backblaze_ts", "backblaze_failure_label"]
    smart_raw_cols = [col for col in df.columns if "smart" in col and "raw" in col]
    smart_norm_cols = [
        col for col in df.columns if "smart" in col and "normalized" in col
    ]

    # add as hints col, remove now redundant data
    df["hints"] = df.loc[:, fail_cols].to_dict(orient="records")

    # drop stuff we dont need anymore to save some mem
    df = df.drop(fail_cols, axis=1)

    # update capacity format as in smartctl json
    df["user_capacity"] = df["user_capacity"].apply(lambda x: {"bytes": x})

    # apply smart attributes dict reformatting
    df["ata_smart_attributes"] = df[smart_raw_cols + smart_norm_cols].apply(
        smartctl_reformat, axis=1
    )

    # everything except the per-attribute columns goes into smartctl_json.
    # "hints" is excluded too: it is written next to smartctl_json below and
    # would otherwise be duplicated inside it
    df["smartctl_json"] = df.drop(
        smart_raw_cols + smart_norm_cols + ["hints"], axis=1
    ).to_dict(orient="records")

    # save one record per line into SAVE_DIR
    # NOTE(review): str(row) writes the python repr of the dict, not strict json
    with open(os.path.join(SAVE_DIR, f"{date}.json"), "w") as f:
        for row in df.loc[:, ["smartctl_json", "hints"]].to_dict(orient="records"):
            f.write(str(row) + "\n")

## Method 2

In [None]:
# create one json per backblaze csv, building each device's record row by row
fnames = sorted(gg(f"{ROOT_DIR}/*.csv"))
for fname in fnames:
    # which date's data is this. the file stem is parsed below as a %Y-%m-%d
    # date, and must come from the file being processed, otherwise every file
    # is written to the first file's output path
    date = os.path.split(fname)[1].split(".")[0]

    # timestamp for the hints. NOTE: this doesnt come from backblaze, it is
    # derived from the file name. the parsed datetime is naive, so timestamp()
    # interprets it in local time
    backblaze_ts = dt.datetime.timestamp(dt.datetime.strptime(date, "%Y-%m-%d"))

    # read in raw csv
    df = pd.read_csv(fname)

    # list of jsons, one from each device, for this date
    jsonlist = []
    for _, row in df.iterrows():
        # smart attributes available for current device, i.e. those with at
        # least one non-nan column. sorted so the table order is deterministic
        attr_ids = sorted(
            {int(col.split("_")[1]) for col in row[~row.isna()].index if "smart" in col}
        )

        # populate exactly as it appears in smartctl json
        smart_table = [
            {
                "id": aid,
                "value": row[f"smart_{aid}_normalized"],
                "raw": {
                    "value": row[f"smart_{aid}_raw"],
                    "string": str(row[f"smart_{aid}_raw"]),
                },
            }
            for aid in attr_ids
        ]

        # add current device's json to list of jsons
        jsonlist.append(
            {
                "hints": {
                    "is_backblaze": True,
                    "backblaze_ts": backblaze_ts,
                    "backblaze_failure_label": row["failure"],
                },
                "smartctl_json": {
                    # non smart attribute data
                    "model_name": row["model"],
                    "model_family": row["model"],
                    "serial_number": row["serial_number"],
                    "user_capacity": {"bytes": row["capacity_bytes"]},
                    "ata_smart_attributes": {"table": smart_table},
                },
            }
        )

    # save data for current date, one record per line
    # NOTE(review): str(row) writes the python repr of the dict, not strict json
    with open(os.path.join(SAVE_DIR, f"{date}.json"), "w") as f:
        for row in jsonlist:
            f.write(str(row) + "\n")