## Improved Parsing

This notebook examines the mechanisms used to parse the results of simulation runs and ensures the parsing is done in a memory-efficient way.

- Need to read the files in chunks
- Combine the result of each chunk into a single result
- Condense the results across files into a single result (this will be done in the actual script, not in Jupyter)

In [None]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from natsort import natsorted

from matplotlib.ticker import FormatStrFormatter

In [None]:
# Marker styles available for this run: the single-character string markers
# followed by the integer markers 0 through 11.
markers = list(".ov^<>12348spP*hH+xXDd|_") + list(range(12))

In [None]:
import json
import os

In [None]:
# JSON configuration file for this simulation run. The default path is
# machine-specific, so it can be overridden through the RESULTS_ANALYSIS_CONFIG
# environment variable without editing the notebook.
configuration_file = os.environ.get(
    "RESULTS_ANALYSIS_CONFIG",
    "/Users/brianmccarthy/git_repos/results-analysis/configs/cv2x.json",
)

In [None]:
# Parse the configuration file and keep only its "cv2x" section.
with open(configuration_file) as config_json:
    config = json.loads(config_json.read())["cv2x"]

In [None]:
def create_bins(lower_bound, width, quantity):
    """ Build an ascending list of equal-width (lower, upper) intervals.

        The first interval starts at lower_bound and every following interval
        starts where the previous one ended, i.e. for consecutive entries:
            (1) bins[i][0] + width == bins[i][1]
            (2) bins[i-1][1] == bins[i][0]

        Note that the range stop is lower_bound + quantity * width + 1, so for
        a positive width the start value lower_bound + quantity * width is
        included and the list holds quantity + 1 intervals.
    """
    stop = lower_bound + quantity * width + 1
    return [(start, start + width) for start in range(lower_bound, stop, width)]

In [None]:
def bin_fields(df, fields, bin_width=10, bin_quantity=49):
    """
    Averages the requested fields of a dataframe per distance bin.

    The bins come from create_bins starting at 0, and a row belongs to a bin when
    its distance is >= the bin's lower bound and < its upper bound. The name of
    the distance column is read from the notebook-level config
    (config["results"]["distance"]).
    :param df: dataframe to bin
    :param fields: fields (column names) to be averaged within each bin
    :param bin_width: width of each bin
    :param bin_quantity: quantity argument handed to create_bins
    :return: dictionary mapping each field to its list of per-bin means, plus a
             "distance" entry holding the upper bound of each bin
    """
    intervals = create_bins(lower_bound=0, width=bin_width, quantity=bin_quantity)

    binned = {name: [] for name in fields}
    binned["distance"] = [upper for _, upper in intervals]

    distance_col = config["results"]["distance"]

    for lower, upper in intervals:
        # Rows whose distance lies in the half-open interval [lower, upper)
        in_bin = df[(df[distance_col] >= lower) & (df[distance_col] < upper)]
        for name in fields:
            binned[name].append(in_bin[name].mean())

    return binned

In [None]:
def pdr_dist(pdrs, distances, labels, plot_name, show=True, store=False):
    """
    Plots one packet-delivery-rate curve per entry of pdrs against distance.

    :param pdrs: sequence of y-value series, one per curve
    :param distances: x values shared by every curve
    :param labels: legend label for each curve, indexed like pdrs
    :param plot_name: figure title; also the file name stem when storing
    :param show: call fig.show() when True
    :param store: save the figure as "<plot_name>.png" at 300 dpi when True
    """
    fig, ax = plt.subplots()

    for idx, series in enumerate(pdrs):
        ax.plot(distances, series, label=labels[idx])

    ax.set(xlabel='Distance (m)', ylabel='Packet Delivery Rate (PDR) %')
    ax.legend(loc='lower left')
    ax.tick_params(direction='in')

    # Y axis: fixed 0-100 range with a tick every 10
    ax.set_ylim([0, 100])
    plt.yticks(np.arange(0, 101, step=10))

    # X axis: from 0 to just past the largest distance, with a tick every 50
    x_upper = max(distances) + 1
    ax.set_xlim([0, x_upper])
    plt.xticks(np.arange(0, x_upper, step=50))

    fig.suptitle(plot_name, fontsize=12)

    if show:
        fig.show()

    if store:
        fig.savefig("{}.png".format(plot_name), dpi=300)

## What to do:
1. Figure out where the files are.
2. Read each file in chunks.
3. For each chunk, calculate the statistic of interest (e.g. the PDR per distance bin, or the average messageLatency).
4. Combine the results from each chunk into a single average across the file.
5. Combine the results across the files into a single average across the folder.

In [None]:
def parse_results(folder, start_time=502, chunksize=1000000):
    """
    Computes the mean "tbDecoded" value per distance bin across the CSV files of a folder.

    Every file is read chunk by chunk. Each chunk is reduced to one row of per-bin
    means, the rows of a file are averaged into a per-file mean, and the per-file
    means are averaged into a single row for the folder.

    Note: both averaging steps are unweighted means of means, so chunks (and files)
    contribute equally regardless of how many rows they hold. Bins that are empty
    in a chunk yield NaN and are skipped by the averaging.
    :param folder: directory containing the CSV result files
    :param start_time: only rows whose "Time" is strictly greater than this are used
    :param chunksize: number of rows read from a file per chunk
    :return: single-row DataFrame whose columns are the bin upper bounds (distances);
             an empty DataFrame with those columns when no usable rows were found
    """
    # Same binning as the bin_fields defaults, so the columns line up with its output.
    distances = [upper_b for _, upper_b in create_bins(lower_bound=0, width=10, quantity=49)]

    # 1) Figure out where the files are.
    file_means = []
    for file in natsorted(os.listdir(folder)):
        if ".csv" not in file:
            continue
        print("Dealing with file:{}".format(file))
        file_path = os.path.join(folder, file)

        # 2) Read each file in chunks.
        chunk_rows = []
        for chunk in pd.read_csv(file_path, chunksize=chunksize):
            # 3) For each chunk do your calculation on the statistic
            #    (e.g. calc pdr @ distances/average messageLatency)
            # Drop everything recorded up to start_time; skip chunks with nothing left.
            chunk = chunk[chunk["Time"] > start_time]
            if chunk.empty:
                continue

            # Calculate pdr
            binned_fields = bin_fields(chunk, ["tbDecoded"])
            chunk_rows.append(binned_fields["tbDecoded"])

        if not chunk_rows:
            # Nothing in this file lies past start_time
            continue

        # 4) Combine the results from each chunk into a single average across the file.
        # The rows are collected in a list and turned into a frame once, rather than
        # growing a DataFrame inside the loop.
        file_means.append(pd.DataFrame(chunk_rows, columns=distances).mean())

    if not file_means:
        print("No usable results found in folder: {}".format(folder))
        return pd.DataFrame(columns=distances)

    # 5) Combine the results across the files into a single average across the folder.
    print("Combine all the files in this folder")
    folder_mean = pd.DataFrame(file_means).mean()
    return pd.DataFrame(folder_mean.to_dict(), index=[0])

In [None]:
# Parse every configured raw-results folder whose path mentions "Analytical-Half",
# keyed by the folder's base name.
results = {}
for folder in config["raw-results"]:
    if "Analytical-Half" not in folder:
        continue
    print("Dealing with folder: {}".format(folder))
    results[os.path.basename(folder)] = parse_results(folder)

In [None]:
# Collect the first row of the "Analytical-Half" result as a plain list of values.
pdrs = []
for key, df in results.items():
    if key != "Analytical-Half":
        continue
    print(key)
    pdrs.append(df.values.tolist()[0])

In [None]:
# X axis values for plotting: the upper bound of every distance bin.
bins = create_bins(lower_bound=0, width=10, quantity=49)
overall_fields = {}
distances = [upper_b for (_, upper_b) in bins]

In [None]:
# Scale every value by 100 (fraction -> percentage), updating the lists in place.
# NOTE: this mutates pdrs, so re-running the cell scales the values again.
for series in pdrs:
    series[:] = [value * 100 for value in series]

In [None]:
# Plot the analytical PDR curve against distance.
pdr_dist(pdrs, distances, labels=["Analytical"], plot_name="Analytical")