In [1]:
import os
import multiprocessing
import pandas as pd
import json
import logging
import re
from itertools import repeat
import datetime
import time
from natsort import natsorted
import shutil
import numpy as np

import tempfile
import csv

### Original functions

In [2]:
def remove_vectors(json_fields, single=False):
    # Simple function to remove the vector from results in json file might be we remove that part from the json file
    if single:
        return json_fields.replace(":vector", "")
    for i in range(len(json_fields)):
        json_fields[i] = json_fields[i].replace(":vector", "")
    return json_fields

In [3]:
def create_bins(lower_bound, width, quantity):
    """ create_bins returns an equal-width (distance) partitioning.
        It returns an ascending list of tuples, representing the intervals.
        A tuple bins[i], i.e. (bins[i][0], bins[i][1])  with i > 0
        and i < quantity, satisfies the following conditions:
            (1) bins[i][0] + width == bins[i][1]
            (2) bins[i-1][0] + width == bins[i][0] and
                bins[i-1][1] + width == bins[i][1]
    """
    bins = []
    for low in range(lower_bound, lower_bound + quantity * width + 1, width):
        bins.append((low, low + width))
    return bins

In [4]:
def bin_fields(df, fields, bin_width=10, bin_quantity=49):
    """
    Bins multiple dfs into a single dictionary that can be used as an average for multiple fields across multiple
    runs
    :param df: dataframe to bin
    :param fields: fields to be binned.
    :param bin_width: width of each bin
    :param bin_quantity: total number of bins
    :return:
    """
    bins = create_bins(lower_bound=0, width=bin_width, quantity=bin_quantity)
    distances = []
    overall_fields = {}
    for interval in bins:
        upper_b = interval[1]
        distances.append(upper_b)

    for field in fields:
        print("{} being binned".format(field))
        overall_fields[field] = []

    overall_fields["distance"] = distances

    distance_col = config["results"]["distance"]

    for i in range(len(bins)):
        lower_b = bins[i][0]
        upper_b = bins[i][1]
        fields_temp = df[(df[distance_col] >= lower_b) & (df[distance_col] < upper_b)]
        for field in fields:
            if i < len(overall_fields[field]):
                overall_fields[field][i] = (fields_temp[field].mean() + overall_fields[field][i]) / 2
            else:
                overall_fields[field].append(fields_temp[field].mean())

    return overall_fields

In [5]:
def combine_results(combined, results):
    for result in results:
        for field in result:
            if field in combined:
                for i in range(len(result[field])):
                    combined[field][i] = (combined[field][i] + result[field][i]) / 2
            else:
                combined[field] = result[field]
    return combined

### New parsing system

In [6]:
def parse_vector_desc_line(line):
    try:
        # Converts a vector description line to a dictionary for use in parsing later
        node_id_pattern = re.compile("\[\d+\]")

        vector_line_dict = {"nodeID": None, "vectorName": None, "ETV": True}
        split_line = line.split(" ")
        vector_num = int(split_line[1])
        match = node_id_pattern.search(split_line[2])
        nodeID = int(match.group().strip("[]"))
        vector_name = split_line[3]
        vector_name = vector_name.split(":")[0]
        if "ETV" in split_line[4]:
            ETV = True
        else:
            ETV = False

        vector_line_dict["nodeID"] = nodeID
        vector_line_dict["vectorName"] = vector_name
        vector_line_dict["ETV"] = ETV

        return vector_num, vector_line_dict
    except AttributeError as e:
        print("Line: {} : Could not be parsed".format(line))
        return None, None

In [7]:
def parse_vector_line(line):
    # Simple function to split a vector line and convert to floats.
    try:
        line = bytes(line, 'utf-8').decode('utf-8', 'ignore')
        split_nums = line.split()
        for i in range(len(split_nums)):
            split_nums[i] = float(split_nums[i])
        return split_nums
    except ValueError as e:
        print("Line: {} could not be converted due to bad encoding".format(line))
        return

In [8]:
def prepare_csv_line(vector_dict, vector_id, parsed_vec):
    # Parses the vector line information to be written to the csv file.
    node_id = vector_dict[vector_id]["nodeID"]
    vector_name = vector_dict[vector_id]["vectorName"]
    if vector_dict[vector_id]["ETV"]:
        time = parsed_vec[2]
        value = parsed_vec[3]
    else:
        time = parsed_vec[1]
        value = parsed_vec[2]

    csv_line = [node_id, time, vector_name, value]
    return csv_line, time

In [9]:
def setup_chunk_writer(output_file, chunk_num, title_line):
    # Setup our chunk writer

    # First create a folder to hold chunks
    chunk_folder = output_file.split(".")[0]
    os.makedirs(chunk_folder, exist_ok=True)
    chunk_name = "{}/chunk-{}.csv".format(chunk_folder, chunk_num)

    # Create the chunk file and create a csv writer which uses it
    print("Setting up new chunk: {}".format(chunk_name))
    temp_file_pt = open(chunk_name, "w+")
    output_writer = csv.writer(temp_file_pt, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    output_writer.writerow(title_line)

    return temp_file_pt, output_writer

In [10]:
def read_vector_file(output_file, vector_path, stats, chunk_size=1e+8):
    """
    chunk_size: the time between different files 1.5s as default
    """
    # Reads the csv file, parses it and writes to a temp file for use later in generating a DF and CSV file.
    vector_dict = {}
    no_interest_vectors = {}  # Probably don't need to remember one's we don't care for.

    chunk_times = []
    chunk_info = {}

    last_time = -1
    current_chunk_index = 0

    # Patterns which identify vector declaration lines and result lines
    vector_dec_line_pattern = re.compile("^vector")
    vector_res_line_pattern = re.compile("^\d+")

    vector_file = open(vector_path, "r")

    # Stores lines appearing before their declaration. Files are oddly formatted, this is purely safety ensuring we
    # don't accidentally miss anything.
    early_vectors = tempfile.NamedTemporaryFile(mode="r+")

    # Prepare and write out first line format NodeID, EventNumber, Time, Stat1, Stat2, Stat3, ...
    title_line = ["NodeID", "Time", "StatisticName", "Value"]

    temp_file_pt, writer = setup_chunk_writer(output_file, current_chunk_index, title_line)
    chunk_info["CurrentChunk"] = {"file": temp_file_pt, "writer": writer}

    for line in vector_file:
        if vector_dec_line_pattern.match(line):
            # if line matches a vector declaration, parse the vector description
            vector_num, vec_dict = parse_vector_desc_line(line)
            if vector_num is None and vec_dict is None:
                continue
            if vec_dict["vectorName"] in stats:
                # Vector is of interest, add it to our overall dictionary and update it's index.
                vector_dict[vector_num] = vec_dict
            else:
                # Mark this as a vector we don't care about.
                no_interest_vectors[vector_num] = None

        elif vector_res_line_pattern.match(line):
            # {"nodeID": None, "vectorName": None, "ETV": True} This is what it looks like
            parsed_vec = parse_vector_line(line)
            # If the previous step fails then we can simply continue to the next line ignoring this line.
            if parsed_vec is None:
                continue
            vector_id = parsed_vec[0]
            if vector_id in vector_dict:
                # Write out to a csv file correctly
                csv_line, time = prepare_csv_line(vector_dict, vector_id, parsed_vec)

                if time > last_time:
                    chunk_info["CurrentChunk"]["writer"].writerow(csv_line)
                    if chunk_info["CurrentChunk"]["file"].tell() >= chunk_size:
                        print("Time ending this chunk:{}".format(time))

                        # This chunk is old and as such can be placed into the previous chunks
                        chunk_info[time] = {"file": chunk_info["CurrentChunk"]["file"],
                                            "writer": chunk_info["CurrentChunk"]["writer"]}
                        chunk_times.append(time)
                        last_time = time
                        current_chunk_index += 1

                        # This file is at max size, create a new writer
                        temp_file_pt, writer = setup_chunk_writer(output_file, current_chunk_index, title_line)
                        # Update current chunk writer to point at this new one.
                        chunk_info["CurrentChunk"] = {"file": temp_file_pt, "writer": writer}
                if time <= last_time:
                    for chunk_time in chunk_times:
                        if time < chunk_time:
                            chunk_info[chunk_time]["writer"].writerow(csv_line)
            else:
                if vector_id not in no_interest_vectors:
                    # Write the line out in case we found it before declaration. Only if it is of possible interest.
                    early_vectors.write(line)

    # Rewind the early vectors file so we can search it for missed vectors
    early_vectors.seek(0)

    for line in early_vectors:
        print("We have early vectors")
        # Parse the line again.
        parsed_vec = parse_vector_line(line)
        vector_id = parsed_vec[0]
        # check for the vector
        if vector_id in vector_dict:
            # If we have it create the csv line and write it our
            csv_line, time = prepare_csv_line(vector_dict, vector_id, parsed_vec)
            for chunk_time in chunk_times:
                if time < chunk_time:
                    chunk_info[chunk_time]["writer"].writerow(csv_line)

    # Close our vector file.
    vector_file.close()

In [24]:
def csv_pivot(directory, stats):
    orig_loc = os.getcwd()
    os.chdir(directory)

    csv_files = os.listdir(os.getcwd())
    csv_files = natsorted(csv_files)
    header = True
    for csv_file in csv_files:
        if ".csv" in csv_file:
            print("Dealing with chunk file: {}".format(csv_file))
            chunk_df = pd.read_csv(csv_file)

            chunk_df = chunk_df.infer_objects()

            chunk_df = chunk_df.sort_values(by=["NodeID", "Time"])
            # Parse the vector file to ensure it is formatted correclty.
            chunk_df['seq'] = chunk_df.groupby(["Time", "NodeID", "StatisticName"]).cumcount()

            chunk_df = chunk_df.pivot_table("Value", ["Time", "NodeID", "seq"], "StatisticName")
            chunk_df.reset_index(inplace=True)
            chunk_df = chunk_df.drop(["seq"], axis=1)

            # Ensure all fields correctly filled
            for field in stats:
                if field not in chunk_df.columns:
                    chunk_df[field] = np.nan
                    
                    
            chunk_df = chunk_df.reindex(sorted(chunk_df.columns), axis=1)

            chunk_df.to_csv(csv_file, index=False, header=header)
            header = False

            del chunk_df

    os.chdir(orig_loc)

In [12]:
def combine_files(csv_directory, outfile):
    destination = open(outfile,'wb')
    
    orig_loc = os.getcwd()
    os.chdir(csv_directory)
    
    csv_files = os.listdir()
    csv_files = natsorted(csv_files)
    header = True
    for csv_file in csv_files:
        if ".csv" in csv_file and csv_file != outfile:
            print("Dealing with chunk file: {}".format(csv_file))
            shutil.copyfileobj(open(csv_file,'rb'), destination)
            os.remove(csv_file)
    destination.close()
    
    os.chdir(orig_loc)
    
    os.rmdir(csv_directory)

In [13]:
vector_dir = "/Users/brianmccarthy/git_repos/results-analysis/data/omnet/cv2x/test"

vector_file_name_long = "run-1.vec"
vector_file_name_short = "short.vec"

vector_path_long = vector_dir + vector_file_name_long
vector_path_short = vector_dir + vector_file_name_short

config_name_long = "long-test"
config_name_short = "short-test"

experiment_type = "cv2x"

json_path = "/Users/brianmccarthy/git_repos/results-analysis/configs/cv2x.json"

now = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")

orig_loc = "/Users/brianmccarthy/git_repos/results-analysis"

In [14]:
temp_file = "long.csv"
real_vector_path = "run-1.vec"

### Setup for dealing with results

In [15]:
with open(json_path, "r") as json_file:
    config = json.load(json_file)["cv2x"]
    json_fields = config["results"]
    
real_vector_path = "/Users/brianmccarthy/git_repos/results-analysis/data/omnet/cv2x/test/run-1.vec"
output_csv = "/Users/brianmccarthy/git_repos/results-analysis/data/raw_data/cv2x/test/test.csv"

In [16]:
def tidy_data(real_vector_path, json_fields, output_csv):
    # Simply remove the :vector part of vector names from both sets of vectors.
    found_vector = False
    for field in json_fields:
        if ":vector" in field:
            found_vector = True
            break

    if found_vector:
        json_fields = remove_vectors(json_fields)

    overall_start_time = time.time()
    start_time = overall_start_time
    print("Beginning reading of vector file: {}".format(real_vector_path))

    # Read the vector file into a csv file
    chunk_folder = output_csv.split(".")[0]
    read_vector_file(output_csv, real_vector_path, json_fields)
    
    print("Finished reading of vector file: {} in {}s".format(real_vector_path, time.time() - start_time))

    start_time = time.time()
    print("File read, begin pivoting csv file: {}".format(real_vector_path))
    
    
    print("Finished pivoting csv file: {} in {}s".format(real_vector_path, time.time() - start_time))

    start_time = time.time()
    print("Pivot complete, consolidate chunk files for {}".format(output_csv))
    combine_files(chunk_folder, output_csv)

    print("Finished parsing of vector file: {} in {}s".format(real_vector_path, time.time() - overall_start_time))

In [None]:
tidy_data(real_vector_path, json_fields["filtered_vectors"], output_csv)

In [19]:
chunk_folder = output_csv.split(".")[0]
read_vector_file(output_csv, real_vector_path, json_fields["filtered_vectors"])

Setting up new chunk: /Users/brianmccarthy/git_repos/results-analysis/data/raw_data/cv2x/test/test/chunk-0.csv
Time ending this chunk:500.806
Setting up new chunk: /Users/brianmccarthy/git_repos/results-analysis/data/raw_data/cv2x/test/test/chunk-1.csv
Time ending this chunk:501.134
Setting up new chunk: /Users/brianmccarthy/git_repos/results-analysis/data/raw_data/cv2x/test/test/chunk-2.csv
Time ending this chunk:501.419
Setting up new chunk: /Users/brianmccarthy/git_repos/results-analysis/data/raw_data/cv2x/test/test/chunk-3.csv
Time ending this chunk:501.668
Setting up new chunk: /Users/brianmccarthy/git_repos/results-analysis/data/raw_data/cv2x/test/test/chunk-4.csv
Time ending this chunk:501.947
Setting up new chunk: /Users/brianmccarthy/git_repos/results-analysis/data/raw_data/cv2x/test/test/chunk-5.csv
Time ending this chunk:502.225
Setting up new chunk: /Users/brianmccarthy/git_repos/results-analysis/data/raw_data/cv2x/test/test/chunk-6.csv
Time ending this chunk:502.472
Settin

In [25]:
csv_pivot(chunk_folder, json_fields["filtered_vectors"])

Dealing with chunk file: chunk-0.csv
Dealing with chunk file: chunk-1.csv
Dealing with chunk file: chunk-2.csv
Dealing with chunk file: chunk-3.csv
Dealing with chunk file: chunk-4.csv
Dealing with chunk file: chunk-5.csv
Dealing with chunk file: chunk-6.csv
Dealing with chunk file: chunk-7.csv
Dealing with chunk file: chunk-8.csv
Dealing with chunk file: chunk-9.csv
Dealing with chunk file: chunk-10.csv
Dealing with chunk file: chunk-11.csv
Dealing with chunk file: chunk-12.csv
Dealing with chunk file: chunk-13.csv
Dealing with chunk file: chunk-14.csv
Dealing with chunk file: chunk-15.csv
Dealing with chunk file: chunk-16.csv
Dealing with chunk file: chunk-17.csv
Dealing with chunk file: chunk-18.csv
Dealing with chunk file: chunk-19.csv
Dealing with chunk file: chunk-20.csv
Dealing with chunk file: chunk-21.csv
Dealing with chunk file: chunk-22.csv
Dealing with chunk file: chunk-23.csv
Dealing with chunk file: chunk-24.csv
Dealing with chunk file: chunk-25.csv
Dealing with chunk fil

In [26]:
combine_files(chunk_folder, output_csv)

Dealing with chunk file: chunk-0.csv
Dealing with chunk file: chunk-1.csv
Dealing with chunk file: chunk-2.csv
Dealing with chunk file: chunk-3.csv
Dealing with chunk file: chunk-4.csv
Dealing with chunk file: chunk-5.csv
Dealing with chunk file: chunk-6.csv
Dealing with chunk file: chunk-7.csv
Dealing with chunk file: chunk-8.csv
Dealing with chunk file: chunk-9.csv
Dealing with chunk file: chunk-10.csv
Dealing with chunk file: chunk-11.csv
Dealing with chunk file: chunk-12.csv
Dealing with chunk file: chunk-13.csv
Dealing with chunk file: chunk-14.csv
Dealing with chunk file: chunk-15.csv
Dealing with chunk file: chunk-16.csv
Dealing with chunk file: chunk-17.csv
Dealing with chunk file: chunk-18.csv
Dealing with chunk file: chunk-19.csv
Dealing with chunk file: chunk-20.csv
Dealing with chunk file: chunk-21.csv
Dealing with chunk file: chunk-22.csv
Dealing with chunk file: chunk-23.csv
Dealing with chunk file: chunk-24.csv
Dealing with chunk file: chunk-25.csv
Dealing with chunk fil

OSError: [Errno 66] Directory not empty: '/Users/brianmccarthy/git_repos/results-analysis/data/raw_data/cv2x/test/test'