### Parse a Caffe training log (text file) to extract train and test loss/accuracy

In [1]:
import os
import matplotlib
import numpy as np
matplotlib.use('AGG') 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import re
from collections import OrderedDict

In [3]:
def parse_line_for_net_output(regex_obj, row, row_dict_list,
                              line, iteration, seconds, learning_rate):
    """Fold one log line into the table row that is currently being built.

    When ``line`` matches ``regex_obj``, capture group 2 is taken as the
    column name and capture group 3 (converted to float) as its value. A new
    row seeded with NumIters / Seconds / LearningRate is started whenever no
    row is in progress or ``iteration`` differs from the row's NumIters; an
    in-progress row for the previous iteration is appended to
    ``row_dict_list`` before being replaced.

    Independently of whether the line matched, an in-progress row that has as
    many columns as the first stored row is considered complete: it is
    appended to ``row_dict_list`` and the in-progress row becomes None.

    Returns a tuple (row_dict_list, row):
    row_dict_list: the list that was passed in, possibly with rows appended
    row: the row still being built, or None if there is none
    """

    found = regex_obj.search(line)
    if found:
        needs_new_row = not row or row['NumIters'] != iteration
        if needs_new_row:
            if row:
                # The previous iteration's row never got flushed by the
                # "row is full" check below (expected for the very first
                # row, since there is nothing to compare it against yet).
                row_dict_list.append(row)

            row = OrderedDict()
            row['NumIters'] = iteration
            row['Seconds'] = seconds
            row['LearningRate'] = learning_rate

        # Group 1 (the output number) is currently unused.
        column, value = found.group(2, 3)
        row[column] = float(value)

    # Without an in-progress row or a reference first row there is nothing
    # to compare, so the fullness check cannot apply.
    if not row or not row_dict_list:
        return row_dict_list, row

    if len(row) == len(row_dict_list[0]):
        # Same column count as the first stored row: the row is complete.
        row_dict_list.append(row)
        row = None

    return row_dict_list, row

In [4]:
def fix_initial_nan_learning_rate(dict_list):
    """Overwrite the first row's LearningRate with the second row's value.

    The learning rate is normally printed only after the initial test and
    training step, so the first parsed row carries LearningRate = NaN. When
    a second row exists, its LearningRate is copied into the first row in
    place; lists with fewer than two rows are left untouched.
    """

    if len(dict_list) <= 1:
        return

    first_row, second_row = dict_list[0], dict_list[1]
    first_row['LearningRate'] = second_row['LearningRate']

In [5]:
def parse_log(path_to_log):
    """Parse a Caffe log file into train and test table rows.

    Lines before the first 'Iteration <n>' marker are ignored. From then on
    the most recently seen iteration number and learning rate ('lr = ...')
    are attached to every 'Train net output' / 'Test net output' value that
    is found.

    path_to_log: path of the text log to read

    Returns (train_dict_list, test_dict_list)
    train_dict_list and test_dict_list are lists of OrderedDicts that define
    the table rows. Each row holds NumIters (stored as float), Seconds
    (always 0 here, timestamps are not extracted), LearningRate, and one
    column per net output name.
    """

    # Raw strings: '\d', '\S' and '\.' are invalid escape sequences in a
    # normal string literal and trigger warnings on current Python versions.
    regex_iteration = re.compile(r'Iteration (\d+)')
    regex_train_output = re.compile(r'Train net output #(\d+): (\S+) = ([\.\deE+-]+)')
    regex_test_output = re.compile(r'Test net output #(\d+): (\S+) = ([\.\deE+-]+)')
    regex_learning_rate = re.compile(r'lr = ([-+]?[0-9]*\.?[0-9]+([eE]?[-+]?[0-9]+)?)')

    # Pick out lines of interest
    iteration = -1
    learning_rate = float('NaN')
    train_dict_list = []
    test_dict_list = []
    train_row = None
    test_row = None

    # Elapsed time is not parsed from the log, so every row gets this
    # placeholder value in its Seconds column.
    seconds = 0

    with open(path_to_log) as f:
        for line in f:
            iteration_match = regex_iteration.search(line)
            if iteration_match:
                iteration = float(iteration_match.group(1))
            if iteration == -1:
                # Only start parsing for other stuff if we've found the first
                # iteration
                continue

            learning_rate_match = regex_learning_rate.search(line)
            if learning_rate_match:
                learning_rate = float(learning_rate_match.group(1))

            train_dict_list, train_row = parse_line_for_net_output(
                regex_train_output, train_row, train_dict_list,
                line, iteration, seconds, learning_rate
            )
            test_dict_list, test_row = parse_line_for_net_output(
                regex_test_output, test_row, test_dict_list,
                line, iteration, seconds, learning_rate
            )

    # The very first row is only stored once output for a later iteration
    # shows up, so a log that reports a single iteration would otherwise
    # yield an empty table. Flush that pending first row. A trailing row is
    # still dropped when earlier rows exist, because it can only be pending
    # if it has fewer columns than the first row.
    if train_row and not train_dict_list:
        train_dict_list.append(train_row)
    if test_row and not test_dict_list:
        test_dict_list.append(test_row)

    fix_initial_nan_learning_rate(train_dict_list)
    fix_initial_nan_learning_rate(test_dict_list)

    return train_dict_list, test_dict_list

In [6]:
# Parse the training log once; the cells below read these two row lists.
train_dict_list, test_dict_list = parse_log(
    'Caffe models/ResNet50/output/1000ResNet50fixedlog.txt'
)

In [7]:
# Display the last parsed test row to see which columns were extracted.
test_dict_list[-1]

OrderedDict([('NumIters', 1000.0),
             ('Seconds', 0),
             ('LearningRate', 0.0001),
             ('label', 188.64),
             ('probt', 0.00327022)])

In [12]:
# Per-train-row series: the 'prob' output value and its iteration number.
train_loss = [entry['prob'] for entry in train_dict_list]
iterstr = [entry['NumIters'] for entry in train_dict_list]

In [1]:
# Build each series from its own row list. Train and test rows are parsed
# independently, so the two lists need not have the same length or refer to
# the same iterations; pairing them positionally with zip() would label test
# accuracy with train iteration numbers and truncate train_loss to the
# shorter list.
test_acc = np.array([d['accuracy'] for d in test_dict_list])
# x-values for test_acc: the iterations at which the test rows were recorded
iters = [d['NumIters'] for d in test_dict_list]
# One loss value per train row
train_loss = [t['loss'] for t in train_dict_list]

In [None]:
# Plot validation accuracy (left y-axis, blue) and training loss (right
# y-axis, red) against the iteration number on a shared x-axis.
fig, ax1 = plt.subplots()
# test_acc is scaled by 100 for display
# (assumes it holds fractions in [0, 1] -- TODO confirm)
ax1.plot(iters, 100*test_acc, 'b-')
ax1.set_xlabel('Number of Iterations')
# Make the y-axis label and tick labels match the line color.
ax1.set_ylabel('Validation Accuracy', color='b')
ax1.tick_params(axis='y', labelcolor='b')

# Second y-axis sharing the same x-axis, for the loss curve
ax2 = ax1.twinx()
ax2.plot(iterstr, train_loss, 'r-')
ax2.set_ylabel('Training Loss', color='r')
ax2.tick_params(axis='y', labelcolor='r')
plt.show()
# Uncomment to write the figure to disk:
#fig.savefig('Caffe models/ResNet50/10000iter.png')