In [10]:
import pandas as pd
import re

file_path = '/scratch/tpang/zhliu/checkpoints/nlp/mt/iwslt14_de_en/baseline/transformer_iwslt_de_en_v2_iwslt14_de_en_seed43/train_log.txt'

def parse_line_updates(line):
    # Updated regular expression to match exponential format for 'lr'
    pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \|\s*INFO\s*\|\s*train_inner\s*\|)?\s*epoch\s*(\d+):\s*(\d+) \/ (\d+)\s*loss=(\d+\.\d+), nll_loss=(\d+\.\d+), ppl=(\d+\.\d+), wps=(\d+\.\d+), ups=(\d+\.\d+), wpb=(\d+\.\d+), bsz=(\d+\.\d+), num_updates=(\d+), lr=([\d\.]+e?[-\+]?\d*), gnorm=(\d+\.\d+), train_wall=(\d+), (gb_free=(\d+\.\d+),)?\s*wall=(\d+)'
    match = re.search(pattern, line)
    if match:
        return {
            'epoch': int(match.group(1)),
            'loss': float(match.group(2)),
            'nll_loss': float(match.group(3)),
            'ppl': float(match.group(4)),
            'wps': int(match.group(5)),
            'ups': int(match.group(6)),
            'wpb': float(match.group(7)),
            'bsz': float(match.group(8)),
            'num_updates': int(match.group(9)),
            'lr': float(match.group(10)),  # Converts exponential format to float
            'gnorm': float(match.group(11)),
            'clip': float(match.group(12)),
            'oom': float(match.group(13)),
            'wall': int(match.group(14)),
            'train_wall': int(match.group(15))
        }
    return None

def parse_line_epochs(line):
    # Regular expression for the new format, supporting 'lr' in exponential format
    pattern = r'\|\s*epoch\s*(\d+)\s*\|\s*loss\s*(\d+\.\d+)\s*\|\s*nll_loss\s*(\d+\.\d+)\s*\|\s*ppl\s*(\d+\.\d+)\s*\|\s*wps\s*(\d+)\s*\|\s*ups\s*(\d+)\s*\|\s*wpb\s*(\d+\.\d+)\s*\|\s*bsz\s*(\d+\.\d+)\s*\|\s*num_updates\s*(\d+)\s*\|\s*lr\s*([\d\.]+e?[-\+]?\d*)\s*\|\s*gnorm\s*(\d+\.\d+)\s*\|\s*clip\s*(\d+\.\d+)\s*\|\s*oom\s*(\d+\.\d+)\s*\|\s*wall\s*(\d+)\s*\|\s*train_wall\s*(\d+)'
    match = re.search(pattern, line)
    if match:
        return {
            'epoch': int(match.group(1)),
            'loss': float(match.group(2)),
            'nll_loss': float(match.group(3)),
            'ppl': float(match.group(4)),
            'wps': int(match.group(5)),
            'ups': int(match.group(6)),
            'wpb': float(match.group(7)),
            'bsz': float(match.group(8)),
            'num_updates': int(match.group(9)),
            'lr': float(match.group(10)),
            'gnorm': float(match.group(11)),
            'clip': float(match.group(12)),
            'oom': float(match.group(13)),
            'wall': int(match.group(14)),
            'train_wall': int(match.group(15))
        }
    return None

def parse_line_validation_stats(line):
    # Regular expression for parsing validation stats
    pattern = r'\|\s*epoch\s*(\d+)\s*\|\s*valid on \'valid\' subset\s*\|\s*loss\s*(\d+\.\d+)\s*\|\s*nll_loss\s*(\d+\.\d+)\s*\|\s*ppl\s*(\d+\.\d+)\s*\|\s*num_updates\s*(\d+)'
    match = re.search(pattern, line)
    if match:
        return {
            'epoch': int(match.group(1)),
            'loss': float(match.group(2)),
            'nll_loss': float(match.group(3)),
            'ppl': float(match.group(4)),
            'num_updates': int(match.group(5))
        }
    return None

def parse_file(file_path):
    updates_data = []
    epoch_data = []
    val_data = []
    with open(file_path, 'r') as file:
        for line in file:
            parsed_line_updates = parse_line_updates(line)
            if parsed_line_updates:
                updates_data.append(parsed_line_updates)
            parsed_line_epochs = parse_line_epochs(line)
            if parsed_line_epochs:
                epoch_data.append(parsed_line_epochs)
            parsed_line_val = parse_line_validation_stats(line)
            if parsed_line_val:
                val_data.append(parsed_line_val)

    return pd.DataFrame(updates_data), pd.DataFrame(epoch_data), pd.DataFrame(val_data)

# Example usage
df_updates, df_epochs, df_val = parse_file(file_path)
# df.to_csv('output.csv', index=False)


In [11]:
df_updates, df_epochs, df_val

(      epoch    loss  nll_loss       ppl    wps  ups       wpb      bsz  \
 0         1  13.759    13.710  13397.50  23451    6  3630.902  161.412   
 1         1  12.868    12.720   6748.83  23645    6  3634.762  166.168   
 2         1  12.273    12.059   4266.72  23467    7  3604.808  168.894   
 3         1  11.837    11.571   3042.18  23540    7  3608.935  160.990   
 4         1  11.502    11.190   2336.03  23470    7  3595.649  152.729   
 ...     ...     ...       ...       ...    ...  ...       ...      ...   
 1205     55   3.360     1.774      3.42  22475    6  3580.582  143.706   
 1206     55   3.359     1.773      3.42  22518    6  3581.961  144.655   
 1207     55   3.358     1.772      3.42  22585    6  3586.192  145.957   
 1208     55   3.361     1.776      3.42  22264    6  3584.260  145.301   
 1209     55   3.362     1.777      3.43  22332    6  3586.843  145.540   
 
       num_updates        lr  gnorm  clip  oom  wall  train_wall  
 0              51  0.000010  2