# Load txt file

In [1]:
import pandas as pd

In [2]:
filename = "hyp_optimisation.txt"

In [3]:
with open(filename) as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content] 

# Helper functions

In [4]:
def check_format(line):
    whitelist = ['Job id', 'Train Evaluation summary', 'root_mean', 'Reached prediction', 'Predict Evaluation summary', 
                'Start training iteration', 'num_tree', 'lrn_rate', 'max_dept', 'val_freq']
    for phrase in whitelist:
        if phrase in line:
            if 'Job is success!!' in line:
                return False
            return True
    return False

In [5]:
def filter_jod_id(line):
    for line in line.split(' '):
        if line.isdigit():
            return line
    raise Exception("Job id not found")

# Create clean content

In [6]:
cleaned_content = []
for line in content:
    if check_format(line):
        if 'Job id' in line:
            line = filter_jod_id(line)
        print(line)
        cleaned_content.append(line)   

Start training iteration 0
num_tree: 30
lrn_rate: 0.172
max_dept: 30
val_freq: 15
2021072411355058935654
Train Evaluation summary:
"root_mean_squared_error": 0.18990151808549874
"root_mean_squared_error": 17.78257939198001
Reached prediction
2021072412160053345157
Predict Evaluation summary:
"root_mean_squared_error": 31.992399700130317
Start training iteration 1
num_tree: 30
lrn_rate: 0.224
max_dept: 20
val_freq: 15
2021072412164698702458
Train Evaluation summary:
"root_mean_squared_error": 0.030257388542796852
"root_mean_squared_error": 16.342188673895766
Reached prediction
2021072412481482651361
Predict Evaluation summary:
"root_mean_squared_error": 31.502618917043
Start training iteration 2
num_tree: 40
lrn_rate: 0.274
max_dept: 45
val_freq: 10
2021072412485911012662
Train Evaluation summary:
"root_mean_squared_error": 0.014188182367871645
"root_mean_squared_error": 17.586666063809698
Reached prediction
2021072413234569364265
Predict Evaluation summary:
"root_mean_squared_error": 3

# Convert to df

In [7]:
header = ['num_tree', 'lrn_rate', 'max_dept', 'val_freq', 'train_job_id', 
          'train_rmse', 'validate_rmse', 'pred_job_id', 'pred_rmse']

In [8]:
rows = []

In [9]:
temp = []
for line in cleaned_content:
    if 'Start training' in line:
        if not temp:
            continue
        rows.append(temp)
        temp = []
        continue
    if 'Evaluation' in line:
        continue
    if 'prediction' in line:
        continue
    if line.isdigit():  # if line is job id
        temp.append(line)
    else:
        number = float(line.split(' ')[1])
        temp.append(f'{number:.2f}')

In [10]:
df = pd.DataFrame(rows, columns=header)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   num_tree       49 non-null     object
 1   lrn_rate       49 non-null     object
 2   max_dept       49 non-null     object
 3   val_freq       49 non-null     object
 4   train_job_id   49 non-null     object
 5   train_rmse     49 non-null     object
 6   validate_rmse  49 non-null     object
 7   pred_job_id    49 non-null     object
 8   pred_rmse      49 non-null     object
dtypes: object(9)
memory usage: 3.6+ KB


In [11]:
arranged_header = ['train_job_id', 'pred_job_id', 'num_tree', 'lrn_rate', 'max_dept', 'val_freq', 
                   'train_rmse', 'validate_rmse', 'pred_rmse']

In [12]:
df = df[arranged_header]
df.sort_values(['pred_rmse'], ascending=[True], inplace=True)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,train_job_id,pred_job_id,num_tree,lrn_rate,max_dept,val_freq,train_rmse,validate_rmse,pred_rmse
0,2021072707323824568292,2021072707414923544895,30.0,0.11,5.0,20.0,8.34,15.1,28.77
1,2021072415522346842382,2021072416193919513285,40.0,0.07,10.0,15.0,3.26,15.53,29.88
2,2021072706573057027288,2021072707315357888291,30.0,0.06,30.0,25.0,7.71,18.65,30.41
3,202107271312505586985,202107271333541606248,35.0,0.06,10.0,25.0,5.79,18.03,30.53
4,2021072620384476891656,2021072621010220243259,30.0,0.16,10.0,15.0,0.43,16.62,30.76
5,202107271302153259391,202107271312212010264,35.0,0.16,5.0,20.0,4.85,16.88,30.82
6,2021072705322054528680,2021072706191949888183,35.0,0.16,40.0,25.0,0.12,17.99,31.0
7,2021072413242462433666,2021072414040294065469,40.0,0.23,35.0,15.0,0.01,17.45,31.07
8,2021072615145211970716,2021072615243810170819,30.0,0.06,5.0,15.0,13.44,17.33,31.39
9,2021072703103958935068,2021072703550050910371,35.0,0.06,40.0,10.0,5.65,20.81,31.44
