In [None]:
import polars as pl

In [None]:
def process(f, col_starts, col_names, drop_cols, skip=0):
    '''
    Process data from a text file with fixed width columns.

    Every line of the file is read as one string and then cut into
    columns at the given start positions.

    @param f: path to document which is processed
    @param col_starts: list of 1-based character positions at which the
        columns start. Column i ends right before col_starts[i+1]; a column
        with no following start position runs to the end of the line.
        The list is not modified.
    @param col_names: list of column names, matched to col_starts by position
    @param drop_cols: list of column names removed from the result
    @param skip: number of lines to skip at the top of the file
    @return: returns a DataFrame containing the processed columns
    @raise IndexError: if col_names has more entries than col_starts
    '''

    # NOTE(review): read_csv still splits on its default separator, so this
    # relies on the data lines not containing one - confirm for new inputs.
    raw = pl.read_csv(
        f,
        has_header = False,
        skip_lines = skip,
        new_columns = ['full_str']
    )

    # Work on a copy so the caller's list is left untouched.
    starts = list(col_starts)

    if len(col_names) > len(starts):
        raise IndexError(
            'got %d column names but only %d start positions'
            % (len(col_names), len(starts))
        )

    def _width(i):
        # Width up to the next start position; None means "to the end of
        # the line", which keeps the final character of the last column.
        if i + 1 < len(starts):
            return starts[i + 1] - starts[i]
        return None

    return raw.with_columns(
        [
            # Start positions are 1-based, str.slice offsets are 0-based.
            pl.col('full_str').str.slice(starts[i] - 1, _width(i)).str.strip_chars().alias(col)
            for i, col in enumerate(col_names)
        ]
    ).drop('full_str').drop(drop_cols)

In [None]:
def fix_columns(df, col_names):
    '''
    Remove all spaces and '#' characters from string columns.
    @param df: DataFrame whose columns are cleaned
    @param col_names: list of names of the (string) columns to clean;
        None cleans every column
    @return: returns a DataFrame with the same columns in the same order,
        where the selected columns have been cleaned
    '''
    if col_names is None:
        targets = pl.all()
    elif len(col_names) == 0:
        # Nothing selected, nothing to clean.
        return df
    else:
        targets = pl.col(col_names)

    # replace_all removes every occurrence (str.replace stops after the
    # first one); literal=True keeps the patterns from being read as regex.
    return df.with_columns(
        targets
        .str.replace_all(' ', '', literal=True)
        .str.replace_all('#', '', literal=True)
    )

In [None]:
# Location of the fixed-width mass table that is parsed below.
path = 'https://amdc.impcas.ac.cn/masstables/Ame2020/mass_1.mas20'

# 1-based start position of every column, in the same order as col_names.
starts = [
    1, 3, 6, 11,
    16, 21, 24, 29,
    43, 56, 69, 80,
    83, 96, 107, 125,
]
col_names = [
    '1',
    'N-Z',
    'N',
    'Z',
    'Mass Number',
    'Atomic Symbol',
    'Odd',
    'Mass Excess',
    'ME error',
    'Binding Energy per Nucleon',
    'BE error',
    'Beta Type',
    'Beta-Decay Energy',
    'BD error',
    'Atomic Mass',
    'AM error',
]

# Columns that are parsed but not kept in the result.
drop_cols = ['1', 'N-Z']

# Number of lines skipped before parsing starts
# (presumably the file's header block - TODO confirm against the file).
skip = 36

df = process(path, starts, col_names, drop_cols, skip=skip)

In [None]:
# Names of all columns except 'N', 'Z', 'Atomic Symbol' and 'Odd',
# in the order they appear in the frame.
col_names = [
    name
    for name in df.columns
    if name not in ('N', 'Z', 'Atomic Symbol', 'Odd')
]

df = fix_columns(df, col_names)

In [None]:
# Target dtype for every column of the cleaned frame.
col_types = {
    'N': pl.Int16,
    'Z': pl.Int16,
    'Mass Number': pl.Int16,
    'Atomic Symbol': pl.String,
    'Odd': pl.String,
    'Mass Excess': pl.Float64,
    'ME error': pl.Float64,
    'Binding Energy per Nucleon': pl.Float64,
    'BE error': pl.Float64,
    'Beta Type': pl.String,
    'Beta-Decay Energy': pl.Float64,
    'BD error': pl.Float64,
    'Atomic Mass': pl.Float64,
    'AM error': pl.Float64,
}

# strict=False: the cast is asked not to raise on values it cannot convert.
final = df.cast(col_types, strict=False)

final.write_parquet('binding_energy.parquet')