In [42]:
import os
import pandas as pd
import subprocess
import re

In [32]:
# Project names, one per line. Several of them appear as folder names under
# the projects directory in the recorded output of the last cell.
project_names = [
    'bmcsoftware_vscode-ispw',
    'IBM_example-health-apis',
    'debinix_openjensen',
    'Martinfx_Cobol',
    'shamrice_COBOL-RSS-Reader',
    'shamrice_COBOL-Roguelike',
    'shamrice_COBOL-Guest-Book-Webapp',
    'ibmdbbdev_Samples',
    'brazilofmux_gnucobol',
    'cicsdev_cics-async-api-credit-card-application-example',
    'cicsdev_cics-async-api-redbooks',
    'walmartlabs_zECS',
    'cicsdev_cics-banking-sample-application-cbsa',
    'cicsdev_cics-genapp',
]

In [41]:
# Input and output folders both live one level above the working directory
# the notebook is started from.
_parent_of_cwd = (os.getcwd(), '..')
data_dir = os.path.join(*_parent_of_cwd, 'projects')
repo_clean_dir = os.path.join(*_parent_of_cwd, 'project_clean')
data_dir

'/home/16jl93/code_summary/replication-package/scripts/../projects'

### Get all `.cbl` files from all projects

In [None]:
# List every regular file named *.cbl below data_dir using the external
# `find` command; check=True raises CalledProcessError if `find` fails.
find_cmd = ["find", data_dir, "-type", "f", "-name", "*.cbl"]
result = subprocess.run(find_cmd, check=True, text=True, capture_output=True)
file_paths = result.stdout.splitlines()
file_paths

### Copy COBOL files from /projects, separate multi-program files, remove comments and empty lines, and write the result to /project_clean

In [43]:
def detect_number_length(s):
    """Return the number of digit characters at the very start of `s`.

    Returns 0 when `s` does not begin with a digit (including when it is empty).
    """
    leading_digits = re.match(r'^\d+', s)
    return len(leading_digits.group()) if leading_digits else 0

def all_start_with_detected_length_number(strings):
    """Check that digit-prefixed strings share a common minimum prefix width.

    The reference width is the digit-prefix length of the first string that
    starts with a digit. Returns False when no string starts with a digit.
    Otherwise returns True only if every string that starts with a digit
    starts with at least that many digits; strings without a digit prefix
    are not constrained.
    """
    # The first digit-prefixed string fixes the reference width (0 if none).
    width = next(
        (n for n in (detect_number_length(text) for text in strings) if n > 0),
        0,
    )
    if width == 0:
        return False

    any_digit_prefix = re.compile(r'^\d+')
    # Note: this matches prefixes of `width` digits *or more*.
    reference_prefix = re.compile(rf'^\d{{{width}}}')

    for text in strings:
        if any_digit_prefix.match(text) and not reference_prefix.match(text):
            return False
    return True


def replace_numeric_edges_with_space(s):
    """Blank out a run of digits at the start and at the end of `s`.

    Each digit is replaced by one space, so the length of the string and the
    position of everything else are unchanged. Strings starting with '*' are
    returned untouched. A trailing newline does not prevent the end-of-line
    digits from being matched, because `$` also matches just before it.
    """
    if s.startswith('*'):
        return s

    def blank_out(match):
        # Same number of characters, all spaces.
        return ' ' * len(match.group())

    without_leading = re.sub(r'^\d+', blank_out, s)
    return re.sub(r'\d+$', blank_out, without_leading)

def read_cobol_list(file_path):
    """Read a source file into a list of its non-blank lines.

    The file is decoded as UTF-8; if that fails it is re-read from the start
    as latin-1. latin-1 maps every byte to exactly one character, so the
    fallback cannot fail and it keeps column positions and line lengths
    intact (silently dropping undecodable bytes would shift them, and the
    fixed-width check below depends on line lengths).

    If all lines between the first and the last have the same length, or
    `all_start_with_detected_length_number` accepts them, leading and
    trailing digit runs are blanked on every line via
    `replace_numeric_edges_with_space` (presumably these are COBOL
    sequence/identification columns -- TODO confirm). For files with fewer
    than three lines the inspected slice is empty and the same-length check
    is vacuously true.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    list of str
        Lines (line endings kept) that contain at least one non-whitespace
        character.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = file.readlines()
    except UnicodeDecodeError:
        # Re-read the whole file so no partially decoded lines are kept.
        with open(file_path, 'r', encoding='latin-1') as file:
            data = file.readlines()

    interior = data[1:-1]
    # data[1] is only evaluated when `interior` is non-empty, so it exists.
    same_length = all(len(s) == len(data[1]) for s in interior)
    if same_length or all_start_with_detected_length_number(interior):
        data = [replace_numeric_edges_with_space(item) for item in data]

    return [item for item in data if item.strip()]

def read_cobol_no_comment_lst(file_path):
    """Read a file with `read_cobol_list` and strip comment content.

    Dropped entirely:
      * lines whose first non-whitespace character is '*';
      * lines that start with six spaces followed by 'D ' (presumably
        debugging lines -- TODO confirm).
    Any remaining line containing '*>' is cut just before the first '*>'
    and terminated with a newline.

    Returns the resulting list of lines.
    """
    kept = []
    for line in read_cobol_list(file_path):
        if line.strip().startswith('*') or line.startswith('      D '):
            continue
        code_part, marker, _ = line.partition('*>')
        # `marker` is empty when the line has no inline comment.
        kept.append(code_part + '\n' if marker else line)
    return kept

In [37]:
def extract_program(program_id, all_lines, programs_dict):
    """Return the lines of one program/function without the units nested in it.

    Parameters
    ----------
    program_id : str
        Key in `programs_dict` of the unit to extract.
    all_lines : list of str
        All lines of the file; the indices in `programs_dict` refer to it.
    programs_dict : dict
        Maps a unit id to ``{'start': index, 'end': index}``. 'end' is
        missing for a unit whose terminator was not found.

    Returns
    -------
    list of str or None
        The lines from the unit's first line through its 'end' line,
        excluding every line that belongs to a unit lying strictly inside
        it. A unit's first line is the line just before 'start' when that
        line contains 'identification division' (case-insensitive),
        otherwise the 'start' line itself. Returns None (after printing an
        error) when the requested unit has no 'end'.
    """
    bounds = programs_dict[program_id]
    if 'end' not in bounds:
        print(f'ERROR! Program {program_id} is not ended')
        return None
    start, end = bounds['start'], bounds['end']

    def first_index(unit_start):
        # Include the preceding header line when there is one. The
        # `unit_start > 0` guard keeps index -1 from wrapping to the last line.
        if unit_start > 0 and 'identification division' in all_lines[unit_start - 1].lower():
            return unit_start - 1
        return unit_start

    # Indices of lines owned by units strictly inside the requested one.
    nested_indices = set()
    for other in programs_dict.values():
        other_end = other.get('end')
        if other_end is None:
            # An unterminated unit has no known extent; it cannot be
            # classified as nested, so leave its lines in place.
            continue
        if other['start'] > start and other_end < end:
            nested_indices.update(range(first_index(other['start']), other_end + 1))

    begin = first_index(start)
    return [
        line
        for index, line in enumerate(all_lines[begin:end + 1], start=begin)
        if index not in nested_indices
    ]

def get_programs(lines):
    """Split a file's lines into its individual programs/functions.

    A line whose stripped, lower-cased text starts with 'program-id.' or
    'function-id.' opens a unit; one starting with 'end program' or
    'end function' closes it. The unit id is the last whitespace-separated
    token of the line with surrounding '.' and "'" characters removed.
    A closing line whose id was never opened is reported with a printed
    error and otherwise ignored.

    Returns None when at most one unit was found. Otherwise returns a list
    with one `extract_program` result per unit, in order of first appearance.
    """
    opening_prefixes = ('program-id.', 'function-id.')
    # Closing prefix -> word used in the error message.
    closing_prefixes = {'end program': 'program', 'end function': 'function'}

    units = {}
    for index, raw in enumerate(lines):
        head = raw.lower().strip()
        opens = head.startswith(opening_prefixes)
        closing_kind = next(
            (kind for prefix, kind in closing_prefixes.items() if head.startswith(prefix)),
            None,
        )
        if not opens and closing_kind is None:
            continue

        unit_id = raw.strip().split()[-1].strip('.').strip("'")
        if opens:
            # A repeated id replaces the earlier entry.
            units[unit_id] = {'start': index}
        elif unit_id in units:
            units[unit_id]['end'] = index
        else:
            print(f'ERROR! {closing_kind} {unit_id} start not found')

    if len(units) <= 1:
        return None
    return [extract_program(unit_id, lines, units) for unit_id in units]

In [None]:
# Write a cleaned copy of every source file under repo_clean_dir.
# Files holding several programs/functions are split into one file per unit.
multi_program_file = []  # source files that were split
seperated = []           # paths of the per-unit files that were written
for file_path in file_paths:
    file_lst = read_cobol_no_comment_lst(file_path)
    programs = get_programs(file_lst)
    # Output mirrors <parent folder>/<file name> of the source path.
    new_folder_dir = os.path.join(repo_clean_dir, file_path.split('/')[-2])
    new_file_dir = os.path.join(new_folder_dir, file_path.split('/')[-1])
    os.makedirs(new_folder_dir, exist_ok=True)

    # if more than 1 program in the file
    if programs:
        print(f"{file_path} has {len(programs)} programs ")
        multi_program_file.append(file_path)
        # Split off the extension once so only the suffix is rewritten,
        # not every '.cbl' that may occur elsewhere in the path.
        stem, extension = os.path.splitext(new_file_dir)
        for i, program in enumerate(programs):
            if program is None:
                # extract_program returns None for a unit without a
                # terminator (it already printed an error); nothing to write.
                continue
            program_dir = f'{stem}_program_{i}{extension}'
            with open(program_dir, 'w') as file:
                file.writelines(program)
            seperated.append(program_dir)
    else:
        with open(new_file_dir, 'w') as file:
            file.writelines(file_lst)

In [54]:
# Show the source files for which get_programs returned more than one unit.
multi_program_file

['/home/16jl93/code_summary/replication-package/scripts/../projects/shamrice_COBOL-RSS-Reader/string_helpers.cbl',
 '/home/16jl93/code_summary/replication-package/scripts/../projects/shamrice_COBOL-RSS-Reader/application_configurator.cbl',
 '/home/16jl93/code_summary/replication-package/scripts/../projects/shamrice_COBOL-Guest-Book-Webapp/web-helpers.cbl',
 '/home/16jl93/code_summary/replication-package/scripts/../projects/brazilofmux_gnucobol/clinkages.cbl',
 '/home/16jl93/code_summary/replication-package/scripts/../projects/brazilofmux_gnucobol/linear_to_fielded.cbl',
 '/home/16jl93/code_summary/replication-package/scripts/../projects/brazilofmux_gnucobol/holidays.cbl',
 '/home/16jl93/code_summary/replication-package/scripts/../projects/brazilofmux_gnucobol/fielded_to_linear.cbl',
 '/home/16jl93/code_summary/replication-package/scripts/../projects/ibmdbbdev_Samples/epscsmrd.cbl',
 '/home/16jl93/code_summary/replication-package/scripts/../projects/Martinfx_Cobol/Hello_SQLITE.cbl']