In [None]:
import os
import pandas as pd
import subprocess
import re

In [None]:
# Repositories that make up the corpus, one "<owner>_<repository>" entry per line.
project_names = [
    'bmcsoftware_vscode-ispw',
    'IBM_example-health-apis',
    'debinix_openjensen',
    'Martinfx_Cobol',
    'shamrice_COBOL-RSS-Reader',
    'shamrice_COBOL-Roguelike',
    'shamrice_COBOL-Guest-Book-Webapp',
    'ibmdbbdev_Samples',
    'brazilofmux_gnucobol',
    'cicsdev_cics-async-api-credit-card-application-example',
    'cicsdev_cics-async-api-redbooks',
    'walmartlabs_zECS',
    'cicsdev_cics-banking-sample-application-cbsa',
    'cicsdev_cics-genapp',
]

In [None]:
# Raw repositories live in ../data/projects; the cleaned copies produced by this
# notebook are written to ../data/project_clean (both relative to the working directory).
data_dir = os.path.join(os.getcwd(), os.pardir, 'data', 'projects')
repo_clean_dir = os.path.join(os.getcwd(), os.pardir, 'data', 'project_clean')
data_dir

In [None]:
# Load the file-level reference dataset; the first CSV column is used as the index.
# Later cells read its 'file path' column.
file_reference_df = pd.read_csv(os.path.join('..','data','file_level_reference_dataset.csv'), index_col=0)
file_reference_df

# Preprocessing

### get all cbl files from all projects

In [None]:
# List every regular file named *.cbl below data_dir with the external `find`
# command (check=True raises if the command fails); one path per output line.
result = subprocess.run(["find", data_dir, "-type", "f", "-name", "*.cbl"], capture_output=True, text=True, check=True)
file_paths = result.stdout.splitlines()

### copy COBOL files from /projects, separate multi-program files, remove comments and empty lines, and write the result to /project_clean

In [None]:
def detect_number_length(s):
    """Return how many digits ``s`` starts with (0 when it does not start with a digit)."""
    leading_digits = re.match(r'^\d+', s)
    # The match is anchored at position 0, so its end offset equals its length.
    return leading_digits.end() if leading_digits else 0

def all_start_with_detected_length_number(strings):
    """Tell whether every digit-prefixed string uses the same prefix width.

    The width is taken from the first string that starts with a digit.
    Strings that do not start with a digit are ignored by the check.
    Returns False when no string starts with a digit at all.
    """
    # Width of the first numeric prefix found, or 0 if there is none.
    number_length = next(
        (width for width in map(detect_number_length, strings) if width > 0), 0
    )
    if number_length == 0:
        return False

    # Matches a run of at least `number_length` digits at the start of the string.
    pattern = re.compile(rf'^\d{{{number_length}}}')

    for text in strings:
        if re.match(r'^\d+', text) and not pattern.match(text):
            # Starts with digits, but fewer than the detected width.
            return False
    return True


def replace_numeric_edges_with_space(s):
    """Blank out a digit run at the start and at the end of a line.

    Each digit is replaced by one space, so the length of the line and the
    column of every other character stay the same. Lines starting with ``*``
    are returned untouched.
    """
    if s.startswith('*'):
        return s

    def as_blanks(found):
        return ' ' * len(found.group())

    # Leading run first, then the trailing run ($ also matches just before a final newline).
    for edge in (r'^\d+', r'\d+$'):
        s = re.sub(edge, as_blanks, s)
    return s

def read_cobol_list(file_path):
    """Read a COBOL file into a list of lines, dropping blank lines.

    When the file looks like it carries sequence numbers, the digit runs at
    both edges of every line are replaced by spaces. That is the case when
    all lines between the first and the last have the same length, or when
    every digit-prefixed line uses the same prefix width.

    Args:
        file_path: path of the file to read.

    Returns:
        The non-blank lines of the file, each still ending with its newline.
    """
    # errors='ignore' silently drops undecodable bytes, so opening/reading can
    # never raise UnicodeDecodeError; the former latin-1 fallback branch was
    # unreachable and has been removed.
    with open(file_path, 'r', errors='ignore') as file:
        data = file.readlines()

    # The first and last line are excluded from the sequence-number detection.
    # For files with fewer than three lines `body` is empty and the first
    # check is vacuously true (data[1] is then never evaluated).
    body = data[1:-1]
    same_length = all(len(s) == len(data[1]) for s in body)
    if same_length or all_start_with_detected_length_number(body):
        data = [replace_numeric_edges_with_space(item) for item in data]
    return [item for item in data if item.strip() != '']

def read_cobol_no_comment_lst(file_path):
    """Return the lines of a COBOL file with comments removed.

    Dropped: lines whose first non-blank character is ``*`` and lines that
    start with six spaces followed by ``D `` . Anything from an inline ``*>``
    marker to the end of a line is cut off.
    """
    cleaned = []
    for raw in read_cobol_list(file_path):
        if raw.strip().startswith('*') or raw.startswith('      D '):
            continue
        marker = raw.find('*>')
        cleaned.append(raw if marker == -1 else raw[:marker] + '\n')
    return cleaned

In [None]:
def extract_program(program_id, all_lines, programs_dict):
    """Return the lines of one program/function with nested programs removed.

    Args:
        program_id: key of the program to extract from ``programs_dict``.
        all_lines: list of source lines the indices in ``programs_dict`` refer to.
        programs_dict: ``{id: {'start': idx, 'end': idx}}``; ``'start'`` is the
            index of the PROGRAM-ID/FUNCTION-ID line and ``'end'`` the index of
            the matching END line. ``'end'`` is missing for unterminated entries.

    Returns:
        The list of lines belonging to ``program_id`` (including a directly
        preceding IDENTIFICATION DIVISION line), or ``None`` when the program
        has no recorded end.
    """
    bounds = programs_dict[program_id]
    if 'end' not in bounds:
        print(f'ERROR! Program {program_id} is not ended')
        return None
    start, end = bounds['start'], bounds['end']
    lines = list(all_lines)  # work on a copy; nested programs are blanked out below

    def first_line_of(header_idx):
        # A program begins one line earlier when that line is its IDENTIFICATION
        # DIVISION header. The `> 0` guard stops index -1 from wrapping around
        # to the last line of the file.
        if header_idx > 0 and 'identification division' in lines[header_idx - 1].lower():
            return header_idx - 1
        return header_idx

    for other in programs_dict.values():
        other_end = other.get('end')
        if other_end is None:
            # Unterminated program: its extent is unknown, so it cannot be
            # treated as nested (previously this raised KeyError).
            continue
        if other['start'] > start and other_end < end:
            first = first_line_of(other['start'])
            # Replace the nested program by ' ' placeholders so indices stay valid.
            lines[first:other_end + 1] = [' '] * (other_end + 1 - first)

    extracted = lines[first_line_of(start):end + 1]
    return [item for item in extracted if item != ' ']

def get_programs(lines):
    """Split a source file into its programs/functions.

    Scans for PROGRAM-ID / FUNCTION-ID lines and their END PROGRAM /
    END FUNCTION counterparts. The identifier is the last word of the line
    with surrounding dots and single quotes removed.

    Returns:
        ``None`` when at most one program/function is declared; otherwise a
        list with one ``extract_program`` result per declared identifier.
    """
    # (opening marker, closing marker, word used in the error message)
    unit_kinds = (
        ('program-id.', 'end program', 'program'),
        ('function-id.', 'end function', 'function'),
    )
    programs = {}
    for line_no, raw in enumerate(lines):
        text = raw.strip()
        lowered = text.lower()
        for opener, closer, label in unit_kinds:
            opens = lowered.startswith(opener)
            if not opens and not lowered.startswith(closer):
                continue
            program_id = text.split()[-1].strip('.').strip("'")
            if opens:
                programs[program_id] = {'start': line_no}
            elif program_id in programs:
                programs[program_id]['end'] = line_no
            else:
                print(f'ERROR! {label} {program_id} start not found')
            break

    if len(programs) > 1:
        return [extract_program(idx, lines, programs) for idx in programs]
    return None

In [None]:
# Fix the indentation issue of this repo, otherwise the read-cobol functions cannot work:
# every line of every file is shifted right by 7 spaces.
# The files are rewritten in place, so the cell must be safe to re-run: a file whose
# non-blank lines all already start with 7 spaces is treated as already shifted and is
# left alone (previously each re-run added 7 more spaces).
gnucobol_dir = '../data/projects/brazilofmux_gnucobol'
gnucobol_indent = ' ' * 7
for entry in os.listdir(gnucobol_dir):
    source_path = os.path.join(gnucobol_dir, entry)
    if not os.path.isfile(source_path):
        continue  # only regular files can be rewritten
    with open(source_path, "r") as file:
        lines = file.readlines()
    if all(line.startswith(gnucobol_indent) for line in lines if line.strip()):
        continue
    with open(source_path, 'w') as newfile:
        newfile.writelines(gnucobol_indent + line for line in lines)
    

In [None]:
# For every source file: strip comments, then either write it unchanged to the
# mirrored location under project_clean, or - when it holds several programs -
# write each program to its own "<name>_program_<i>.cbl" file.
# multi_program_file: original paths of the files that were split
# seperated: paths of the per-program files that were written
multi_program_file = []
seperated = []
for file_path in file_paths:
    file_lst = read_cobol_no_comment_lst(file_path)
    programs = get_programs(file_lst)
    new_folder_dir = os.path.relpath(os.path.dirname(file_path.replace('projects','project_clean')))
    new_file_dir = os.path.join(new_folder_dir, os.path.basename(file_path))
    os.makedirs(new_folder_dir, exist_ok=True)

    # if more than 1 program in the file
    if programs:
        print(f"{file_path} has {len(programs)} programs ")
        multi_program_file.append(file_path)
        for i, program in enumerate(programs):
            if program is None:
                # extract_program returns None (and reports it) for a program
                # without an END marker; there is nothing to write for it.
                continue
            program_dir = new_file_dir.replace('.cbl',f'_program_{i}.cbl')
            with open(program_dir, 'w') as file:
                file.writelines(program)
            seperated.append(program_dir)
    else:
        with open(new_file_dir, 'w') as file:
            file.writelines(file_lst)

## get lists of files with and without reference data

In [None]:
# the cobol files with reference data: the paths listed in the file-level
# reference dataset, pointed at their cleaned copy under /project_clean/
cbl_files_in_dataset = file_reference_df['file path'].apply(lambda x:x.replace('/projects/','/project_clean/')).to_list()

# the cobol files without reference data (i.e., with no proper head comments):
# every .cbl file under project_clean that is not listed above
result = subprocess.run(["find", repo_clean_dir, "-type", "f", "-name", "*.cbl"], capture_output=True, text=True, check=True)
referenced_paths = set(cbl_files_in_dataset)  # set lookup instead of scanning the list per file
cbl_files_not_in_dataset = [
    clean_path
    for clean_path in map(os.path.relpath, result.stdout.splitlines())
    if clean_path not in referenced_paths
]

## get copybooks for each file and insert back to data division

In [None]:
def read_cobol_no_comment_str(file_path):
    """Return the content of a COBOL file as one string with comments removed.

    Uses the same filtering as ``read_cobol_no_comment_lst`` (comment lines,
    ``D`` lines and inline ``*>`` comments are dropped) and joins the
    remaining lines, which still carry their newlines.
    """
    # Delegate instead of repeating the filtering logic a third time.
    return ''.join(read_cobol_no_comment_lst(file_path))

def read_cobol_no_comment_lst(file_path):
    """Return the comment-free lines of a COBOL file as a list.

    NOTE(review): a function with this name and the same behavior is already
    defined in an earlier cell; this definition replaces it when the cell runs.
    """
    kept = [
        text for text in read_cobol_list(file_path)
        if not (text.strip().startswith('*') or text.startswith('      D '))
    ]
    # Cut inline comments: keep only what precedes the first '*>' marker.
    return [
        text.split('*>', 1)[0] + '\n' if '*>' in text else text
        for text in kept
    ]
    

In [None]:
def get_copy(filename, file_path):
    """Look up a copybook referenced by a COBOL file and return its content.

    Args:
        filename: name taken from the COPY statement, with or without a
            ``.cpy`` / ``.cbl`` extension.
        file_path: path of the COBOL file that contains the COPY statement.

    Returns:
        The comment-free text of the copybook, or ``None`` when no matching
        file exists.
    """
    # replace() is a no-op when the marker is absent, so no membership test is needed.
    path = file_path.replace('processed_repos', 'cobol_repos')

    if 'cics-banking-sample-application-cbsa' in path:
        # This repository keeps copybooks and sources in fixed directories.
        # NOTE(review): the directory name used here has no 'cicsdev_' prefix,
        # unlike the entry in project_names - confirm it matches the data layout.
        base_dir = os.path.join('../data/projects', 'cics-banking-sample-application-cbsa/src/base')
        copybook_path = base_dir + '/cobol_copy'
        code_path = base_dir + '/cobol_src'
    else:
        # Otherwise look next to the referencing file (its path minus the last component).
        sibling_dir = path.replace('/' + path.split('/')[-1], '')
        copybook_path = code_path = sibling_dir

    # (directory, file name, whether the 'cbl' notice is printed) in lookup order
    declared = filename.strip()
    if declared.endswith('.cpy'):
        candidates = [(copybook_path, filename, False)]
    elif declared.endswith('.cbl'):
        candidates = [(code_path, filename, True)]
    else:
        candidates = [
            (copybook_path, filename + '.cpy', False),
            (code_path, filename + '.cbl', True),
        ]

    for folder, candidate_name, is_source in candidates:
        candidate = os.path.join(folder, candidate_name)
        if not os.path.exists(candidate):
            continue
        if is_source:
            print('cbl')
        return read_cobol_no_comment_str(candidate)
    return None

def is_contain_copy(copybook):
    """Return True when any line of the text starts (ignoring case and indentation) with 'copy'."""
    return any(
        row.strip().lower().startswith('copy')
        for row in copybook.split('\n')
    )

In [None]:
# The following code finds the copybooks referenced by COPY statements in a cobol file
# and replaces each COPY line with the content of the copybook, so that the data division
# of the cobol file contains the complete variable information.
#
# A few copybooks cannot be found in the repository.
# The data included in those copybooks is unavailable, so the COPY line is simply removed.
# not_found: the names of the copybooks not found
# not_found_original: the paths of the files that reference a copybook not found
# new_dataset: the content of each file with the copybook content inserted
not_found, not_found_original = [], []
new_dataset = []
for i, file_path in enumerate(file_paths):
    file = read_cobol_no_comment_lst(file_path)
    for idx, line in enumerate(file):
        if line.strip().lower().startswith('copy'):
            # Default: the copybook name is the last word of the COPY line.
            filename = line.strip().split()[-1]
            filename = filename.strip().strip('.').strip('"').strip("'").strip('.')
            if 'replacing' in line.lower():
                # "COPY <name> REPLACING ...": the name is the word right after COPY.
                filename = line.strip().split()[1]
                filename = filename.strip().strip('.').strip('"').strip("'").strip('.')
            if '/' in filename:
                filename = filename.split('/')[-1]
            copy_content = get_copy(filename, file_path)
            if not copy_content:
                # Retry with the lower-cased name.
                copy_content = get_copy(filename.lower(), file_path)
            if copy_content:
                if 'replacing' in line.lower():
                    try:
                        words = line.lower().strip().split()
                        to_be_replace = line.strip().split()[words.index('replacing')+1].strip('.').strip('"').strip("'")
                        replace_with = line.strip().split()[words.index('by')+1].strip('.').strip('"').strip("'")
                        copy_content = copy_content.replace(to_be_replace, replace_with )
                    except (ValueError, IndexError):
                        # ValueError: 'replacing'/'by' is not a separate word on this line;
                        # IndexError: no word follows it. Other errors are no longer swallowed.
                        # only one exception found here, so i hard code it
                        copy_content = copy_content.replace('NUMBER-OF-ACCOUNTS.', 'NUMBER-OF-ACCOUNTS')
                        if not 'COPY INQACCCU REPLACING ==NUMBER-OF-ACCOUNTS.==' in line:
                            print(f'Error in replacing copybook keywords! Cannot find copybook <{filename}> for {file_path}')
                file[idx] = copy_content +'\n' if not copy_content.endswith('\n') else copy_content
            else:
                print(f'{i} Cannot find copybook <{filename}> for {file_path}')
                print('------------------------------------------------------------------')
                not_found.append(filename)
                not_found_original.append(file_path)
                file[idx] = ''
    new_dataset.append(''.join(file))
            

In [None]:
# Distinct names of the copybooks that could not be found.
# NOTE(review): presumably most of them are missing because the name is held in a
# variable or the reference points to another copybook - confirm against the output.
set(not_found)

In [None]:
# Write the copybook-expanded sources to the project_clean dir.
for file_path, expanded_source in zip(file_paths, new_dataset):
    # Files with multiple programs are not written here: a separate file per
    # program has already been prepared for them in project_clean.
    if file_path not in multi_program_file:
        path = os.path.relpath(file_path.replace('projects','project_clean'))
        with open(path, 'w') as file:
            file.write(expanded_source)

In [None]:
# This cell checks which files still contain an unresolved reference to a copybook.
# Output should be 3 files in ibmdbbdev_Samples repo, which we were not able to find
def read_cobol_str(file_path):
    """Return the non-blank lines of a COBOL file joined with newline characters."""
    return '\n'.join(read_cobol_list(file_path))

# Print the cleaned path of every single-program file that still has a COPY line.
for file_path in file_paths:
    if file_path not in multi_program_file:
        path = os.path.relpath(file_path.replace('projects','project_clean'))
        file = read_cobol_str(path)
        if is_contain_copy(file):
            print(path)

## Fix indentation

In [None]:
# read all files that are in the reference dataset
file_reference_dataset = [read_cobol_list(path) for path in cbl_files_in_dataset]

# Fix the files that have indentation errors:
#   - lines indented by 8 spaces are reset to 7
#   - in the bmcsoftware_vscode-ispw repo, lines indented by 9 spaces are reset to 7
#     unless their text starts with '2 ' or '0'
#   - "<name> SECTION." lines after the procedure division are reset to 7
for file_idx, file_lst in enumerate(file_reference_dataset):
    data_idx = 0
    procedure_idx = 0
    for i, line in enumerate(file_lst):
        lowered = line.lower()
        if 'data division' in lowered:
            data_idx = i
        elif 'procedure division' in lowered:
            procedure_idx = i

    is_ispw_repo = 'bmcsoftware_vscode-ispw' in cbl_files_in_dataset[file_idx]
    for i, line in enumerate(file_lst):
        text = line.strip()
        indent = len(line) - len(line.lstrip())
        needs_reset = (
            indent == 8
            or (is_ispw_repo and indent == 9
                and not text.startswith('2 ') and not text.startswith('0'))
            or (i > procedure_idx
                and text.lower().endswith('section.') and len(text.split()) == 2)
        )
        if needs_reset:
            file_lst[i] = ' '*7 + line.lstrip()

# Overwrite each file with its corrected lines.
for path, fixed_lines in zip(cbl_files_in_dataset, file_reference_dataset):
    with open(path, 'w') as newfile:
        newfile.writelines(fixed_lines)

In [None]:
# read all files that are not in the reference dataset
not_file_reference_dataset = list(map(read_cobol_list, cbl_files_not_in_dataset))

# Apply the same indentation repair as for the reference files: 8-space lines,
# selected 9-space lines of the bmcsoftware_vscode-ispw repo and section headers
# after the procedure division are moved to an indentation of 7.
for file_idx in range(len(not_file_reference_dataset)):
    file_lst = not_file_reference_dataset[file_idx]
    data_idx = 0
    procedure_idx = 0
    for i, line in enumerate(file_lst):
        if 'data division' in line.lower():
            data_idx = i
            continue
        if 'procedure division' in line.lower():
            procedure_idx = i

    from_ispw = 'bmcsoftware_vscode-ispw' in cbl_files_not_in_dataset[file_idx]
    for i, line in enumerate(file_lst):
        body = line.lstrip()
        words = line.strip()
        leading = len(line) - len(body)
        if leading == 8:
            reset = True
        elif from_ispw and leading == 9 and not (words.startswith('2 ') or words.startswith('0')):
            reset = True
        else:
            reset = i > procedure_idx and words.lower().endswith('section.') and len(words.split()) == 2
        if reset:
            file_lst[i] = ' '*7 + body

# Write the corrected lines back to the same files.
for i in range(len(cbl_files_not_in_dataset)):
    path = cbl_files_not_in_dataset[i]
    with open(path, 'w') as newfile:
        newfile.writelines(not_file_reference_dataset[i])

# Extract function-level artifact

## Utilities

In [None]:
def get_data_division(file_lst):
    """Return the lines between the DATA DIVISION and PROCEDURE DIVISION headers.

    The last line containing 'data division' and the last line containing
    'procedure division' (case-insensitive) delimit the slice; both header
    lines are excluded. An index stays 0 when its header is absent.
    """
    data_idx = procedure_idx = 0
    for position, text in enumerate(file_lst):
        lowered = text.lower()
        if 'data division' in lowered:
            data_idx = position
            continue
        if 'procedure division' in lowered:
            procedure_idx = position
    return file_lst[data_idx + 1:procedure_idx]

## extract the data division info for the variables used in a piece of code
def has_parent(parent_dict, num_indent):
    """Tell whether any recorded entry is indented less than ``num_indent``.

    Args:
        parent_dict: mapping whose keys are indentation widths.
        num_indent: indentation width of the entry being examined.

    Returns:
        True if at least one key is smaller than ``num_indent``, else False
        (previously the function fell through and returned None).
    """
    return any(indent < num_indent for indent in parent_dict)
        
def get_parents(parent_dict, num_indent):
    """Return the values of all entries whose key (indentation) is below ``num_indent``, in dict order."""
    return [parent_dict[indent] for indent in parent_dict if indent < num_indent]

def get_variable_in_code(data_div, code):
    """Collect the data-division entries whose variable name occurs in ``code``.

    Args:
        data_div: list of data-division lines (e.g. from ``get_data_division``).
        code: source text of the code snippet to search.

    Returns:
        A newline-joined string of the selected lines, in their original
        order: every entry whose name appears (case-insensitive substring
        match) in ``code``, plus the entries recorded at a smaller indentation.
    """
    variables = data_div[:]
    variables = [item for item in variables if not item.strip()=='']
    line_nums = []
    # entry_end: the previous entry was terminated by '.', so the current line may
    #            start a new entry
    # isincode:  the entry currently being continued was selected
    entry_end, isincode = True, False
    parents = {} #<num ident>:<line number>
    for i, line in enumerate(variables):
        if entry_end: # if previous entry ended  
            # An entry line starts with a digit and has at least two words.
            if line.strip()[0].isdigit() and len(line.strip().split()) > 1:
                num_indent = len(line) - len(line.lstrip())
                # An entry at indentation <= 8 resets the recorded parents.
                if num_indent <= 8:
                    parents = {}
                parents[num_indent] = i
                # The second word of the entry is taken as the variable name.
                var = line.strip().strip('.').split()[1]
                if var.lower() in code.lower():
                    if has_parent(parents, num_indent):
                        parent_nodes = get_parents(parents, num_indent)
                        for parent_node in parent_nodes:
                            parent_line_idx = parent_node
                            if variables[parent_line_idx].strip().endswith('.'):
                                line_nums.append(parent_line_idx)
                            else:
                                # Parent entry spans several lines: take every line
                                # up to and including the one that ends with '.'.
                                parent_entry_count = 0
                                while(True):
                                    line_nums.append(parent_line_idx+parent_entry_count)
                                    if variables[parent_line_idx+parent_entry_count].strip().endswith('.'):
                                        break
                                    parent_entry_count += 1     
                    line_nums.append(i)
                    isincode = True
                    if line.strip().endswith('.'): #if this entry ends in current line
                        isincode = False 
                        entry_end = True
                    else:
                        entry_end = False
        else:
            # Continuation line of a selected multi-line entry.
            if isincode:
                line_nums.append(i)
                if line.strip().endswith('.'):
                    entry_end = True
                    isincode = False

    # Remove duplicates and restore source order.
    line_nums = list(sorted(set(line_nums)))
    variable_info = '\n'.join([variables[item] for item in line_nums])
    return variable_info

In [None]:
## file used in these functions must have comments removed
def get_called_paragraphs(code):
    """Return the names of the paragraphs/sections a piece of code transfers control to.

    Recognised forms, one statement per line:
      * ``PERFORM <name>`` (exactly two words)
      * ``PERFORM <first> THRU|THROUGH <last>`` (both names are returned)
      * ``GO TO <name>`` (exactly three words)
      * a bare ``GO TO`` line followed by one name per line, up to a line
        that starts with ``DEPENDING`` and contains ``DEPENDING ON``

    Args:
        code: source text with comments already removed.

    Returns:
        List of distinct names (trailing '.' removed) in order of first
        appearance.
    """
    called_paragraphs = []
    multiple_goto = False
    for line in code.split('\n'):
        stripped = line.strip()
        lowered = stripped.lower()
        if lowered.startswith('perform'):
            words = line.split()
            lower_words = lowered.split()  # same positions as `words`
            if len(words) == 2:
                called_paragraphs.append(words[-1].strip().strip('.'))
            else:
                # Match THRU/THROUGH as whole words. A substring test also fires
                # on names such as CALC-THRUPUT and then made .index() raise.
                range_keyword = next(
                    (kw for kw in ('thru', 'through') if kw in lower_words), None
                )
                if range_keyword is not None and 'perform' in lower_words:
                    for marker in ('perform', range_keyword):
                        target_idx = lower_words.index(marker) + 1
                        if target_idx < len(words):  # keyword may be the last word
                            called_paragraphs.append(words[target_idx].strip().strip('.'))
        elif lowered.startswith('go to'):
            words = line.split()
            if len(words) == 3:
                called_paragraphs.append(words[-1].strip().strip('.'))
            elif lowered == 'go to':
                # Targets follow on the next lines (GO TO ... DEPENDING ON).
                multiple_goto = True
        elif multiple_goto:
            if 'depending on' in lowered:
                if lowered.startswith('depending'):
                    multiple_goto = False
            elif len(stripped.split()) == 1:
                called_paragraphs.append(stripped.strip('.'))

    # dict.fromkeys removes duplicates while keeping a reproducible order
    # (a set would order the names differently from run to run).
    return list(dict.fromkeys(called_paragraphs))

def get_section_code_with_name(pname, file):
    """Return the source of section ``pname``, or None when there is no such section.

    Args:
        pname: section name (compared case-insensitively).
        file: the file content as one string, comments removed.

    Returns:
        The "<pname> SECTION." line and all following lines up to (not
        including) the next line ending in 'section.', joined with newlines.
    """
    file_lst = file.split('\n')
    wanted_header = pname.lower() + ' section.'
    start = next(
        (pos for pos, text in enumerate(file_lst) if text.strip().lower() == wanted_header),
        None,
    )
    if start is None:
        return None

    collected = [file_lst[start]]
    for text in file_lst[start + 1:]:
        if text.strip().lower().endswith('section.'):
            break
        collected.append(text)
    return '\n'.join(collected)


def get_paragraph_code_with_name(pname, file):
    """Return the source of paragraph ``pname``, or None when there is no such paragraph.

    Args:
        pname: paragraph name (compared case-insensitively).
        file: the file content as one string, comments removed.

    Returns:
        The "<pname>." line followed by every subsequent line that is indented
        deeper than it, joined with newlines.
    """
    file_lst = file.split('\n')
    wanted_label = pname.lower() + '.'
    for start, header in enumerate(file_lst):
        if header.strip().lower() == wanted_label:
            break
    else:
        return None

    header_indent = len(header) - len(header.lstrip())
    stop = start + 1
    # The paragraph ends at the first line that is not indented deeper than its label.
    while stop < len(file_lst):
        candidate = file_lst[stop]
        if len(candidate) - len(candidate.lstrip()) <= header_indent:
            break
        stop += 1
    return '\n'.join(file_lst[start:stop])

def get_call_hierarchy(code, file, _active=None):
    """Build the tree of paragraphs/sections reachable from ``code``.

    Args:
        code: source text whose PERFORM / GO TO targets are followed.
        file: the whole file as one string with comments removed; targets
            are looked up in it.
        _active: internal - lower-cased names currently being expanded on
            the path from the root. Callers should not pass it.

    Returns:
        Dict mapping each called name to
          * a nested dict of the same shape when the target calls others,
          * ``None`` when the target calls nothing,
          * ``'NOT FOUND'`` when the target is missing or ambiguous,
          * ``'RECURSIVE'`` when the target is already being expanded, i.e.
            the call closes a cycle.
    """
    active = set() if _active is None else _active
    hierarchy = {}
    for pname in get_called_paragraphs(code):
        section = get_section_code_with_name(pname, file)
        paragraph = get_paragraph_code_with_name(pname, file)
        if section and paragraph:
            print(f'ERROR! {pname} is both a section and a paragraph')
            hierarchy[pname]  ='NOT FOUND'
        elif section or paragraph:
            key = pname.lower()  # lookups by name are case-insensitive
            if key in active:
                # Cyclic PERFORM / GO TO: stop here instead of recursing until
                # RecursionError is raised.
                hierarchy[pname] = 'RECURSIVE'
                continue
            new_code = section if section else paragraph
            if not get_called_paragraphs(new_code):
                hierarchy[pname] = None
            else:
                active.add(key)
                try:
                    hierarchy[pname] = get_call_hierarchy(new_code, file, active)
                finally:
                    active.discard(key)
        else:
            print(f'ERROR! {pname} is not found in file')
            hierarchy[pname]  ='NOT FOUND'
    return hierarchy
        
def print_call_hierarchy(d, indent=0):
    """Print a call-hierarchy dict as an indented tree (four spaces per level).

    Each key is printed at the current level. A dict value is printed
    recursively one level deeper; for any other value a line consisting only
    of the next level's indentation is printed.
    """
    prefix = '    ' * indent
    for name in d:
        print(prefix + str(name))
        children = d[name]
        if not isinstance(children, dict):
            print('    ' * (indent + 1))
            continue
        print_call_hierarchy(children, indent + 1)

In [None]:
## get the code for all functions (i.e., section or paragraph in a procedure division)
## return: section_dict    -> {<section name>: <section code>, ...}
##         paragraph_dict  -> {<paragraph name>: {'section': <section name>/None, 
##                                                'code': <paragraph code>}, 
##                             ...}
def get_functions(program_lst):
    """Split the procedure division of one program into sections and paragraphs.

    Args:
        program_lst: list of source lines of a single program.

    Returns:
        ``(section_dict, paragraph_dict)`` as described above, or ``None``
        (after printing an error) when the program has no line, or more than
        one line, containing 'procedure division'.
        NOTE(review): callers that unpack the result directly will fail on the
        ``None`` case.
    """
    procedure_start = [i for i,line in enumerate(program_lst) if 'procedure division' in line.lower()]
    if len(procedure_start) > 1:
        print('ERROR! More than 1 procedure division found in program.')
        return None
    elif len(procedure_start) <= 0:
        print('ERROR! No procedure division found in program.')
        return None

    procedure_start = procedure_start[0]
    # Section and paragraph headers are expected at the indentation of the
    # PROCEDURE DIVISION line itself.
    standard_indent =  len(program_lst[procedure_start])-len(program_lst[procedure_start].lstrip())
    sections, paragraphs = [], []
    # Currently open section/paragraph: {'name': ..., 'start': ..., 'end': ...}.
    # All indices are relative to the first line after PROCEDURE DIVISION.
    current_section, current_paragraph = {}, {}
    # True while inside a USING clause that spans several lines.
    start_using = False
    for i, line in enumerate(program_lst[procedure_start+1:]):
        if len(line)-len(line.lstrip()) < standard_indent:
            print(f'ERROR! Anomaly indentation detected in line {i+procedure_start+1} in program:\n{line}')
        else:
            if len(line)-len(line.lstrip()) == standard_indent:
                # Section header: exactly two words, ending in 'section.'.
                if line.strip().lower().endswith('section.') and len(line.strip().split())==2:
                    if 'start' in current_section:
                        current_section['end'] = i-1
                        sections.append(current_section)
                        current_section = {}
                    current_section['name'] = line.strip().split()[0].strip('.').strip("'").strip('"')
                    current_section['start'] = i
                    # A new section also closes the paragraph that was open.
                    if 'start' in current_paragraph:
                        current_paragraph['end'] = i-1
                        paragraphs.append(current_paragraph)
                        current_paragraph = {}
                    
                # Paragraph header: a single word ending in '.'.
                elif line.strip().endswith('.') and len(line.strip().split())==1:
                    if 'start' in current_paragraph:
                        current_paragraph['end'] = i-1
                        paragraphs.append(current_paragraph)
                        current_paragraph = {}
                    current_paragraph['name'] = line.strip().strip('.').strip("'").strip('"')
                    current_paragraph['start'] = i
                    
                elif line.lower().strip().startswith('end program') or line.lower().strip().startswith('end function'):
                    # Stop scanning; step back so the END line itself is not
                    # included when the open section/paragraph is closed below.
                    i -=1
                    break
                else:
                    print(f'Line {i+procedure_start+1} in program is not a section or a program:\n{line}')

            # Nothing has been opened yet: code that directly follows the
            # PROCEDURE DIVISION header (after any USING clause) is recorded as
            # an unnamed top-level paragraph.
            if len(sections) == 0 and len(paragraphs) == 0 and not 'start' in current_paragraph and not 'start' in current_section:
                if i == 0 and not 'using' in program_lst[procedure_start].lower() and not line.strip().lower().startswith('using'): # if next line of procedure division, but have bigger indentation, still consider this as a paragraph
                    current_paragraph = {}
                    current_paragraph['name'] = 'Top function with no name'
                    current_paragraph['start'] = i
                elif i == 0 and 'using' in program_lst[procedure_start].lower() and program_lst[procedure_start].strip().endswith('.'):
                    # USING clause completed on the header line itself.
                    current_paragraph = {}
                    current_paragraph['name'] = 'Top function with no name'
                    current_paragraph['start'] = i
                elif i==0 and ('using' in line.lower() or 'using' in program_lst[procedure_start].lower()) and line.strip().endswith('.'):
                    # USING clause ends on this first line.
                    start_using = False
                elif i==0 and ('using' in line.lower() or 'using' in program_lst[procedure_start].lower()):
                    # USING clause continues on the following lines.
                    start_using = True
                elif start_using == True and line.strip().endswith('.'):
                    start_using = False
                elif start_using == False:
                    current_paragraph = {}
                    current_paragraph['name'] = 'Top function with no name'
                    current_paragraph['start'] = i



    # Close whatever is still open at the last line that was scanned.
    if 'start' in current_section:
        current_section['end'] = i
        sections.append(current_section)
    if 'start' in current_paragraph:
        current_paragraph['end'] = i
        paragraphs.append(current_paragraph)
        
    ## check if any end > start 
    if len(sections) > 0:
        for section in sections:
            if section['end'] <= section['start']:
                print(f'Incorrect lines detected for section: {section["name"]}')
    for paragraph in paragraphs:
        if paragraph['end'] <= paragraph['start']:
            print(f'Incorrect lines detected for paragraph: {paragraph["name"]}')

    # final results: dict for section and paragraphs
    section_dict, paragraph_dict = {}, {}
    for section in sections:
        # Relative indices are shifted by procedure_start+1 to address program_lst.
        section_dict[section['name']] = ''.join(program_lst[procedure_start+1+section['start']:procedure_start+1+section['end']+1])
    for paragraph in paragraphs:
        # The enclosing section is the one that starts before the paragraph
        # and ends at or after it.
        umbrella_section = list(filter(lambda x: paragraph['start']> x['start'] and paragraph['end'] <= x['end'], sections))
        if len(umbrella_section) == 0:
            umbrella_section = None
        elif len(umbrella_section) == 1:
            umbrella_section = umbrella_section[0]['name']
        else:
            print(f'ERROR! Multiple section for paragraph: {paragraph["name"]} detected: [{", ".join([item["name"] for item in umbrella_section])}]')
            umbrella_section = None
        paragraph_dict[paragraph['name']] = {'section':umbrella_section, 'code':''.join(program_lst[procedure_start+1+paragraph['start']:procedure_start+1+paragraph['end']+1])}
    # Drop entries whose name came out empty after stripping.
    section_dict.pop('',None)
    paragraph_dict.pop('',None)
    return section_dict, paragraph_dict

## code, variable, function call relation extraction

###  functions in function-level reference dataset

In [None]:
# Load the function-level reference dataset; the first CSV column is used as the index.
# The next cell reads its 'code' and 'file path' columns.
function_reference_df = pd.read_csv(os.path.join('..','data','function_level_reference_dataset.csv'), index_col = 0)
function_reference_df

In [None]:
# the result of the following code is already contained in "data/function_level_reference_dataset.csv"
# 'called paragraphs': names each function's code transfers control to
# 'variables': data-division lines of the function's (cleaned) file whose variable
#              names occur in the function's code; the file is re-read for every row
function_reference_df['called paragraphs'] = function_reference_df['code'].apply(get_called_paragraphs)
function_reference_df['variables'] = function_reference_df.apply(lambda x: get_variable_in_code(get_data_division(read_cobol_list(os.path.relpath(x['file path'].replace('projects','project_clean')))), x['code']), axis = 1)
function_reference_df

### functions within the files in the file-level reference dataset 

#### get sections and paragraphs

In [None]:
# It is fine if there are a few errors here, as long as the errors are not on the line of a paragraph or function name.
# These errors are already internally handled.
# `merged` contains all the section names and paragraph names, even if the paragraph is under a section.
# This is because the code can directly call a paragraph by its name, even if it is under a section.
# `sections` / `paragraphs` hold one dict per file, in the same order as cbl_files_in_dataset.
merged = []
paragraphs, sections = [], []
for i, path in enumerate(cbl_files_in_dataset):
    functions = get_functions(read_cobol_list(os.path.relpath(path.replace('projects','project_clean'))))
    if functions is None:
        # get_functions returns None (after printing the reason) when the file has
        # no single procedure division. Use empty results instead of failing on the
        # tuple unpack, so the per-file lists stay aligned with cbl_files_in_dataset.
        section_dict, paragraph_dict = {}, {}
    else:
        section_dict, paragraph_dict = functions
    for key in section_dict:
        merged.append({'file path':path, 'function name':key,'code':section_dict[key], 'is section': True, 'is paragraph': False, 'section name': key })
    for key in paragraph_dict:
        merged.append({'file path':path, 'function name':key,'code':paragraph_dict[key]['code'], 'is section': False, 'is paragraph': True, 'section name': paragraph_dict[key]['section'] })
    paragraphs.append(paragraph_dict)
    sections.append(section_dict)

In [None]:
# Print every file for which neither a section nor a paragraph was extracted.
# The correct output should be empty.
for checked_path, found_sections, found_paragraphs in zip(cbl_files_in_dataset, sections, paragraphs):
    if not found_sections and not found_paragraphs:
        print(checked_path)

In [None]:
# One row per extracted section/paragraph.
merged_function_df = pd.DataFrame(merged)
# Remove the repo_clean_dir prefix from the paths that carry it.
merged_function_df['file path'] = merged_function_df['file path'].apply(lambda x:x.replace(repo_clean_dir+'/',''))
# The repository name is the 4th '/'-separated component of the path.
# NOTE(review): this presumably assumes paths of the form ../data/project_clean/<repo>/... - confirm.
merged_function_df['repo'] = None
merged_function_df.loc[merged_function_df['repo'].isna(), 'repo'] = merged_function_df.loc[merged_function_df['repo'].isna()]['file path'].apply(lambda x: x.split('/')[3])
merged_function_df['called paragraphs'] = merged_function_df['code'].apply(get_called_paragraphs)

In [None]:
def check_called(row, df):
    """Tell whether the function described by ``row`` is called inside its own file.

    Args:
        row: mapping-like record providing 'file path' and 'function name'.
        df: frame with 'file path' and 'called paragraphs' columns; each
            'called paragraphs' entry is an iterable of callee names.

    Returns:
        True when the row's function name appears among the callees of any
        function that shares the row's file path, False otherwise.
    """
    same_file = df['file path'] == row['file path']
    callees = [
        callee
        for called in df.loc[same_file]['called paragraphs']
        for callee in called
    ]
    return row['function name'] in callees

# Flag every function that is called by some function of the same file.
merged_function_df['is called'] = merged_function_df.apply(lambda x: check_called(x, merged_function_df), axis=1)
# Read the data division from each row's OWN file. The previous expression used the
# loop variable `path` left over from the collection cell, so every row was matched
# against the data division of the last file processed.
# NOTE(review): assumes 'file path' still resolves from the working directory after the
# prefix strip applied when the frame was built - confirm.
merged_function_df['variables'] = merged_function_df.apply(
    lambda x: get_variable_in_code(
        get_data_division(read_cobol_list(os.path.relpath(x['file path'].replace('projects','project_clean')))),
        x['code'],
    ),
    axis=1,
)
# Give the unnamed leading block a readable name when it starts with 'set environment'.
merged_function_df.loc[(merged_function_df['function name']=='Top function with no name')&(merged_function_df['code'].apply(lambda x:x.strip().lower().startswith('set environment'))), 'function name'] = 'set environment'
function_for_file_df = merged_function_df[['repo', 'file path', 'function name', 'code', 'variables', 'called paragraphs', 'is called', 'is section', 'is paragraph', 'section name']]
function_for_file_df
# The result of this dataframe is included in /data/function_refined_summary.csv

### other functions

In [None]:
# A few errors may be reported while this cell runs. That is acceptable as long as
# none of them concerns a line holding a paragraph or function name; such errors are
# already handled internally.
# `merged` receives one record per section AND per paragraph, including paragraphs
# that live under a section, because code can call a paragraph directly by its name
# even when it is nested in a section.
merged = []
paragraphs, sections = [], []
for path in cbl_files_not_in_dataset:
    # Show which cleaned file is being processed.
    print(os.path.relpath(path.replace('projects','project_clean')))
    code_snippet = read_cobol_list(os.path.relpath(path.replace('projects','project_clean')))
    section_dict, paragraph_dict = get_functions(code_snippet)
    # Section records: a section is its own 'section name'.
    merged.extend(
        {'file path': path, 'function name': name, 'code': section_dict[name],
         'is section': True, 'is paragraph': False, 'section name': name}
        for name in section_dict
    )
    # Paragraph records: code and owning section come from the paragraph entry.
    merged.extend(
        {'file path': path, 'function name': name, 'code': paragraph_dict[name]['code'],
         'is section': False, 'is paragraph': True, 'section name': paragraph_dict[name]['section']}
        for name in paragraph_dict
    )
    sections.append(section_dict)
    paragraphs.append(paragraph_dict)

In [None]:
# One row per section / paragraph record collected from the files outside the dataset.
merged_other_function_df = pd.DataFrame(merged)
# Remove the cleaned-repository directory prefix from the paths wherever it occurs.
merged_other_function_df['file path'] = merged_other_function_df['file path'].apply(lambda x: x.replace(repo_clean_dir+'/',''))
# The repository name is the fourth '/'-separated component of the path.
merged_other_function_df['repo'] = merged_other_function_df['file path'].apply(lambda x: x.split('/')[3])
merged_other_function_df['called paragraphs'] = merged_other_function_df['code'].apply(get_called_paragraphs)
merged_other_function_df['is called'] = merged_other_function_df.apply(lambda x: check_called(x, merged_other_function_df), axis=1)
# Read the data division from each row's OWN file. The previous expression used the
# loop variable `path` left over from the collection cell, so every row was matched
# against the data division of the last file processed.
# NOTE(review): assumes 'file path' still resolves from the working directory after the
# prefix strip above - confirm.
merged_other_function_df['variables'] = merged_other_function_df.apply(
    lambda x: get_variable_in_code(
        get_data_division(read_cobol_list(os.path.relpath(x['file path'].replace('projects','project_clean')))),
        x['code'],
    ),
    axis=1,
)
# Give the unnamed leading block a readable name when it starts with 'set environment'.
merged_other_function_df.loc[(merged_other_function_df['function name']=='Top function with no name')&(merged_other_function_df['code'].apply(lambda x:x.strip().lower().startswith('set environment'))), 'function name'] = 'set environment'
other_function_df = merged_other_function_df[['repo', 'file path', 'function name', 'code', 'variables', 'called paragraphs', 'is called', 'is section', 'is paragraph', 'section name']]
other_function_df
# The result of this dataframe is included in /data/function_refined_summary.csv

### merge all functions

In [None]:
# The result of this dataframe is included in /data/function_refined_summary.csv
# Stack the dataset functions and the other functions into one frame with a fresh
# index, then rewrite 'project_clean' to 'project' inside every path.
all_functions_df = pd.concat(
    [function_for_file_df, other_function_df], ignore_index=True
).assign(**{
    'file path': lambda frame: frame['file path'].map(
        lambda file_path: file_path.replace('project_clean', 'project')
    )
})
all_functions_df

## extract calls to external files

In [None]:
# Captures the text inside each pair of parentheses, e.g. "PROGRAM(ABC)" -> "ABC".
_PAREN_CONTENT = re.compile(r"\(([^)]+)\)")


def _slice_between(text, text_upper, opener, closer):
    """Return the part of ``text`` between ``opener`` and the next ``closer``.

    The delimiters are located in ``text_upper`` (the upper-cased ``text``) so the
    search ignores case while the returned value keeps its original casing.

    NOTE(review): when a delimiter is missing, ``str.find`` returns -1 and the
    slice is not a real name (e.g. a statement whose option sits on the next
    line). This mirrors the historical behaviour and is kept on purpose;
    multi-line statements would need a proper look-ahead to be resolved.
    """
    start = text_upper.find(opener) + len(opener)
    end = text_upper.find(closer, start)
    return text[start:end]


def check_link(content):
    """Scan COBOL source text line by line for references to external resources.

    Per line (stripped, matched case-insensitively) the following are collected:
      * ``EXEC CICS LINK PROGRAM``: every parenthesised value on the line (upper-cased).
      * ``CALL '``: the text up to the next single quote.
      * ``EXEC CICS READ`` / ``EXEC CICS WRITE``: the first parenthesised value.
      * ``EXEC CICS SEND MAP``: the first parenthesised value.
      * ``EXEC CICS RETURN``: the value of the first ``TRANSID('...')`` found on a
        LATER line.
      * ``EXEC SQL``: the comma-separated text after the first ``FROM`` found on a
        LATER line.

    Args:
        content: source text of a function or of a whole file.

    Returns:
        A list of 12 strings, alternating a ", "-joined sorted list of names and a
        status label, in this order: linked programs / "LINK"|"NO LINK",
        called programs / "CALL"|"NO CALL", file calls / "FILE CALL"|"NO FILE CALL",
        maps / "MAP USED"|"NO MAP USED", transaction ids / "RETURN"|"NO RETURN",
        SQL tables / "SQL"|"NO SQL".
    """
    # Sets avoid duplicates.
    link_programs, call_programs, file_calls = set(), set(), set()
    maps_used, trans_ids, sql_tables = set(), set(), set()

    lines = content.splitlines()
    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        line_upper = line.upper()

        if "EXEC CICS LINK PROGRAM" in line_upper:
            for match in _PAREN_CONTENT.findall(line_upper):
                link_programs.add(match.strip())

        if "CALL '" in line_upper:
            call_programs.add(_slice_between(line, line_upper, "CALL '", "'"))

        if "EXEC CICS READ" in line_upper or "EXEC CICS WRITE" in line_upper:
            file_calls.add(_slice_between(line, line_upper, "(", ")"))

        if "EXEC CICS SEND MAP" in line_upper:
            maps_used.add(_slice_between(line, line_upper, "(", ")"))

        # The look-ahead scans below use their own variables. Reusing `line_upper`
        # there made the following checks run against a later line.
        if "EXEC CICS RETURN" in line_upper:
            for j in range(i + 1, len(lines)):
                ahead_upper = lines[j].upper()
                if "TRANSID('" in ahead_upper:
                    trans_ids.add(_slice_between(lines[j], ahead_upper, "TRANSID('", "')"))
                    break

        if "EXEC SQL" in line_upper:
            for k in range(i + 1, len(lines)):
                ahead_upper = lines[k].upper()
                if "FROM" in ahead_upper:
                    start = ahead_upper.find("FROM") + len("FROM")
                    for table_name in lines[k][start:].split(','):
                        sql_tables.add(table_name.strip())
                    break

    link_status = "LINK" if link_programs else "NO LINK"
    call_status = "CALL" if call_programs else "NO CALL"
    file_call_status = "FILE CALL" if file_calls else "NO FILE CALL"
    map_status = "MAP USED" if maps_used else "NO MAP USED"
    return_status = "RETURN" if trans_ids else "NO RETURN"
    sql_status = "SQL" if sql_tables else "NO SQL"
    # Sorting makes the joined names independent of set iteration order.
    return [", ".join(sorted(link_programs)), link_status,
            ", ".join(sorted(call_programs)), call_status,
            ", ".join(sorted(file_calls)), file_call_status,
            ", ".join(sorted(maps_used)), map_status,
            ", ".join(sorted(trans_ids)), return_status,
            ", ".join(sorted(sql_tables)), sql_status]

In [None]:
# The result of this dataframe is included in /data/function_refined_summary.csv
# check_link returns one list per function; expanding it through pd.Series gives one
# column per entry, stored under the names below in the same order.
function_link_columns = [
    "Linked Programs", "LINK Status",
    "Called Programs", "CALL Status",
    "File Calls", "FILE CALL Status",
    "Maps Used", "MAP Status",
    "Transaction IDs", "RETURN Status",
    "SQL Tables", "SQL Status",
]
all_functions_df[function_link_columns] = all_functions_df['code'].apply(check_link).apply(pd.Series)
all_functions_df

# Extract file-level artifacts

### procedure/data division, filename, program id, paragraph names, ...

In [None]:
def get_paragraphs_from_function(path, all_function_df):
    """List the function names recorded for one file.

    Args:
        path: value to look up in the 'file path' column.
        all_function_df: frame with 'file path' and 'function name' columns.

    Returns:
        The 'function name' values of all rows whose 'file path' equals ``path``,
        in frame order (an empty list when no row matches).
    """
    belongs_to_file = all_function_df['file path'] == path
    return all_function_df.loc[belongs_to_file, 'function name'].to_list()
def get_program_id(code):
    """Extract the program name from the first PROGRAM-ID line of COBOL source text.

    The name is the first word following the ``PROGRAM-ID`` keyword (and its
    optional period), not the last word of the line. Trailing clauses such as
    ``IS INITIAL PROGRAM`` or a trailing sequence number are therefore no longer
    mistaken for the name. When nothing follows the keyword on its own line, the
    name is taken from the next line.

    Args:
        code: source text of a COBOL file.

    Returns:
        The program name without surrounding periods, or None when no
        PROGRAM-ID line (or no name after it) is found.
    """
    keyword = 'program-id'
    lines = code.splitlines()
    for idx, line in enumerate(lines):
        stripped = line.strip()
        if not stripped.lower().startswith(keyword):
            continue
        # Drop the keyword and the separator ("." and/or blanks) that follows it.
        remainder = stripped[len(keyword):].lstrip('. \t')
        if not remainder and idx + 1 < len(lines):
            # The name may be written on the line after the keyword.
            remainder = lines[idx + 1].strip()
        tokens = remainder.split()
        if not tokens:
            return None
        return tokens[0].strip('.') or None
    return None
def get_procedure_division(file_content):
    """Return the text from the first 'procedure division' line to the end.

    A line qualifies when, lower-cased and stripped, it starts with
    'procedure division'. The result is re-joined with '\\n'. When no line
    qualifies, the start index stays None and the whole content is returned.

    Args:
        file_content: source text of a COBOL file.

    Returns:
        The selected lines joined by newlines.
    """
    rows = file_content.splitlines()
    first_row = next(
        (idx for idx, text in enumerate(rows)
         if text.lower().strip().startswith('procedure division')),
        None,
    )
    return '\n'.join(rows[first_row:])

In [None]:
# One row per COBOL file: dataset files first, then the remaining ones.
all_file_df = pd.DataFrame(data={"file path": cbl_files_in_dataset + cbl_files_not_in_dataset})
# The repository name is the fourth '/'-separated path component; the file name is the last.
all_file_df['repo'] = all_file_df['file path'].apply(lambda x: x.split('/')[3])
all_file_df['filename'] = all_file_df['file path'].apply(lambda x: x.split('/')[-1])
# Read every file ONCE and derive all text-based columns from that single read
# (previously each file was read three times through read_cobol_str).
file_source = all_file_df['file path'].apply(read_cobol_str)
all_file_df['program id'] = file_source.apply(get_program_id)
all_file_df['code'] = file_source
all_file_df['procedure division'] = file_source.apply(get_procedure_division)
all_file_df['data division'] = all_file_df['file path'].apply(lambda x: ''.join(get_data_division(read_cobol_list(x))))
# Align the paths with the ones stored in all_functions_df before looking up paragraphs.
all_file_df['file path'] = all_file_df['file path'].apply(lambda x: x.replace('project_clean', 'project'))
all_file_df['paragraphs'] = all_file_df['file path'].apply(lambda x: get_paragraphs_from_function(x, all_functions_df))
all_file_df
# The result of this dataframe is included in /data/file_generated_summary.csv

### paragraph call relations

In [None]:
def get_call_relation_from_paragraph_df(row, all_function_df):
    """Build a textual summary of the calls made by the functions of one file.

    For every name in ``row['paragraphs']`` the first row of ``all_function_df``
    with the same 'file path' and 'function name' is inspected. Paragraphs that
    carry a string 'section name' are skipped. For the others, a line
    'Paragraph <name> calls paragraph <a>, <b>.' is produced when their
    'called paragraphs' entry is not empty.

    Args:
        row: mapping-like record providing 'file path' and 'paragraphs'.
        all_function_df: frame with 'file path', 'function name', 'is paragraph',
            'section name' and 'called paragraphs' columns.

    Returns:
        All produced lines concatenated, each terminated by a newline
        ('' when there is nothing to report).
    """
    sentences = []
    for name in row['paragraphs']:
        same_file = all_function_df['file path'] == row['file path']
        same_name = all_function_df['function name'] == name
        matches = all_function_df.loc[same_file & same_name]
        if len(matches) < 1:
            continue
        # A paragraph with a string section name is nested in a section: skip it.
        if matches['is paragraph'].values[0] and isinstance(matches['section name'].values[0], str):
            continue
        callees = matches['called paragraphs'].values[0]
        if len(callees) >= 1:
            sentences.append(f'Paragraph {name} calls paragraph ' + ', '.join(callees) + '.\n')
    return ''.join(sentences)

In [None]:
# One call-relation summary per file, computed from that file's row and the
# function-level frame.
all_file_df['call relations'] = [
    get_call_relation_from_paragraph_df(file_row, all_functions_df)
    for _, file_row in all_file_df.iterrows()
]
all_file_df
# The result of this dataframe is included in /data/file_generated_summary.csv

### file call relations

In [None]:
# check_link returns one list per file; expanding it through pd.Series gives one
# column per entry, stored under the names below in the same order.
file_link_columns = [
    "Linked Programs", "LINK Status",
    "Called Programs", "CALL Status",
    "File Calls", "FILE CALL Status",
    "Maps Used", "MAP Status",
    "Transaction IDs", "RETURN Status",
    "SQL Tables", "SQL Status",
]
all_file_df[file_link_columns] = all_file_df['code'].apply(check_link).apply(pd.Series)
all_file_df
# The result of this dataframe is included in /data/file_generated_summary.csv

# Extract project-level artifacts