In [1]:
import os
import subprocess as sp
import pandas as pd

# set directories
project_dir = '/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/'
results_dir = os.path.join(project_dir, 'results/')

### Check HiCCUPs First

In [3]:
# locate all link files
hiccups_names = 'results/loops/hiccups/whole_genome'
hiccups_dir = os.path.join(project_dir, 'results/loops/hiccups/whole_genome/')
cmd = 'find {} -wholename {} -type l'.format(results_dir)
link_files = sp.check_output(cmd, shell=True)
link_files_list = link_files.decode().split('\n')

In [5]:
# filter for those not part of a the hub/shortcuts paths
nonhub_link_files_list = [x for x in link_files_list if 'shortcuts' not in x]
nonhub_link_files_list = [x for x in nonhub_link_files_list if 'logs/' not in x]

In [67]:
def determine_path_type(path):
    '''
        Check the validity of a filepath.
        sym-sym refers to symlink that points to another symlink
        sym-file refers to symlink that points to a regular file
        sym-to-sym-broken-cyclic refers to a symlink broken by a cyclic linking pattern
        sym-to-sym-broken-missing refers to a symlink broken by a missing source path

        Example:
        --------
        fns = ['symlink1', 'symlink2', 'symlink3', 'symlink4', 'symlink5', 'test']
        for fn in fns:
            if os.path.exists(fn):
                os.remove(fn)

        os.symlink('symlink1', 'symlink2')
        os.symlink('symlink2', 'symlink3')
        os.symlink('symlink3', 'symlink1')
        with open('test', 'w') as f:
            pass
        os.symlink('test', 'symlink4')
        os.symlink('test2', 'symlink5')
        os.symlink('symlink4', 'symlink6')

        fns = ['symlink1', 'symlink2', 'symlink3', 'symlink4', 'symlink5', 'test', 'symlink6']
        for fn in fns:
            print(fn)
            print(determine_path_type(fn))
            print()
    '''

    # leveraging stat to locate broken links, an error 
    # indicates either a symlink broken by a missing file (sym-to-sym-broken-missing)
    # or by a cyclic link (sym-to-sym-broken-cyclic)
    try:
        stat = os.stat(path)

        # working symlink cases 
        if os.path.islink(path):
            target_path = os.readlink(path)
            if os.path.islink(target_path):
                return('sym-sym')
            elif os.path.isfile(target_path):
                return('sym-file')
            else:
                return('bug')
        
        # regular path cases
        elif os.path.isfile(path):
            return('regular-file')
        elif os.path.isdir(path):
            return('regular-dir')
        else:
            return('bug')
    except OSError as e:

        # broken symlink with cyclic case
        if '[Errno 40] Too many levels of symbolic links:' in str(e):
            return('sym-to-sym-broken-cyclic')

        # broken symlink with missing case
        elif '[Errno 2] No such file or directory:' in str(e):
            return('sym-to-sym-broken-missing')
            
        else:
            return('but - {}'.format(e))

In [106]:
symlinks_data = []
for filepath in nonhub_link_files_list:
    filetype = determine_path_type(filepath)
    
    if filetype in ['sym-sym', 'sim-file', 'sym-to-sym-broken-cyclic', 'sym-to-sym-broken-missing']:
        symlinks_data.append([filepath, filetype])


In [None]:
symlinks_df = pd.DataFrame(symlinks_data)

In [108]:
symlinks_df.values

array([['/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/results/t2t-chm13v2.0/biorep_merged/results/hicpro/Substantia-Nigra.GSE147672.Homo_Sapiens.H3K27ac.biorep_merged/rawdata_allValidPairs',
        'sym-sym'],
       ['/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/results/t2t-chm13v2.0/biorep_merged/results/hicpro/OCI-Ly7-OCABi.GSE183797.Homo_Sapiens.H3K27ac.biorep_merged/rawdata_allValidPairs',
        'sym-sym'],
       ['/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/results/t2t-chm13v2.0/biorep_merged/results/hicpro/Th17.GSE101498.Homo_Sapiens.H3K27ac.biorep_merged/rawdata_allValidPairs',
        'sym-sym']], dtype=object)

In [107]:
symlinks_df.shape

(3, 2)

In [67]:
print('Current filepath is: \t\t {}'.format(filepath))
print('This file has a symlink to: \t{}'.format(os.readlink(filepath)))
print('The realpath of this file is: {}'.format(os.path.realpath(filepath)))

Current filepath is: 		 /mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/results/t2t-chm13v2.0/biorep_merged/results/hicpro/Substantia-Nigra.GSE147672.Homo_Sapiens.H3K27ac.biorep_merged/rawdata_allValidPairs
This file has a symlink to: 	/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling//ref_genome/biorep_merged/results/hicpro/Substantia-Nigra.GSE147672.Homo_Sapiens.H3K27ac.biorep_merged/rawdata_allValidPairs
The realpath of this file is: /mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/ref_genome/biorep_merged/results/hicpro/Substantia-Nigra.GSE147672.Homo_Sapiens.H3K27ac.biorep_merged/Substantia-Nigra.GSE147672.Homo_Sapiens.H3K27ac.biorep_merged.allValidPairs
