In [None]:
"""
筛选距离小于3的盐桥对(非经典全保守筛选)
condition1 = pi1_cation2 <= 3 
condition2 = cation1_pi2 <= 3
condition1 and condition2
"""
import os
import re
from pathlib import Path

def parse_comparison_line(line):
    """Extract the two cross-pair distances from a comparison line.

    Searches *line* for ``pi1-cation2:<number>`` and ``cation1-pi2:<number>``.

    Returns:
        ``(pi1_cation2, cation1_pi2)`` as floats when both fields are found,
        otherwise ``(None, None)``.
    """
    patterns = (r'pi1-cation2:([\d.]+)', r'cation1-pi2:([\d.]+)')

    # Run both searches before converting anything, so a missing field always
    # yields (None, None) regardless of what the other field contains.
    hits = [re.search(pattern, line) for pattern in patterns]
    if any(hit is None for hit in hits):
        return None, None

    pi1_cation2, cation1_pi2 = (float(hit.group(1)) for hit in hits)
    return pi1_cation2, cation1_pi2

def check_condition(pi1_cation2, cation1_pi2, max_distance=3):
    """Return True when both cross-pair distances are within the cutoff.

    Args:
        pi1_cation2: Distance parsed from the ``pi1-cation2`` field, or None.
        cation1_pi2: Distance parsed from the ``cation1-pi2`` field, or None.
        max_distance: Inclusive upper bound applied to both distances.
            Defaults to 3, the cutoff this cell was written for.

    Returns:
        False if either distance is None; otherwise True only when
        ``pi1_cation2 <= max_distance`` and ``cation1_pi2 <= max_distance``.
    """
    # A line that could not be parsed never satisfies the filter.
    if pi1_cation2 is None or cation1_pi2 is None:
        return False

    # Both cross distances must be within the (inclusive) cutoff.
    return pi1_cation2 <= max_distance and cation1_pi2 <= max_distance

def process_txt_file(input_file, output_file_filtered, output_file_other):
    """Split one txt file into groups that pass the filter and groups that do not.

    A group starts at a line that contains both '与' and 'pi1-cation2:' and
    spans that line plus up to two following lines. Each group is copied,
    followed by one blank line, to the filtered list when ``check_condition``
    accepts its parsed distances, and to the other list when it does not.
    Lines that do not start a group are skipped.

    An output file is only written when its list is non-empty.

    Returns:
        ``(has_filtered, has_other)``: whether each list received any group.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        raw = src.readlines()

    passed, rejected = [], []
    total = len(raw)
    pos = 0

    while pos < total:
        header = raw[pos].strip()

        # Not a comparison line: move on to the next line.
        if '与' not in header or 'pi1-cation2:' not in header:
            pos += 1
            continue

        first, second = parse_comparison_line(header)

        # Header line plus up to two following lines (slicing clips at EOF).
        group = raw[pos:pos + 3]

        bucket = passed if check_condition(first, second) else rejected
        bucket.extend(group)
        bucket.append('\n')

        # Always advance by three, even when fewer lines were available.
        pos += 3

    for destination, content in (
        (output_file_filtered, passed),
        (output_file_other, rejected),
    ):
        if content:
            with open(destination, 'w', encoding='utf-8') as out:
                out.writelines(content)

    return bool(passed), bool(rejected)

def main(input_dir, output_dir_filtered, output_dir_other):
    """Run ``process_txt_file`` over every ``*.txt`` file in *input_dir*.

    Both output directories are created if missing. Each input file is
    written under the same file name in the two output directories, and a
    per-file line plus a final summary are printed.
    """
    source_dir = Path(input_dir)
    filtered_dir = Path(output_dir_filtered)
    other_dir = Path(output_dir_other)

    # Ensure both destinations exist before any file is processed.
    for target in (filtered_dir, other_dir):
        target.mkdir(parents=True, exist_ok=True)

    txt_files = list(source_dir.glob('*.txt'))
    filtered_count = other_count = 0

    for txt_file in txt_files:
        has_filtered, has_other = process_txt_file(
            txt_file,
            filtered_dir / txt_file.name,
            other_dir / txt_file.name,
        )

        filtered_count += 1 if has_filtered else 0
        other_count += 1 if has_other else 0

        print(f"已处理: {txt_file.name} - 符合条件: {has_filtered}, 其他: {has_other}")

    print(f"\n总共 {len(txt_files)} 个文件")
    print(f"有符合条件数据的文件: {filtered_count} 个")
    print(f"有其他数据的文件: {other_count} 个")

# Example usage
if __name__ == "__main__":
    input_directory = "/home/databank_70t/pengziyu/scop/251018/2A-con-disCA/nocla"  # input directory
    output_directory_filtered = "/home/databank_70t/pengziyu/scop/251020/nocla_filtered"  # output directory for groups that pass the filter
    output_directory_other = "/home/databank_70t/pengziyu/scop/251020/nocla_other"  # output directory for all other groups

    # Status note: completed. (This note was bare text fused onto the call
    # below, which made the line a syntax error; it is kept here as a comment.)
    main(input_directory, output_directory_filtered, output_directory_other)

In [6]:
"""
筛选距离小于2的盐桥对(经典全保守筛选)
condition1 = pi1_pi2 <= 2 
condition2 = cation1_cation2 <= 2
condition1 and condition2
"""
import os
import re
from pathlib import Path

def parse_comparison_line(line):
    """Extract the two same-type distances from a comparison line.

    Searches *line* for ``pi1-pi2:<number>`` and ``cation1-cation2:<number>``.

    Returns:
        ``(pi1_pi2, cation1_cation2)`` as floats when both fields are found,
        otherwise ``(None, None)``.
    """
    patterns = (r'pi1-pi2:([\d.]+)', r'cation1-cation2:([\d.]+)')

    # Run both searches before converting anything, so a missing field always
    # yields (None, None) regardless of what the other field contains.
    hits = [re.search(pattern, line) for pattern in patterns]
    if any(hit is None for hit in hits):
        return None, None

    pi1_pi2, cation1_cation2 = (float(hit.group(1)) for hit in hits)
    return pi1_pi2, cation1_cation2

def check_condition(pi1_pi2, cation1_cation2, max_distance=2):
    """Return True when both same-type distances are within the cutoff.

    Args:
        pi1_pi2: Distance parsed from the ``pi1-pi2`` field, or None.
        cation1_cation2: Distance parsed from the ``cation1-cation2`` field,
            or None.
        max_distance: Inclusive upper bound applied to both distances.
            Defaults to 2, the cutoff this cell was written for.

    Returns:
        False if either distance is None; otherwise True only when
        ``pi1_pi2 <= max_distance`` and ``cation1_cation2 <= max_distance``.
    """
    # A line that could not be parsed never satisfies the filter.
    if pi1_pi2 is None or cation1_cation2 is None:
        return False

    # Both distances must be within the (inclusive) cutoff.
    return pi1_pi2 <= max_distance and cation1_cation2 <= max_distance

def process_txt_file(input_file, output_file_filtered, output_file_other):
    """Split one txt file into groups that pass the filter and groups that do not.

    A group starts at a line that contains both '与' and 'pi1-pi2:' and spans
    that line plus up to two following lines. Each group is copied, followed
    by one blank line, to the filtered list when ``check_condition`` accepts
    its parsed distances, and to the other list when it does not. Lines that
    do not start a group are skipped.

    An output file is only written when its list is non-empty.

    Returns:
        ``(has_filtered, has_other)``: whether each list received any group.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        raw = src.readlines()

    passed, rejected = [], []
    total = len(raw)
    pos = 0

    while pos < total:
        header = raw[pos].strip()

        # Not a comparison line: move on to the next line.
        if '与' not in header or 'pi1-pi2:' not in header:
            pos += 1
            continue

        first, second = parse_comparison_line(header)

        # Header line plus up to two following lines (slicing clips at EOF).
        group = raw[pos:pos + 3]

        bucket = passed if check_condition(first, second) else rejected
        bucket.extend(group)
        bucket.append('\n')

        # Always advance by three, even when fewer lines were available.
        pos += 3

    for destination, content in (
        (output_file_filtered, passed),
        (output_file_other, rejected),
    ):
        if content:
            with open(destination, 'w', encoding='utf-8') as out:
                out.writelines(content)

    return bool(passed), bool(rejected)

def main(input_dir, output_dir_filtered, output_dir_other):
    """Run ``process_txt_file`` over every ``*.txt`` file in *input_dir*.

    Both output directories are created if missing. Each input file is
    written under the same file name in the two output directories, and a
    per-file line plus a final summary are printed.
    """
    source_dir = Path(input_dir)
    filtered_dir = Path(output_dir_filtered)
    other_dir = Path(output_dir_other)

    # Ensure both destinations exist before any file is processed.
    for target in (filtered_dir, other_dir):
        target.mkdir(parents=True, exist_ok=True)

    txt_files = list(source_dir.glob('*.txt'))
    filtered_count = other_count = 0

    for txt_file in txt_files:
        has_filtered, has_other = process_txt_file(
            txt_file,
            filtered_dir / txt_file.name,
            other_dir / txt_file.name,
        )

        filtered_count += 1 if has_filtered else 0
        other_count += 1 if has_other else 0

        print(f"已处理: {txt_file.name} - 符合条件: {has_filtered}, 其他: {has_other}")

    print(f"\n总共 {len(txt_files)} 个文件")
    print(f"有符合条件数据的文件: {filtered_count} 个")
    print(f"有其他数据的文件: {other_count} 个")

# Example usage
if __name__ == "__main__":
    input_directory = "/home/databank_70t/pengziyu/scop/251018/2A-con-disCA/cla"  # input directory
    output_directory_filtered = "/home/databank_70t/pengziyu/scop/251020/cla_filtered"  # output directory for groups that pass the filter
    output_directory_other = "/home/databank_70t/pengziyu/scop/251020/cla_other"  # output directory for all other groups

    # Status note: completed. (This note was bare text fused onto the call
    # below, which made the line a syntax error; it is kept here as a comment.)
    main(input_directory, output_directory_filtered, output_directory_other)

已处理: 4001094.txt - 符合条件: True, 其他: False
已处理: 4002619.txt - 符合条件: True, 其他: False
已处理: 4004801.txt - 符合条件: True, 其他: False
已处理: 4003194.txt - 符合条件: True, 其他: True
已处理: 4002415.txt - 符合条件: False, 其他: True
已处理: 4002230.txt - 符合条件: True, 其他: False
已处理: 4001593.txt - 符合条件: True, 其他: True
已处理: 4001181.txt - 符合条件: True, 其他: False
已处理: 4001771.txt - 符合条件: True, 其他: False
已处理: 4005301.txt - 符合条件: True, 其他: True
已处理: 4007442.txt - 符合条件: True, 其他: False
已处理: 4003672.txt - 符合条件: True, 其他: True
已处理: 4003441.txt - 符合条件: True, 其他: True
已处理: 4006144.txt - 符合条件: True, 其他: True
已处理: 4001168.txt - 符合条件: True, 其他: True
已处理: 4003193.txt - 符合条件: False, 其他: True
已处理: 4003405.txt - 符合条件: True, 其他: True
已处理: 4000607.txt - 符合条件: True, 其他: False
已处理: 4005930.txt - 符合条件: True, 其他: False
已处理: 4004470.txt - 符合条件: True, 其他: False
已处理: 4002486.txt - 符合条件: True, 其他: True
已处理: 4000820.txt - 符合条件: True, 其他: False
已处理: 4001858.txt - 符合条件: True, 其他: True
已处理: 4005307.txt - 符合条件: True, 其他: True
已处理: 4000040.txt - 符合条件: Tr

In [10]:
"""
分类半保守守距离
pi1_pi2 <= 2 or cation1_cation2 <= 2: 半经典
pi1_cation2 <= 3 or cation1_pi2 <= 3:半非经典
"""
import os
import re

# Input directory and the three per-category output directories
input_dir = "/home/databank_70t/pengziyu/scop/251020/nocla_other"
output_dir_half_cla = "/home/databank_70t/pengziyu/scop/251020/noclaother_half-cla-2A"
output_dir_half_nocla = "/home/databank_70t/pengziyu/scop/251020/half-nocla-3A"
output_dir_others = "/home/databank_70t/pengziyu/scop/251020/others"

# Make sure every output directory exists
for _out_dir in (output_dir_half_cla, output_dir_half_nocla, output_dir_others):
    os.makedirs(_out_dir, exist_ok=True)

# All four distances, in the fixed order they appear on a comparison line
_DISTANCE_RE = re.compile(
    r'pi1-pi2:([\d.]+),pi1-cation2:([\d.]+),cation1-pi2:([\d.]+),cation1-cation2:([\d.]+)'
)


def _append_groups(output_path, groups):
    """Append *groups* to *output_path* with one blank line between groups.

    The file is opened in append mode. When it already exists and is
    non-empty, a single blank line is written first to separate the new
    groups from the existing content. No newline is added after the last
    group.
    """
    need_sep = os.path.exists(output_path) and os.path.getsize(output_path) > 0
    with open(output_path, 'a', encoding='utf-8') as f:
        if need_sep:
            f.write('\n')
        f.write('\n'.join(''.join(group) for group in groups))


# Walk every txt file in the input directory
for filename in os.listdir(input_dir):
    if not filename.endswith('.txt'):
        continue

    with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # One list of groups per category
    half_cla_groups, half_nocla_groups, others_groups = [], [], []

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Only lines that look like a comparison header are searched for distances
        match = None
        if ' 与 ' in line and ':' in line:
            match = _DISTANCE_RE.search(line)

        if match is None:
            i += 1
            continue

        pi1_pi2, pi1_cation2, cation1_pi2, cation1_cation2 = map(float, match.groups())

        # Header line plus up to two following lines (slicing clips at EOF)
        current_block = lines[i:i + 3]

        # The half-classical test takes precedence over the half-non-classical one
        if pi1_pi2 <= 2 or cation1_cation2 <= 2:
            half_cla_groups.append(current_block)
        elif pi1_cation2 <= 3 or cation1_pi2 <= 3:
            half_nocla_groups.append(current_block)
        else:
            others_groups.append(current_block)

        i += 3

    # Append each non-empty category to its output file and report the count
    for _label, _out_dir, _groups in (
        ("half-cla-2A", output_dir_half_cla, half_cla_groups),
        ("half-nocla-3A", output_dir_half_nocla, half_nocla_groups),
        ("others", output_dir_others, others_groups),
    ):
        if _groups:
            _append_groups(os.path.join(_out_dir, filename), _groups)
            print(f"{filename}: {_label} 找到 {len(_groups)} 个")

print("\n所有文件处理完成！")
print("结果已分别保存到:")
for _out_dir in (output_dir_half_cla, output_dir_half_nocla, output_dir_others):
    print(f"  - {_out_dir}")  # status note: the nocla run is completed

4001094.txt: half-nocla-3A 找到 2 个
4003194.txt: half-nocla-3A 找到 6 个
4003194.txt: others 找到 7 个
4003672.txt: others 找到 1 个
4001168.txt: half-nocla-3A 找到 49 个
4001168.txt: others 找到 18 个
4001858.txt: half-nocla-3A 找到 20 个
4001858.txt: others 找到 14 个
4000040.txt: half-nocla-3A 找到 1 个
4001887.txt: half-nocla-3A 找到 28 个
4001887.txt: others 找到 15 个
4003142.txt: half-nocla-3A 找到 2 个
4002199.txt: half-nocla-3A 找到 5 个
4002133.txt: half-nocla-3A 找到 1 个
4001128.txt: others 找到 2 个
4003128.txt: others 找到 1 个
4001630.txt: half-nocla-3A 找到 1 个
4000833.txt: half-nocla-3A 找到 8 个
4000833.txt: others 找到 2 个
4002497.txt: half-nocla-3A 找到 1 个
4004188.txt: half-nocla-3A 找到 1 个
4004188.txt: others 找到 1 个
4002772.txt: half-nocla-3A 找到 20 个
4006921.txt: half-nocla-3A 找到 2 个
4001408.txt: others 找到 1 个
4003381.txt: half-nocla-3A 找到 1 个
4000246.txt: half-nocla-3A 找到 1 个
4000246.txt: others 找到 1 个
4002008.txt: half-nocla-3A 找到 1 个
4003576.txt: half-nocla-3A 找到 7 个
4003576.txt: others 找到 1 个
4001108.txt: others 找到 