In [1]:
# 删除.data/html_data/重复的html文件
import os
from bs4 import BeautifulSoup
from datetime import datetime

def extract_timestamp(filename):
    # 从文件名中提取时间戳，例如 "100Cr6_20241122_143510.html"
    try:
        date_str = filename.split('_')[1] + filename.split('_')[2].split('.')[0]
        return datetime.strptime(date_str, '%Y%m%d%H%M%S')
    except:
        return datetime.min

def find_and_remove_duplicates(directory):
    title_map = {}  # {title_text: [(timestamp, file_path), ...]}
    
    # 首先收集所有文件信息
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.html'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    soup = BeautifulSoup(f, 'html.parser')
                    title_tag = soup.find('title')
                    
                    if title_tag:
                        title_text = title_tag.get_text(strip=True)
                        timestamp = extract_timestamp(file)
                        
                        if title_text not in title_map:
                            title_map[title_text] = []
                        title_map[title_text].append((timestamp, file_path))

    # 处理重复文件
    for title_text, files in title_map.items():
        if len(files) > 1:
            # 按时间戳排序，保留最新的文件
            sorted_files = sorted(files, key=lambda x: x[0], reverse=True)
            # 保留第一个（最新的），删除其他的
            for _, file_path in sorted_files[1:]:
                print(f"Removing duplicate file: {file_path}")
                os.remove(file_path)

if __name__ == "__main__":
    directory = './data/html_data'
    find_and_remove_duplicates(directory)

Removing duplicate file: ./data/html_data/100Cr6/100Cr6_20241122_144548.html
Removing duplicate file: ./data/html_data/100Cr6/100Cr6_20241122_144541.html
Removing duplicate file: ./data/html_data/100Cr6/100Cr6_20241122_143508.html
Removing duplicate file: ./data/html_data/100Cr6/100Cr6_20241122_143513.html
Removing duplicate file: ./data/html_data/100Cr6/100Cr6_20241122_143503.html
Removing duplicate file: ./data/html_data/X45NiCrMo4/X45NiCrMo 4_20241122_154011.html
Removing duplicate file: ./data/html_data/10/10_20241204_173239.html
Removing duplicate file: ./data/html_data/10/10_20241204_173236.html
Removing duplicate file: ./data/html_data/10#/10#_20241121_213731.html
Removing duplicate file: ./data/html_data/10#/10#_20241121_213727.html
Removing duplicate file: ./data/html_data/M50/M50_20241122_152822.html
Removing duplicate file: ./data/html_data/M2/M2_20241122_153941.html
Removing duplicate file: ./data/html_data/20Cr/20Cr_20241125_155901.html
Removing duplicate file: ./data/html

In [3]:
# 清理clean_html_data文件夹中重复的html文件

def clean_duplicate_html(directory):
    title_map = {}  # {title_text: file_path}
    
    # 遍历所有html文件
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.html'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    soup = BeautifulSoup(f, 'html.parser')
                    title_tag = soup.find('title')
                    
                    if title_tag:
                        title_text = title_tag.get_text(strip=True)
                        
                        # 如果标题已存在,删除当前文件
                        if title_text in title_map:
                            print(f"删除重复文件: {file_path}")
                            os.remove(file_path)
                        else:
                            # 保存第一个出现的文件
                            title_map[title_text] = file_path

if __name__ == "__main__":
    directory = './data/clean_html_data'
    clean_duplicate_html(directory)


删除重复文件: ./data/clean_html_data/100Cr6/100Cr6_20241122_144548.html
删除重复文件: ./data/clean_html_data/100Cr6/100Cr6_20241122_144541.html
删除重复文件: ./data/clean_html_data/100Cr6/100Cr6_20241122_143508.html
删除重复文件: ./data/clean_html_data/100Cr6/100Cr 6_20241122_144533.html
删除重复文件: ./data/clean_html_data/100Cr6/100Cr6_20241122_143513.html
删除重复文件: ./data/clean_html_data/100Cr6/100Cr6_20241122_143503.html
删除重复文件: ./data/clean_html_data/X45NiCrMo4/X45NiCrMo 4_20241122_154011.html
删除重复文件: ./data/clean_html_data/10#/10#_20241121_213731.html
删除重复文件: ./data/clean_html_data/10#/10#_20241121_213727.html
删除重复文件: ./data/clean_html_data/M50/M50_20241122_152822.html
删除重复文件: ./data/clean_html_data/M2/M2_20241122_153941.html
删除重复文件: ./data/clean_html_data/SAE1055/SAE1055_20241122_153218.html
删除重复文件: ./data/clean_html_data/SAE1055/SAE1055_20241122_153211.html


In [4]:
# 删除.data/JsonData/重复的json文件
import json
def clean_duplicate_json(directory):
    # 遍历所有材料文件夹
    for root, dirs, _ in os.walk(directory):
        for material_dir in dirs:
            material_path = os.path.join(root, material_dir)
            standard_map = {}  # {standard_code: (file_path, json_data)}
            
            # 遍历材料文件夹中的所有json文件
            for file in os.listdir(material_path):
                if file.endswith('.json'):
                    file_path = os.path.join(material_path, file)
                    
                    with open(file_path, 'r', encoding='utf-8') as f:
                        try:
                            json_data = json.load(f)
                            standard_code = json_data.get('Material', {}).get('BelongsToStandard', {}).get('StandardCode')
                            
                            if standard_code:
                                if standard_code in standard_map:
                                    # 发现重复的标准代码,删除当前文件
                                    print(f"删除重复文件: {file_path}")
                                    os.remove(file_path)
                                else:
                                    # 保存第一个出现的文件
                                    standard_map[standard_code] = (file_path, json_data)
                        except json.JSONDecodeError:
                            print(f"无法解析JSON文件: {file_path}")
                            continue

if __name__ == "__main__":
    directory = './data/JsonData'
    clean_duplicate_json(directory)


删除重复文件: ./data/JsonData/100Cr6/100Cr6_20241122_144530.json
删除重复文件: ./data/JsonData/100Cr6/100Cr6_20241122_144538.json
删除重复文件: ./data/JsonData/100Cr6/100Cr6_20241122_143510.json
删除重复文件: ./data/JsonData/100Cr6/100Cr 6_20241122_143505.json
删除重复文件: ./data/JsonData/100Cr6/100Cr6_20241122_143516.json
删除重复文件: ./data/JsonData/100Cr6/100Cr6_20241122_144545.json
删除重复文件: ./data/JsonData/X45NiCrMo4/X45NiCrMo 4_20241122_153615.json
删除重复文件: ./data/JsonData/18CrMo4/18CrMo4_20241121_214420.json
删除重复文件: ./data/JsonData/18CrMo4/18CrMo4_20241121_214341.json
删除重复文件: ./data/JsonData/18CrMo4/18CrMo4_20241121_214415.json
删除重复文件: ./data/JsonData/18CrMo4/18CrMo4_20241121_214408.json
删除重复文件: ./data/JsonData/SCM420H/SCM420H_20241122_153256.json
删除重复文件: ./data/JsonData/W6Mo5Cr4V2/W6Mo5Cr4V2_20241122_153447.json
删除重复文件: ./data/JsonData/W6Mo5Cr4V2/W6Mo5Cr4V2_20241122_153442.json
