# 源域处理

In [1]:
import os
import pandas as pd
# Root folder of the source-domain dataset.
# Written as a raw string: the Windows path contains backslashes (e.g. "\E")
# that a normal string literal parses as invalid escape sequences, which
# raises a SyntaxWarning today and is slated to become an error.
target_dir = r'E:\硕士\研究生数模\中文赛题ACDEF题\E题\数据集\源域数据集'

# Maps file name (without the ".csv" extension) -> list of the values found
# in the first column of that file.
csv_data_dict = {}

# Walk the target folder and every sub-folder below it
for root, dirs, files in os.walk(target_dir):
    for file in files:
        # Only CSV files are loaded; the extension check is case-insensitive
        if not file.lower().endswith('.csv'):
            continue

        file_path = os.path.join(root, file)
        # NOTE: the key is the bare file name, so two files sharing a name in
        # different sub-folders end up under the same key (last one wins).
        file_name = os.path.splitext(file)[0]

        try:
            # Read only the first column; header=None keeps the first row as data
            df = pd.read_csv(file_path, usecols=[0], header=None)
            time_vector = df.iloc[:, 0].tolist()
            csv_data_dict[file_name] = time_vector
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest
            print(f"处理文件 {file_path} 时出错: {str(e)}")

# One row per file: its name and the list read from its first column
wide_table = pd.DataFrame({
    'file_name': list(csv_data_dict.keys()),
    'time_vector': list(csv_data_dict.values())
})

# Report how many files were loaded, then display the table
print(f"总共处理了 {len(csv_data_dict)} 个CSV文件")

wide_table

  target_dir = 'E:\硕士\研究生数模\中文赛题ACDEF题\E题\数据集\源域数据集'


总共处理了 562 个CSV文件


Unnamed: 0,file_name,time_vector
0,B007_0_X118RPM,[1796]
1,B007_0_X118_BA_time,"[-0.0138830129577656, -0.0583453274978249, -0...."
2,B007_0_X118_DE_time,"[0.1961220931776215, -0.0104953918523185, -0.0..."
3,B007_0_X118_FE_time,"[-0.0709659387026888, 0.1266813340245839, -0.1..."
4,B007_1_X119RPM,[1772]
...,...,...
557,N_2_(1750rpm)_X099_DE_time,"[0.0993486295841983, 0.0688907834303522, 0.053..."
558,N_2_(1750rpm)_X099_FE_time,"[-0.0786006764519622, -0.0541515855428713, 0.0..."
559,N_3_X100RPM,[1725]
560,N_3_X100_DE_time,"[-0.014753283453705, -0.082553283453705, -0.12..."


In [2]:
# Collect the value stored in every file whose name ends with "RPM".
# Only the first element of time_vector is kept (the vector is assumed to
# hold a single value); rows with an empty vector are skipped.
rpm_values = {
    name: vector[0]
    for name, vector in zip(wide_table['file_name'], wide_table['time_vector'])
    if name.endswith('RPM') and len(vector) >= 1
}

# 函数：根据文件名查找对应的RPM值
def find_matching_rpm(file_name, rpm_values):
    """Look up the RPM value that belongs to ``file_name``.

    The first three underscore-separated parts of ``file_name`` form the
    lookup pattern, e.g. ``B007_0_X118_BA_time`` -> ``B007_0_X118``, which
    corresponds to the RPM entry ``B007_0_X118RPM``.

    Args:
        file_name: File name without extension.
        rpm_values: Dict mapping RPM file names to their RPM value.

    Returns:
        The matching RPM value, or None when ``file_name`` has fewer than
        three parts or no RPM entry matches.
    """
    parts = file_name.split('_')
    if len(parts) < 3:
        return None

    base_pattern = '_'.join(parts[:3])

    # Prefer an exact key: "<pattern>RPM" for signal files, or the pattern
    # itself when file_name already is an RPM file name. A pure substring
    # test would let "B007_0_X11" pick up the entry for "B007_0_X118RPM".
    for candidate in (base_pattern + 'RPM', base_pattern):
        if candidate in rpm_values:
            return rpm_values[candidate]

    # Fall back to the original substring search (first hit in dict order)
    for rpm_file, rpm_value in rpm_values.items():
        if base_pattern in rpm_file:
            return rpm_value

    # Nothing matched
    return None

# Attach an RPM column: each file name is looked up in rpm_values
wide_table['RPM'] = wide_table['file_name'].apply(find_matching_rpm, args=(rpm_values,))

# Report the file counts, then display the table
print(f"总共处理了 {len(csv_data_dict)} 个CSV文件")
print(f"其中以RPM结尾的文件有 {len(rpm_values)} 个")
wide_table

总共处理了 562 个CSV文件
其中以RPM结尾的文件有 151 个


Unnamed: 0,file_name,time_vector,RPM
0,B007_0_X118RPM,[1796],1796.0
1,B007_0_X118_BA_time,"[-0.0138830129577656, -0.0583453274978249, -0....",1796.0
2,B007_0_X118_DE_time,"[0.1961220931776215, -0.0104953918523185, -0.0...",1796.0
3,B007_0_X118_FE_time,"[-0.0709659387026888, 0.1266813340245839, -0.1...",1796.0
4,B007_1_X119RPM,[1772],1772.0
...,...,...,...
557,N_2_(1750rpm)_X099_DE_time,"[0.0993486295841983, 0.0688907834303522, 0.053...",
558,N_2_(1750rpm)_X099_FE_time,"[-0.0786006764519622, -0.0541515855428713, 0.0...",
559,N_3_X100RPM,[1725],1725.0
560,N_3_X100_DE_time,"[-0.014753283453705, -0.082553283453705, -0.12...",1725.0


In [3]:
# Display the rows whose RPM is still missing
missing_rpm_rows = wide_table['RPM'].isna()
wide_table[missing_rpm_rows]

Unnamed: 0,file_name,time_vector,RPM
48,B028_0_(1797rpm)_X048_DE_time,"[0.1821754971912484, 0.5125782971912484, -0.17...",
49,B028_1_(1772rpm)_X049_DE_time,"[0.1551806773112706, 0.2943404773112706, -0.36...",
50,B028_2_(1750rpm)_X050_DE_time,"[-0.5853555224277591, -2.253645522427759, 1.76...",
51,B028_3_(1730rpm)_X051_DE_time,"[1.2424327528582293, 2.644610152858229, -0.935...",
100,IR028_0_(1797rpm)_X056_DE_time,"[2.0711510726252267, 1.7110445726252268, -0.65...",
101,IR028_1_(1772rpm)_X057_DE_time,"[-0.3662581435686562, -0.5603494435686562, 0.2...",
102,IR028_2_(1750rpm)_X058_DE_time,"[-0.3001356445451624, 1.2652086554548376, 1.05...",
103,IR028_3_(1730rpm)_X059_DE_time,"[0.8382640225688074, 1.2769022225688074, -0.12...",
555,N_1_(1772rpm)_X098_DE_time,"[0.0648322052388439, 0.0594082052388439, 0.043...",
556,N_1_(1772rpm)_X098_FE_time,"[0.0625447439221947, 0.0216592893767402, -0.02...",


In [4]:
import re
# 函数：从文件名括号中提取RPM值
def extract_rpm_from_filename(file_name):
    """Read the RPM written in parentheses inside a file name.

    Searches for ``(<digits>rpm)``, e.g. ``(1797rpm)``; whitespace between
    the digits and ``rpm`` is allowed and the match is case-insensitive.

    Returns:
        The digits as an int, or None when the pattern is absent.
    """
    found = re.search(r'\((\d+)\s*rpm\)', file_name, re.IGNORECASE)
    return int(found.group(1)) if found else None
# Fill the still-missing RPM values from the "(NNNNrpm)" tag in the file name
rpm_null_mask = wide_table['RPM'].isna()
rpm_from_names = wide_table.loc[rpm_null_mask, 'file_name'].apply(extract_rpm_from_filename)
wide_table.loc[rpm_null_mask, 'RPM'] = rpm_from_names
wide_table

Unnamed: 0,file_name,time_vector,RPM
0,B007_0_X118RPM,[1796],1796.0
1,B007_0_X118_BA_time,"[-0.0138830129577656, -0.0583453274978249, -0....",1796.0
2,B007_0_X118_DE_time,"[0.1961220931776215, -0.0104953918523185, -0.0...",1796.0
3,B007_0_X118_FE_time,"[-0.0709659387026888, 0.1266813340245839, -0.1...",1796.0
4,B007_1_X119RPM,[1772],1772.0
...,...,...,...
557,N_2_(1750rpm)_X099_DE_time,"[0.0993486295841983, 0.0688907834303522, 0.053...",1750.0
558,N_2_(1750rpm)_X099_FE_time,"[-0.0786006764519622, -0.0541515855428713, 0.0...",1750.0
559,N_3_X100RPM,[1725],1725.0
560,N_3_X100_DE_time,"[-0.014753283453705, -0.082553283453705, -0.12...",1725.0


In [5]:
# Print wide_table's column dtypes and non-null counts
wide_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 562 entries, 0 to 561
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   file_name    562 non-null    object 
 1   time_vector  562 non-null    object 
 2   RPM          562 non-null    float64
dtypes: float64(1), object(2)
memory usage: 13.3+ KB


In [6]:
# Keep only the rows whose file name does not end with "RPM"
is_rpm_row = wide_table['file_name'].str.endswith('RPM')
mask = ~is_rpm_row
wide_table = wide_table.loc[mask]

wide_table



Unnamed: 0,file_name,time_vector,RPM
1,B007_0_X118_BA_time,"[-0.0138830129577656, -0.0583453274978249, -0....",1796.0
2,B007_0_X118_DE_time,"[0.1961220931776215, -0.0104953918523185, -0.0...",1796.0
3,B007_0_X118_FE_time,"[-0.0709659387026888, 0.1266813340245839, -0.1...",1796.0
5,B007_1_X119_BA_time,"[-0.0249494402814408, -0.0365378082339631, -0....",1772.0
6,B007_1_X119_DE_time,"[0.1609798214890538, -0.1171091206267146, -0.0...",1772.0
...,...,...,...
556,N_1_(1772rpm)_X098_FE_time,"[0.0625447439221947, 0.0216592893767402, -0.02...",1772.0
557,N_2_(1750rpm)_X099_DE_time,"[0.0993486295841983, 0.0688907834303522, 0.053...",1750.0
558,N_2_(1750rpm)_X099_FE_time,"[-0.0786006764519622, -0.0541515855428713, 0.0...",1750.0
560,N_3_X100_DE_time,"[-0.014753283453705, -0.082553283453705, -0.12...",1725.0


In [9]:
# Gather the DE/FE/BA rows that share a base name into one record.
# processed_data maps base_name -> {'RPM': ..., '<suffix>': time_vector, ...}
processed_data = {}

# Suffixes recognised at the end of a file name (each preceded by "_")
channel_suffixes = ('DE_time', 'FE_time', 'BA_time')

for _, row in wide_table.iterrows():
    file_name = row['file_name']
    time_vector = row['time_vector']

    # RPM of this row, if the column exists
    rpm_value = row.get('RPM', None)

    # Find which suffix the name ends with; rows with any other ending are skipped
    suffix = next((s for s in channel_suffixes if file_name.endswith('_' + s)), None)
    if suffix is None:
        continue
    # Strip "_<suffix>" (length derived from the suffix instead of a literal 8)
    base_name = file_name[:-(len(suffix) + 1)]

    # Names such as "B028_0_(1797rpm)_X048" carry an extra "(NNNNrpm)" part in
    # third position; drop it so the base name has the usual three parts.
    parts = base_name.split('_')
    if len(parts) == 4 and '(' in parts[2] and 'rpm' in parts[2].lower():
        base_name = '_'.join([parts[0], parts[1], parts[3]])

    entry = processed_data.setdefault(base_name, {'RPM': rpm_value})
    # Keep the first usable RPM. A missing value in a float column is NaN,
    # not None, so pd.isna is used to detect both.
    if pd.isna(entry.get('RPM')):
        entry['RPM'] = rpm_value

    # Store the signal under its suffix
    entry[suffix] = time_vector

# One record per base name; a suffix with no data gets an empty list
data_for_df = [
    {
        'base_name': base_name,
        'RPM': suffix_data.get('RPM', None),
        'DE_time': suffix_data.get('DE_time', []),
        'FE_time': suffix_data.get('FE_time', []),
        'BA_time': suffix_data.get('BA_time', []),
    }
    for base_name, suffix_data in processed_data.items()
]

# Build the new wide table; explicit columns keep the layout even when empty
result_table = pd.DataFrame(
    data_for_df,
    columns=['base_name', 'RPM', 'DE_time', 'FE_time', 'BA_time'],
)

# Report the result
print(f"总共处理了 {len(processed_data)} 个基础文件名")
print(f"成功归纳了 {len(result_table)} 组数据")

# Prepare empty columns for the variables parsed out of base_name
for parsed_column in ('id', 'load', 'status', 'size', 'position'):
    result_table[parsed_column] = None

# 遍历result_table，处理每一行的base_name
def parse_base_name(base_name):
    """Split a base name such as ``OR021@3_3_X253`` into its fields.

    The name is split on ``_``. The second part is the load and the third
    the id. The first part starts with a status prefix (``B``, ``IR``,
    ``OR`` or ``N``); what follows is the size (digits only), optionally
    followed by ``@`` and a position.

    Returns:
        Tuple ``(id, load, status, size, position)`` of strings. Fields that
        are absent are None; all five are None when the name has fewer than
        three parts.
    """
    fields = base_name.split('_')
    if len(fields) < 3:
        return None, None, None, None, None

    head, load_value, id_value = fields[0], fields[1], fields[2]

    # Identify the status prefix; without one the whole head is kept for parsing
    status = None
    remaining = head
    for prefix in ('B', 'IR', 'OR', 'N'):
        if head.startswith(prefix):
            status = prefix
            remaining = head[len(prefix):]
            break

    size_value = None
    position_value = None
    if remaining:
        # Text before the first "@" carries the size, text after it the position
        size_part, at_sign, position_part = remaining.partition('@')
        digits = ''.join(ch for ch in size_part if ch.isdigit())
        size_value = digits or None
        if at_sign:
            position_value = position_part or None

    return id_value, load_value, status, size_value, position_value

# 应用解析函数到每一行
def apply_parse(row):
    """Fill a row's id/load/status/size/position from its ``base_name``.

    The row is modified in place and also returned, so the function can be
    passed to ``DataFrame.apply(..., axis=1)``.
    """
    parsed = parse_base_name(row['base_name'])
    for column, value in zip(('id', 'load', 'status', 'size', 'position'), parsed):
        row[column] = value
    return row

# Run apply_parse over every row of the DataFrame, then display the result
result_table = result_table.apply(apply_parse, axis=1)

result_table

总共处理了 161 个基础文件名
成功归纳了 161 组数据


Unnamed: 0,base_name,RPM,DE_time,FE_time,BA_time,id,load,status,size,position
0,B007_0_X118,1796.0,"[0.1961220931776215, -0.0104953918523185, -0.0...","[-0.0709659387026888, 0.1266813340245839, -0.1...","[-0.0138830129577656, -0.0583453274978249, -0....",X118,0,B,007,
1,B007_1_X119,1772.0,"[0.1609798214890538, -0.1171091206267146, -0.0...","[0.002356657217093, -0.0216815246010887, -0.01...","[-0.0249494402814408, -0.0365378082339631, -0....",X119,1,B,007,
2,B007_2_X120,1748.0,"[-0.101050582052191, -0.1302889054054843, 0.13...","[0.0983212263408709, 0.0307266808863254, -0.00...","[0.0747937868643513, 0.0793406117901673, 0.118...",X120,2,B,007,
3,B007_3_X121,1722.0,"[-0.1674470103726207, -0.1500664514903852, 0.2...","[-0.0110394075906652, 0.0421733196820619, 0.02...","[0.0098243492584238, 0.0214931919883942, 0.031...",X121,3,B,007,
4,B014_0_X185,1796.0,"[0.1147019725901184, -0.1916506821005003, -0.3...","[0.4769270594625406, 0.2692125140079952, 0.049...","[0.0246583549151965, -0.0034675798029043, 0.01...",X185,0,B,014,
...,...,...,...,...,...,...,...,...,...,...
156,OR021@3_3_X253,1719.0,"[-0.4945780919399304, -0.3059434252732638, -0....","[-0.0817050953713713, -0.056222469108745, -0.0...",[],X253,3,OR,021,3
157,N_0_X097,1796.0,"[0.0406387118769775, 0.0761033272615929, 0.087...","[0.1142914584108182, 0.0664205493199091, 0.023...",[],X097,0,N,,
158,N_1_X098,1772.0,"[0.0648322052388439, 0.0594082052388439, 0.043...","[0.0625447439221947, 0.0216592893767402, -0.02...",[],X098,1,N,,
159,N_2_X099,1750.0,"[0.0993486295841983, 0.0688907834303522, 0.053...","[-0.0786006764519622, -0.0541515855428713, 0.0...",[],X099,2,N,,


In [10]:
# 定义解析base_name的函数来提取所需变量
def parse_base_name(base_name):
    """Break a base name like ``OR021@3_3_X253`` into its descriptive parts.

    NOTE(review): a function with this name is also defined in an earlier
    cell; this definition shadows it.

    After splitting on ``_``: part 2 is the load, part 3 the id, and part 1
    holds a status prefix (``B``, ``IR``, ``OR`` or ``N``) followed by the
    size digits and, optionally, ``@`` plus a position.

    Returns:
        Tuple ``(id, load, status, size, position)`` of strings. Absent
        fields are None; every field is None when the name has fewer than
        three parts.
    """
    fields = base_name.split('_')
    if len(fields) < 3:
        return None, None, None, None, None

    head, load_value, id_value = fields[0], fields[1], fields[2]

    # Detect the status prefix; if none applies the full head is parsed below
    status = None
    remaining = head
    for prefix in ('B', 'IR', 'OR', 'N'):
        if head.startswith(prefix):
            status = prefix
            remaining = head[len(prefix):]
            break

    size_value = None
    position_value = None
    if remaining:
        # Size comes from the text before the first "@", position from the text after
        size_part, at_sign, position_part = remaining.partition('@')
        digits = ''.join(ch for ch in size_part if ch.isdigit())
        size_value = digits or None
        if at_sign:
            position_value = position_part or None

    return id_value, load_value, status, size_value, position_value

# 应用解析函数到每一行
def apply_parse(row):
    """Write the fields parsed from ``row['base_name']`` back into the row.

    NOTE(review): a function with this name is also defined in an earlier
    cell; this definition shadows it.

    Sets ``id``, ``load``, ``status``, ``size`` and ``position`` on the row
    and returns the same row, as expected by ``DataFrame.apply(axis=1)``.
    """
    parsed = parse_base_name(row['base_name'])
    for column, value in zip(('id', 'load', 'status', 'size', 'position'), parsed):
        row[column] = value
    return row

# Process result_table: reset the columns that receive the parsed variables
for parsed_column in ('id', 'load', 'status', 'size', 'position'):
    result_table[parsed_column] = None

# Parse every row's base_name
result_table = result_table.apply(apply_parse, axis=1)

result_table

Unnamed: 0,base_name,RPM,DE_time,FE_time,BA_time,id,load,status,size,position
0,B007_0_X118,1796.0,"[0.1961220931776215, -0.0104953918523185, -0.0...","[-0.0709659387026888, 0.1266813340245839, -0.1...","[-0.0138830129577656, -0.0583453274978249, -0....",X118,0,B,007,
1,B007_1_X119,1772.0,"[0.1609798214890538, -0.1171091206267146, -0.0...","[0.002356657217093, -0.0216815246010887, -0.01...","[-0.0249494402814408, -0.0365378082339631, -0....",X119,1,B,007,
2,B007_2_X120,1748.0,"[-0.101050582052191, -0.1302889054054843, 0.13...","[0.0983212263408709, 0.0307266808863254, -0.00...","[0.0747937868643513, 0.0793406117901673, 0.118...",X120,2,B,007,
3,B007_3_X121,1722.0,"[-0.1674470103726207, -0.1500664514903852, 0.2...","[-0.0110394075906652, 0.0421733196820619, 0.02...","[0.0098243492584238, 0.0214931919883942, 0.031...",X121,3,B,007,
4,B014_0_X185,1796.0,"[0.1147019725901184, -0.1916506821005003, -0.3...","[0.4769270594625406, 0.2692125140079952, 0.049...","[0.0246583549151965, -0.0034675798029043, 0.01...",X185,0,B,014,
...,...,...,...,...,...,...,...,...,...,...
156,OR021@3_3_X253,1719.0,"[-0.4945780919399304, -0.3059434252732638, -0....","[-0.0817050953713713, -0.056222469108745, -0.0...",[],X253,3,OR,021,3
157,N_0_X097,1796.0,"[0.0406387118769775, 0.0761033272615929, 0.087...","[0.1142914584108182, 0.0664205493199091, 0.023...",[],X097,0,N,,
158,N_1_X098,1772.0,"[0.0648322052388439, 0.0594082052388439, 0.043...","[0.0625447439221947, 0.0216592893767402, -0.02...",[],X098,1,N,,
159,N_2_X099,1750.0,"[0.0993486295841983, 0.0688907834303522, 0.053...","[-0.0786006764519622, -0.0541515855428713, 0.0...",[],X099,2,N,,
