In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Step 1: Load the dataset
df = pd.read_excel('RC_Shear_Wall_Database_20240514.xlsx')
print("Initial data preview:")
print(df.head())

Initial data preview:
   Num                Author Specimen  FailureMode      τ/fc  Drift_at_yield  \
0  1.0  Lefas et al. (1990a)     SW11            1  0.094692        0.006061   
1  2.0  Lefas et al. (1990a)     SW12            1  0.120824        0.007273   
2  3.0  Lefas et al. (1990a)     SW13            1  0.154821        0.006061   
3  4.0  Lefas et al. (1990a)     SW14            1  0.119896        0.007273   
4  5.0  Lefas et al. (1990a)     SW15            1  0.140768        0.007273   

   Drift_at_Max        θu  M/Vlw      lw/tw  ...  Moment32  Moment33  \
0      0.009697  0.010000    1.1  10.714286  ...       NaN       NaN   
1      0.010909  0.010739    1.1  10.714286  ...       NaN       NaN   
2      0.012121  0.010764    1.1  10.714286  ...       NaN       NaN   
3      0.012121  0.013588    1.1  10.714286  ...       NaN       NaN   
4      0.010909  0.009758    1.1  10.714286  ...       NaN       NaN   

   Moment34  Moment35  Moment36 Moment37  Moment38  Moment39  Mo

In [2]:
# Step 2: Remove missing values
missing_rate = df.isnull().mean()
# Filter out columns with a missing rate of less than 5% and retain these columns
df_filtered = df.loc[:, missing_rate <= 0.05]
print("Retain attributes that are missing less than 5%：", df_filtered.columns.tolist())

Retain attributes that are missing less than 5%： ['Author', 'Specimen', 'FailureMode', 'τ/fc', 'θu', 'M/Vlw', 'lw/tw', 'ρvwFy,vw/fc', 'ρhwFy,vw/fc', 'ρvcFy,vc/fc', 'ρhcFy,hc/fc', 'P/fcAg', 'Section', 'Ab/Ag', 'Vtest/Ag/sqrt(fc)', 'V@M/Vn_GB', 'V@M/Vn_ACI', 'V@M/Vtest', 'Vn_GB/Vtest', 'Vn_ACI/Vtest', 'Vtest', 'Vn_ACI', 'Vn_GB', 'alpha_c', 'lamda', 'N', 'Vc', 'Vs', 'P', 'fc', 'ft', 'alpha1', 'beta1', 'lw', "as'", 'lw0', 'tw', 'h', 'Aw', 'A', 'Ag', 'Fy,hw', 'Fy,vw', 'Fy,vc', 'ρvw', 'ρhw', 'V@moment', 'ForceEquilibrium', 'Moment', 'x_real', 'eps_cu', 'Ec', 'Es', 'DRein.1', 'DRein.2', 'DRein.3', 'DRein.4', 'DRein.5', 'Area1', 'Area2', 'Area3', 'Area4', 'Area5', 'Force1', 'Force2', 'Force3', 'Force4', 'Force5', 'Moment1', 'Moment2', 'Moment3', 'Moment4', 'Moment5', 'No']


In [3]:
cols_with_missing = df_filtered.columns[df_filtered.isnull().any()]
print("Columns that still contain missing values：", cols_with_missing.tolist())

Columns that still contain missing values： ['DRein.5', 'Area5', 'Force5', 'Moment5']


In [4]:
df = df_filtered.drop(columns=['Author', 'Specimen','No','DRein.5', 'Area5', 'Force5', 'Moment5'])

In [5]:
print(df.isnull().sum())

FailureMode    0
τ/fc           0
θu             0
M/Vlw          0
lw/tw          0
              ..
Force4         0
Moment1        0
Moment2        0
Moment3        0
Moment4        0
Length: 67, dtype: int64


In [7]:
df.to_csv('processed_dataset.csv', index=True)

In [16]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# 读取处理后的数据集（假设 'FailureMode' 为目标变量）
df_processed = pd.read_csv('processed_dataset.csv')
print(df_processed.head(5))
# 分离特征和目标变量
X = df_processed.drop(columns=['FailureMode'])
y = df_processed['FailureMode']

num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# 将数据划分为训练集和测试集，并使用 stratify 参数保持类别分布一致
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("训练集原始类别分布：", Counter(y_train))

# 1. 对训练集和测试集进行标准化处理（SMOTE 基于距离度量，标准化有助于其效果）
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. 在标准化后的训练集上应用 SMOTE 平衡数据
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)
print("应用 SMOTE 后的训练集类别分布：", Counter(y_train_res))

# 3. 在 SMOTE 处理后的训练集上应用 PCA 降维
#    这里选择 n_components=0.95 表示保留 95% 的方差信息，您也可以根据实际情况调整
pca = PCA(n_components=0.95, random_state=42)
X_train_res_pca = pca.fit_transform(X_train_res)
X_test_pca = pca.transform(X_test_scaled)  # 对测试集应用相同的 PCA 变换

print("PCA 降维后的训练集形状：", X_train_res_pca.shape)
print("PCA 降维后的测试集形状：", X_test_pca.shape)

   Unnamed: 0  FailureMode      τ/fc        θu  M/Vlw      lw/tw  ρvwFy,vw/fc  \
0           0            1  0.094692  0.010000    1.1  10.714286     0.268879   
1           1            1  0.120824  0.010739    1.1  10.714286     0.262358   
2           2            1  0.154821  0.010764    1.1  10.714286     0.346364   
3           3            1  0.119896  0.013588    1.1  10.714286     0.334023   
4           4            1  0.140768  0.009758    1.1  10.714286     0.324766   

   ρhwFy,vw/fc  ρvcFy,vc/fc  ρhcFy,hc/fc  ...       Area3       Area4  \
0     0.136711     0.348231     0.149140  ...  100.530965  100.530965   
1     0.133396     0.339785     0.145522  ...  100.530965  100.530965   
2     0.176108     0.448584     0.192118  ...  100.530965  100.530965   
3     0.169834     0.432601     0.185273  ...  100.530965  100.530965   
4     0.165127     0.420612     0.180139  ...  100.530965  100.530965   

        Force1        Force2        Force3        Force4       Moment1  \


ValueError: could not convert string to float: 'B'