In [1]:
# 导入必要的库
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# 1. Load the data
data_path = "fraudulent.csv"  # make sure this path points at the dataset
data = pd.read_csv(data_path)

# 2. Data preprocessing
# Show basic information about the dataset.
print("数据集基本信息：")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print("\n缺失值统计：")
# Per-column count of missing values.
print(data.isnull().sum())

# Preprocessing is fitted on the training rows only. Fitting the imputer or the
# scaler on the whole dataset before splitting would leak test-set statistics
# into training and make the reported scores optimistic.

# Rows without a label cannot be used for supervised training or scoring, so
# they are dropped rather than given a fabricated label by the imputer.
labeled = data.dropna(subset=['y'])
raw_features = labeled.drop(columns=['y'])
y = labeled['y']  # labels

# 3. Train/test split (80% / 20%, fixed seed so the split is repeatable)
raw_train, raw_test, y_train, y_test = train_test_split(
    raw_features, y, test_size=0.2, random_state=1
)

# Handle missing values: fill each feature with its most frequent value, where
# "most frequent" is learned from the training rows only.
imputer = SimpleImputer(strategy='most_frequent')
train_filled = imputer.fit_transform(raw_train)
test_filled = imputer.transform(raw_test)

# Standardize every feature column using the training mean and variance.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_filled)
X_test = scaler.transform(test_filled)

# Full-dataset views, kept under their original names in case later code reads
# them. They are now produced by the train-fitted transformers above.
X = pd.DataFrame(
    imputer.transform(raw_features),
    columns=raw_features.columns,
    index=raw_features.index,
)  # imputed features
data_filled = X.assign(y=y)  # imputed features plus the label column
X_scaled = scaler.transform(X.to_numpy())  # imputed and standardized features

# 4. Candidate classifiers, keyed by the display name used in the report.
# Insertion order is the order in which they are trained and printed.
models = {}
models["K-Nearest Neighbors"] = KNeighborsClassifier()
models["Decision Tree"] = DecisionTreeClassifier(random_state=1)
models["Logistic Regression"] = LogisticRegression(random_state=1)
models["Support Vector Machine"] = SVC(random_state=1)

# 5. Fit every candidate on the training split and score it on the test split.
# `results` maps each model's display name to its F1 score.
results = {}
for name, model in models.items():
    print(f"\n正在训练模型：{name}")
    # scikit-learn's fit() returns the fitted estimator, so predict() can be
    # chained directly onto it.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[name] = f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1:.4f}")

# 6. Print the F1 score of every model, in the order they were evaluated.
print("\n各模型的 F1 值：")
for model_name, f1 in results.items():
    print(f"{model_name}: {f1:.4f}")

# 7. Select the model with the highest F1 score; on a tie the model that was
# evaluated first is kept.
best_model, _ = max(results.items(), key=lambda entry: entry[1])
print(f"\n最佳模型是：{best_model}，F1 值为：{results[best_model]:.4f}")


数据集基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10086 entries, 0 to 10085
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   contain_IP             9996 non-null   float64
 1   is_long                9997 non-null   float64
 2   is_tinyurl             9998 non-null   float64
 3   contain_at             10004 non-null  float64
 4   contain_double_slash   9970 non-null   float64
 5   contain_dash           9992 non-null   float64
 6   contain_subdomain      9989 non-null   float64
 7   is_SSL                 9990 non-null   float64
 8   with_long_history      7291 non-null   float64
 9   contain_icon           8728 non-null   float64
 10  contain_ext_domain     8559 non-null   float64
 11  contain_email_to       8007 non-null   float64
 12  allow_right_click      6679 non-null   float64
 13  contain_pop_up_window  9807 non-null   float64
 14  contain_Iframe         9427 non-null   float6