In [None]:
# 导入库
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Fetch the NLTK resources used by the tokenizer and the stop-word list
for _nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Load the dataset
df = pd.read_csv('Query_Classification_Dataset.csv')

# Porter stemmer shared by the preprocessing step
stemmer = PorterStemmer()

# Tokens to discard during preprocessing: ASCII punctuation characters and
# NLTK's English stop words
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

# Text preprocessing helper
def preprocess_text(text):
    """Return ``text`` as a space-joined string of stemmed tokens.

    The text is lower-cased and split with ``word_tokenize``; tokens that
    appear in ``stop_words`` or ``punctuation`` are dropped, and every
    remaining token is passed through ``stemmer.stem``.
    """
    stems = [
        stemmer.stem(token)
        for token in word_tokenize(text.lower())
        if not (token in stop_words or token in punctuation)
    ]
    return ' '.join(stems)

# Preprocess every query in the dataset
df['Processed_Query'] = df['Query'].apply(preprocess_text)

# Split the preprocessed text BEFORE vectorizing. Fitting TF-IDF on the full
# corpus would let test-set vocabulary and document frequencies leak into
# the training features and inflate the reported scores.
y = df['Category']
train_queries, test_queries, y_train, y_test = train_test_split(
    df['Processed_Query'], y, test_size=0.2, random_state=42
)

# Fit the TF-IDF vectorizer on the training queries only, then reuse the
# fitted vocabulary/IDF weights to transform the held-out queries
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_queries)
X_test = vectorizer.transform(test_queries)

# Whole-dataset features under their original names, for any later code that
# reads them; they are produced by the train-fitted vectorizer
tfidf_matrix = vectorizer.transform(df['Processed_Query'])
tfidf_array = tfidf_matrix.toarray()
X = tfidf_matrix

# Model training and evaluation helper
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A ``(accuracy, f1)`` tuple, where ``f1`` is the weighted-average
        F1 score of the predictions on ``X_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='weighted'),
    )

# Instantiate every classical classifier up front
lr_model = LogisticRegression(max_iter=1000)
svm_model = SVC(kernel='linear')
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
adaboost_model = AdaBoostClassifier(n_estimators=100, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=5)

# The same train/test split is passed to every evaluation call
_eval_split = (X_train, X_test, y_train, y_test)

# Logistic Regression
lr_accuracy, lr_f1 = train_and_evaluate_model(lr_model, *_eval_split)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression F1-score:", lr_f1)

# SVM (linear kernel)
svm_accuracy, svm_f1 = train_and_evaluate_model(svm_model, *_eval_split)
print("SVM Accuracy:", svm_accuracy)
print("SVM F1-score:", svm_f1)

# Random Forest
rf_accuracy, rf_f1 = train_and_evaluate_model(rf_model, *_eval_split)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest F1-score:", rf_f1)

# Gradient Boosting
gbm_accuracy, gbm_f1 = train_and_evaluate_model(gbm_model, *_eval_split)
print("Gradient Boosting Accuracy:", gbm_accuracy)
print("Gradient Boosting F1-score:", gbm_f1)

# AdaBoost
adaboost_accuracy, adaboost_f1 = train_and_evaluate_model(adaboost_model, *_eval_split)
print("AdaBoost Accuracy:", adaboost_accuracy)
print("AdaBoost F1-score:", adaboost_f1)

# KNN
knn_accuracy, knn_f1 = train_and_evaluate_model(knn_model, *_eval_split)
print("KNN Accuracy:", knn_accuracy)
print("KNN F1-score:", knn_f1)

# BERT model section
def bert_classification(epochs=3, batch_size=16, learning_rate=1e-5, max_length=64):
    """Fine-tune ``bert-base-uncased`` on the query dataset and print test metrics.

    Reads the module-level ``df``: the 'Query' column supplies the input text
    and the 'Category' column the class labels. 20% of the rows are held out
    (``random_state=42``); accuracy and weighted F1 on that held-out split are
    printed.

    Args:
        epochs: Number of passes over the training split.
        batch_size: Batch size used by both the training and test loaders.
        learning_rate: Learning rate given to the AdamW optimizer.
        max_length: Token length each query is padded or truncated to.

    Returns:
        None. Results are reported via ``print``.
    """
    # Resolve the device once. The model and every batch must be on the same
    # device; moving only the batches fails as soon as CUDA is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Pretrained BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def encode_text(text):
        """Tokenize one string into fixed-length PyTorch tensors."""
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    class TextDataset(Dataset):
        """Pairs texts with integer labels; each item is tokenized on access."""

        def __init__(self, texts, labels):
            self.texts = texts
            self.labels = labels

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            encoding = encode_text(self.texts[idx])
            # squeeze(0) drops only the leading batch axis added by
            # return_tensors='pt', so the sequence axis survives any max_length
            return {'input_ids': encoding['input_ids'].squeeze(0),
                    'attention_mask': encoding['attention_mask'].squeeze(0),
                    'labels': torch.tensor(self.labels[idx], dtype=torch.long)}

    # Raw query text and integer-encoded category labels. Taking the label
    # count from the same factorize() call keeps the classifier head size in
    # step with the codes actually produced.
    X = df['Query'].values
    y, categories = df['Category'].factorize()

    # Hold out 20% of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    train_dataset = TextDataset(X_train, y_train)
    test_dataset = TextDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Load the pretrained model with one output per category and place it on
    # the same device as the batches
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(categories))
    model.to(device)

    # torch.optim.AdamW is used in place of the deprecated transformers.AdamW.
    # eps and weight_decay are set explicitly because PyTorch's defaults
    # differ; the values are intended to match the old transformers defaults
    # (NOTE(review): confirm against the transformers version in use).
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

    # Training loop
    model.train()
    for _ in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Passing labels makes the model compute the loss itself
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    y_pred = []
    y_true = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits

            # Predicted class is the index of the largest logit per row
            predicted = logits.argmax(dim=1)
            y_pred.extend(predicted.cpu().numpy())
            # Labels are only compared on the host, so they never need to
            # leave the CPU
            y_true.extend(batch['labels'].numpy())

    # Score the held-out predictions
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"BERT Accuracy: {accuracy}")
    print(f"BERT F1-score: {f1}")

# Run the BERT classification experiment
bert_classification()