<a href="https://colab.research.google.com/github/Zhouyx713/jpmc-task-1/blob/main/Corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Define the directory containing the CSV files
directory_path = r"C:\Users\25410\University of Warwick\Sunar, Ayse - Yuxin paper's\Collection data"

# Loop through all files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".csv"):  # Process only CSV files
        file_path = os.path.join(directory_path, filename)
        print(f"Processing file: {file_path}")
         #the CSV file
        df = pd.read_csv(file_path)

        # Check if the sentiment columns exist and drop them
        columns_to_drop = ['Sentiment', 'Sentiment_Category']
        df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')

        # Save the updated CSV file (overwrite the original file)
        df.to_csv(file_path, index=False, encoding='utf-8-sig')
        print(f"Processed and updated: {file_path}")

In [None]:
import os
import re
import pandas as pd
import thulac
from hanlp_restful import HanLPClient

# Initialize THULAC for word segmentation
thu = thulac.thulac(seg_only=True)

# Initialize HanLPClient for sentiment analysis
HanLP = HanLPClient('https://www.hanlp.com/api', auth='NzAzNkBiYnMuaGFubHAuY29tOmdaUVprQXZlelN3SFIzc1c=', language='zh')

# Function to clean text by removing URLs, usernames, and topics
def clean_text(text):
    # Remove emojis
    text = re.sub(r'[^\w\s,]', '', text)  # Retain only alphanumeric, commas, and spaces
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    # Remove usernames
    text = re.sub(r'@\S+', '', text)
    # Remove topics (e.g., #XXX#)
    text = re.sub(r'#\S+#', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Define the directory containing the CSV files
directory_path = r"C:\Users\25410\University of Warwick\Sunar, Ayse - Yuxin paper's\Collection data"
output_directory = os.path.join(directory_path, "Processed_Files")

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Loop through all files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".csv"):  # Process only CSV files
        input_file_path = os.path.join(directory_path, filename)
        output_file_path = os.path.join(output_directory, f"processed_{filename}")

        # Skip processing if the output file already exists
        if os.path.exists(output_file_path):
            print(f"Skipping already processed file: {filename}")
            continue

        print(f"Processing file: {input_file_path}")

        # Load the CSV file
        df = pd.read_csv(input_file_path)

        # Ensure the text column exists
        if 'Example' not in df.columns:
            raise KeyError(f"Column 'Example' not found in the DataFrame for file: {filename}")

        # Preprocess the 'Example' column to handle non-string or missing values
        df['Example'] = df['Example'].fillna('').astype(str)

        # Clean the 'Example' column by filtering out unwanted elements
        df['Cleaned_Text'] = df['Example'].apply(clean_text)

        # Perform word segmentation using THULAC on the cleaned text
        df['Segmented_Text'] = df['Cleaned_Text'].apply(lambda text: thu.cut(text, text=True))

        # Perform sentiment analysis on the cleaned text
        df['Sentiment'] = df['Segmented_Text'].apply(lambda text: HanLP.sentiment_analysis(text))

        # Categorize the sentiment scores
        df['Sentiment_Category'] = df['Sentiment'].apply(
            lambda score: 'negative' if score < -0.3 else 'neutral' if score <= 0.4 else 'positive'
        )

        # Save results to a separate CSV file
        df.to_csv(output_file_path, index=False, encoding='utf-8-sig')
        print(f"Results saved to {output_file_path}")


In [None]:
import pandas as pd
import os

# Define the directory containing processed files
processed_dir = r"C:\Users\25410\University of Warwick\Sunar, Ayse - Yuxin paper's\Collection data\Processed_Files"
corpus_dir = r"C:\Users\25410\University of Warwick\Sunar, Ayse - Yuxin paper's"
# Initialize an empty list to store DataFrames
dataframes = []

# Load all processed CSV files and combine them
for filename in os.listdir(processed_dir):
    if filename.endswith(".csv"):
        file_path = os.path.join(processed_dir, filename)
        df = pd.read_csv(file_path)
        dataframes.append(df)

# Combine all data into a single DataFrame
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    # Save the combined dataset for training
    combined_output_path = os.path.join(corpus_dir ,"Completed_dataset.csv")
    combined_df.to_csv(combined_output_path, index=False, encoding='utf-8-sig')
    print(f"Combined dataset saved to {combined_output_path}")
else:
    print("No processed files found to combine!")


In [17]:
from google.colab import files

# 弹出文件上传对话框
#uploaded = files.upload()
#df = pd.read_csv(r"C:\Users\25410\University of Warwick\Sunar, Ayse - Yuxin paper's\Completed_dataset.csv")
df = pd.read_csv(r"Completed_dataset.csv")
print(df)
# Ensure necessary columns exist
if 'Cleaned_Text' not in df.columns or 'Sentiment_Category' not in df.columns:
    raise KeyError("Columns 'Cleaned_Text' and 'Sentiment_Category' are required in the dataset.")

# Clean up the sentiment categories first
df['Sentiment_Category'] = df['Sentiment_Category'].str.strip()

# Then apply the mapping
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['Sentiment_Label'] = df['Sentiment_Category'].map(sentiment_mapping)

                                                 Example  \
0                       汪苏泷超话 求问 纪录片是啥 在哪看 打工的孩子2g冲浪了 ​   
1                                          璐璐公主（2g冲浪版） ​   
2      #周震南求生欲三连 到底是什么让南南如此害怕开口？VvvVvV周震南 带着#来看我们的演唱会...   
3                         以闪亮之名超话问问超话主持人是个啥事啊 2g冲浪不清楚 ​   
4             什么2G冲浪速度才知道内娱现役最爱的两个人昨晚坐一桌杨幂王鹤棣你俩啥时候合作一下 ​   
...                                                  ...   
65995                       #射雕疑似被恶意打分#啊对对对，全天下都是哥哥的黑粉 ​   
65996                                         极极批早该当黑粉打了   
65997  应该是宾度黑粉和它们正主一样的洗头佬基本盘的特性。上次见到这种生物还是这种基本盘凑堆侮辱珍古...   
65998  #春节档六部电影豆瓣开分#哈哈哈哈哈哈哈哈，某部电影给别人打一分的时候怎么不说黑粉，被路人盘...   
65999    #射雕豆瓣5.5#33万人打分，票房还才5.6亿，一群恶意低分！黑粉智商也不好啊，闹笑话了 ​   

                                            Cleaned_Text  \
0                          汪苏泷超话 求问 纪录片是啥 在哪看 打工的孩子2g冲浪了   
1                                              璐璐公主2g冲浪版   
2      周震南求生欲三连 到底是什么让南南如此害怕开口VvvVvV周震南 带着来看我们的演唱会 来做...   
3                            以闪亮之名超话问问超

In [None]:
# Ensure necessary columns exist
if 'Segmented_Text' not in df.columns or 'Sentiment_Category' not in df.columns:
    raise KeyError("Columns 'Segmented_Text' and 'Sentiment_Category' are required in the dataset.")

# Clean up the sentiment categories first
df['Sentiment_Category'] = df['Sentiment_Category'].str.strip()

# Then apply the mapping
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['Sentiment_Label'] = df['Sentiment_Category'].map(sentiment_mapping)

# Split the dataset into training and testing sets
X = df['Segmented_Text']  # Features (text data)
y = df['Sentiment_Label']  # Labels (sentiment category)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from tqdm.auto import tqdm

# 1. 轻量级特征工程优化
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),    # 降低计算量
    max_features=8000,     # 减少特征维度
    min_df=10,             # 过滤低频词
    sublinear_tf=True
)

# 2. 关键参数快速搜索空间
param_dist = {
    'clf__n_estimators': [100, 150],          # 减少树的数量范围
    'clf__max_depth': [None, 15],             # 限制深度选项
    'clf__min_samples_split': [5, 10],        # 快速验证关键参数
    'clf__max_features': ['sqrt', 0.6],       # 动态特征抽样
    'clf__class_weight': ['balanced', {0:2, 1:1, 2:1}]  # 调整类别权重
}

# 3. 构建高效管道
pipeline = Pipeline([
    ('tfidf', tfidf),
    ('clf', RandomForestClassifier(n_jobs=-1, random_state=42))
])

# 4. 随机搜索（仅50次迭代）
search = RandomizedSearchCV(
    pipeline,
    param_dist,
    n_iter=20,                # 减少迭代次数
    cv=3,                     # 3折交叉验证
    scoring='f1_weighted',
    random_state=42,
    n_jobs=1                  # 避免并行冲突
)

# 带进度条的搜索
with tqdm(total=20, desc="Optimizing") as pbar:
    search.fit(X_train, y_train)
    pbar.update(20)

# 5. 快速评估
print("最佳参数:", search.best_params_)
y_pred = search.predict(X_test)
print(classification_report(y_test, y_pred))

Optimizing:   0%|          | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m41.0/73.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313466 sha256=92af0

In [12]:
import pandas as pd
import numpy as np
import fasttext
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_imb_pipeline
import matplotlib.pyplot as plt

# 1. 数据准备 ---------------------------------------------------------------
# 假设df已包含清洗后的'Segmented_Text'和'Sentiment_Label'
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['Sentiment_Label'] = df['Sentiment_Category'].map(sentiment_mapping)
texts = df['Segmented_Text'].values
labels = df['Sentiment_Label'].values

# 保存为FastText需要的格式
with open('fasttext_corpus.txt', 'w', encoding='utf-8') as f:
    for i, text in enumerate(texts):
        f.write(f'__label__{labels[i]} {text}\n')  # 添加标签用于监督学习

# 2. 训练FastText词向量模型 -------------------------------------------------
print("训练FastText词向量模型中...")
vector_model = fasttext.train_supervised(
    input='fasttext_corpus.txt',
    dim=100,
    ws=5,
    minn=2,
    maxn=4,
    epoch=50,
    thread=8,
    verbose=2
)

# 3. 生成文本特征 -----------------------------------------------------------
def text_to_vector(text, model):
    """使用FastText生成文本向量"""
    words = text.split()
    vectors = [model.get_word_vector(word) for word in words]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.dim)

print("\n生成文本特征:")
X = np.array([text_to_vector(text, vector_model) for text in tqdm(texts)])
y = labels

# 4. 数据预处理 ------------------------------------------------------------
# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 标准化处理
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 处理类别不平衡
sampler = make_imb_pipeline(
    RandomUnderSampler(random_state=42),
    SMOTE(random_state=42)
)
X_resampled, y_resampled = sampler.fit_resample(X_train_scaled, y_train)


# SVM模型测试
def test_svm():
    model = SVC(kernel='rbf', C=1.0, class_weight='balanced')
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('model', model)
    ])
    pipeline.fit(X_resampled, y_resampled)
    y_pred = pipeline.predict(X_test_scaled)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("SVM模型结果：")
    print(f"F1-Weighted: {f1_weighted}")
    print(f"F1-Macro: {f1_macro}")


# KNN模型测试
def test_knn():
    model = KNeighborsClassifier(n_neighbors=5, weights='distance')
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('model', model)
    ])
    pipeline.fit(X_resampled, y_resampled)
    y_pred = pipeline.predict(X_test_scaled)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("KNN模型结果：")
    print(f"F1-Weighted: {f1_weighted}")
    print(f"F1-Macro: {f1_macro}")


# 朴素贝叶斯模型测试
def test_naive_bayes():
    model = GaussianNB()
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('model', model)
    ])
    pipeline.fit(X_resampled, y_resampled)
    y_pred = pipeline.predict(X_test_scaled)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("NaiveBayes模型结果：")
    print(f"F1-Weighted: {f1_weighted}")
    print(f"F1-Macro: {f1_macro}")


# XGBoost模型测试
def test_xgboost():
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        eval_metric='mlogloss',
        use_label_encoder=False
    )
    params = {
        'model__learning_rate': [0.1, 0.05],
        'model__max_depth': [5, 7]
    }
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('model', model)
    ])
    search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=10,
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    print(f"XGBoost 最佳参数: {search.best_params_}")
    y_pred = best_model.predict(X_test_scaled)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("XGBoost模型结果：")
    print(f"F1-Weighted: {f1_weighted}")
    print(f"F1-Macro: {f1_macro}")


# 随机森林模型测试
def test_random_forest():
    model = RandomForestClassifier(class_weight='balanced')
    params = {
        'model__n_estimators': [100, 150],
        'model__max_depth': [None, 15]
    }
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('model', model)
    ])
    search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=10,
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    print(f"RandomForest 最佳参数: {search.best_params_}")
    y_pred = best_model.predict(X_test_scaled)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("RandomForest模型结果：")
    print(f"F1-Weighted: {f1_weighted}")
    print(f"F1-Macro: {f1_macro}")


# 你可以选择调用下面的函数来单独测试每个模型
# test_svm()
# test_knn()
# test_naive_bayes()
# test_xgboost()
# test_random_forest()


训练FastText词向量模型中...

生成文本特征:


  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
import pandas as pd
import numpy as np
import fasttext
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt

# 假设df已包含清洗后的'Segmented_Text'和'Sentiment_Label'
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
# 这里需要确保df已经定义，以下是模拟定义
data = {
    'Segmented_Text': ['text1', 'text2', 'text3'],
    'Sentiment_Category': ['negative', 'neutral', 'positive']
}
df = pd.DataFrame(data)
df['Sentiment_Label'] = df['Sentiment_Category'].map(sentiment_mapping)
texts = df['Segmented_Text'].values
labels = df['Sentiment_Label'].values

# 保存为FastText需要的格式
with open('fasttext_corpus.txt', 'w', encoding='utf-8') as f:
    for i, text in enumerate(texts):
        f.write(f'__label__{labels[i]} {text}\n')  # 添加标签用于监督学习

# 2. 训练FastText词向量模型 -------------------------------------------------
print("训练FastText词向量模型中...")
vector_model = fasttext.train_supervised(
    input='fasttext_corpus.txt',
    dim=100,
    ws=5,
    minn=2,
    maxn=4,
    epoch=50,
    thread=8,
    verbose=2
)

# 3. 生成文本特征 -----------------------------------------------------------
def text_to_vector(text, model):
    """使用FastText生成文本向量"""
    words = text.split()
    vectors = [model.get_word_vector(word) for word in words]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.dim)

print("\n生成文本特征:")
X = np.array([text_to_vector(text, vector_model) for text in tqdm(texts)])
y = labels

# 4. 数据预处理 ------------------------------------------------------------
# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# SVM模型测试
def test_svm():
    model = SVC(kernel='rbf', C=1.0, class_weight='balanced')
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('under_sampler', RandomUnderSampler(random_state=42)),
        ('over_sampler', SMOTE(random_state=42)),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("SVM模型结果：")
    print(f"F1-Weighted: {f1_weighted}")
    print(f"F1-Macro: {f1_macro}")


# KNN模型测试
def test_knn():
    model = KNeighborsClassifier(n_neighbors=5, weights='distance')
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('under_sampler', RandomUnderSampler(random_state=42)),
        ('over_sampler', SMOTE(random_state=42)),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("KNN模型结果：")
    print(f"F1-Weighted: {f1_weighted}")
    print(f"F1-Macro: {f1_macro}")


# 朴素贝叶斯模型测试
def test_naive_bayes():
    model = GaussianNB()
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('under_sampler', RandomUnderSampler(random_state=42)),
        ('over_sampler', SMOTE(random_state=42)),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("NaiveBayes模型结果：")
    print(f"F1-Weighted: {f1_weighted}")
    print(f"F1-Macro: {f1_macro}")


# XGBoost模型测试
def test_xgboost():
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        eval_metric='mlogloss',
        use_label_encoder=False
    )
    params = {
        'model__learning_rate': [0.1, 0.05],
        'model__max_depth': [5, 7]
    }
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('under_sampler', RandomUnderSampler(random_state=42)),
        ('over_sampler', SMOTE(random_state=42)),
        ('model', model)
    ])
    search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=10,
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    print(f"XGBoost 最佳参数: {search.best_params_}")
    y_pred = best_model.predict(X_test)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("XGBoost模型结果：")
    print(f"F1-Weighted: {f1_weighted}")
    print(f"F1-Macro: {f1_macro}")


# 随机森林模型测试
def test_random_forest():
    model = RandomForestClassifier(class_weight='balanced')
    params = {
        'model__n_estimators': [100, 150],
        'model__max_depth': [None, 15]
    }
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('under_sampler', RandomUnderSampler(random_state=42)),
        ('over_sampler', SMOTE(random_state=42)),
        ('model', model)
    ])
    search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=10,
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    print(f"RandomForest 最佳参数: {search.best_params_}")
    y_pred = best_model.predict(X_test)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("RandomForest模型结果：")
    print(f"F1-Weighted: {f1_weighted}")
    print(f"F1-Macro: {f1_macro}")


# 你可以选择调用下面的函数来单独测试每个模型
test_svm()
test_knn()
test_naive_bayes()
test_xgboost()
test_random_forest()

训练FastText词向量模型中...

生成文本特征:


  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [19]:
import pandas as pd
import numpy as np
import fasttext
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_imb_pipeline
import matplotlib.pyplot as plt

# --------------------- 1. 数据准备 ---------------------
# 请确保 df 包含原始66k行数据
print("原始数据条数:", len(df))
# 假设 df 包含 'Segmented_Text' 和 'Sentiment_Category'
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['Sentiment_Label'] = df['Sentiment_Category'].map(sentiment_mapping)

# 使用完整数据
texts = df['Segmented_Text'].values
labels = df['Sentiment_Label'].values

# 保存为 FastText 格式
with open('fasttext_corpus.txt', 'w', encoding='utf-8') as f:
    for i, text in enumerate(texts):
        f.write(f'__label__{labels[i]} {text}\n')  # fastText格式要求

# --------------------- 2. 训练 FastText 词向量模型 ---------------------
print("训练FastText词向量模型中...")
vector_model = fasttext.train_supervised(
    input='fasttext_corpus.txt',
    dim=100,
    ws=5,
    minn=2,
    maxn=4,
    epoch=50,
    thread=8,
    verbose=2
)

# --------------------- 3. 生成文本特征 ---------------------
def text_to_vector(text, model):
    """使用FastText生成文本向量"""
    words = text.split()
    # 兼容不同fastText版本
    try:
        dim = model.dim
    except AttributeError:
        dim = model.get_dimension()
    vectors = [model.get_word_vector(word) for word in words]
    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

print("\n生成文本特征:")
X = np.array([text_to_vector(text, vector_model) for text in tqdm(texts)])
y = labels
print(X)
print(y)

# --------------------- 4. 数据预处理 ---------------------
# 使用原始X、y进行划分，预处理将在Pipeline内部进行
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 定义采样器（在Pipeline中使用，确保每次fit时进行采样）
sampler = make_imb_pipeline(
    RandomUnderSampler(random_state=42),
    SMOTE(random_state=42)
)

# --------------------- 5. 模型测试 ---------------------
# 每个模型的Pipeline内部都包含StandardScaler和采样器，使用原始训练数据进行预处理

# SVM 模型测试
def test_svm():
    model = SVC(kernel='rbf', C=1.0, class_weight='balanced', probability=True)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print("SVM模型结果：")
    print(classification_report(y_test, y_pred))
    print(f"F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Macro: {f1_score(y_test, y_pred, average='macro'):.4f}")

# KNN 模型测试
def test_knn():
    model = KNeighborsClassifier(n_neighbors=5, weights='distance')
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print("KNN模型结果：")
    print(classification_report(y_test, y_pred))
    print(f"F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Macro: {f1_score(y_test, y_pred, average='macro'):.4f}")

# 朴素贝叶斯 模型测试（GaussianNB）
def test_naive_bayes():
    model = GaussianNB()
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print("NaiveBayes模型结果：")
    print(classification_report(y_test, y_pred))
    print(f"F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Macro: {f1_score(y_test, y_pred, average='macro'):.4f}")

# XGBoost 模型测试
def test_xgboost():
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        eval_metric='mlogloss',
        use_label_encoder=False
    )
    params = {
        'model__learning_rate': [0.1, 0.05],
        'model__max_depth': [5, 7]
    }
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('model', model)
    ])
    search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=10,
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    print(f"XGBoost 最佳参数: {search.best_params_}")
    y_pred = best_model.predict(X_test)
    print("XGBoost模型结果：")
    print(classification_report(y_test, y_pred))
    print(f"F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Macro: {f1_score(y_test, y_pred, average='macro'):.4f}")

# 随机森林 模型测试
def test_random_forest():
    model = RandomForestClassifier(class_weight='balanced')
    params = {
        'model__n_estimators': [100, 150],
        'model__max_depth': [None, 15]
    }
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('sampler', sampler),
        ('model', model)
    ])
    search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=10,
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    print(f"RandomForest 最佳参数: {search.best_params_}")
    y_pred = best_model.predict(X_test)
    print("RandomForest模型结果：")
    print(classification_report(y_test, y_pred))
    print(f"F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Macro: {f1_score(y_test, y_pred, average='macro'):.4f}")

# 你可以根据需要调用下面的函数测试各个模型
# test_svm()
# test_knn()
# test_naive_bayes()
# test_xgboost()
# test_random_forest()


原始数据条数: 66000
训练FastText词向量模型中...

生成文本特征:


  0%|          | 0/66000 [00:00<?, ?it/s]

[[ 6.4657792e-03  4.1832379e-03 -2.6658941e-03 ...  1.2073573e-03
   3.5101380e-03  4.0739649e-03]
 [ 4.0883929e-03  1.7671980e-03  2.3813545e-05 ...  3.4008283e-04
   1.9360840e-03  2.3465469e-03]
 [-2.8680328e-03 -4.4188234e-03  1.0651827e-03 ... -1.1067734e-03
  -1.8755017e-03 -5.2394951e-04]
 ...
 [-1.5442401e-02 -1.7118618e-02  8.2987184e-03 ... -3.7225063e-03
  -1.0728191e-02 -9.9051762e-03]
 [-1.5752677e-02 -1.9128088e-02  1.0888919e-02 ... -2.8834832e-03
  -1.3976880e-02 -1.4870948e-02]
 [-2.3918500e-02 -3.3045124e-02  7.0988885e-03 ... -8.9623062e-03
  -8.1822341e-03 -5.5941097e-03]]
[2 2 0 ... 0 0 0]


In [29]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score

def test_svm():
    model = SVC(kernel='rbf', C=1.0, class_weight='balanced', probability=True, random_state=42)

    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('rus', RandomUnderSampler(random_state=42)),
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print("【SVM 模型结果】")
    print(classification_report(y_test, y_pred))
    print(f"F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Macro: {f1_score(y_test, y_pred, average='macro'):.4f}")

# 调用测试
test_svm()


【SVM 模型结果】
              precision    recall  f1-score   support

           0       0.89      0.92      0.90      3660
           1       0.86      0.85      0.86      3984
           2       0.93      0.92      0.92      5556

    accuracy                           0.90     13200
   macro avg       0.89      0.90      0.89     13200
weighted avg       0.90      0.90      0.90     13200

F1-Weighted: 0.8975
F1-Macro: 0.8943


In [27]:
from sklearn.naive_bayes import GaussianNB

def test_naive_bayes():
    model = GaussianNB()

    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('rus', RandomUnderSampler(random_state=42)),
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print("【Naive Bayes 模型结果】")
    print(classification_report(y_test, y_pred))
    print(f"F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Macro: {f1_score(y_test, y_pred, average='macro'):.4f}")

# 调用测试
test_naive_bayes()


【Naive Bayes 模型结果】
              precision    recall  f1-score   support

           0       0.90      0.80      0.85      3660
           1       0.71      0.86      0.78      3984
           2       0.93      0.86      0.89      5556

    accuracy                           0.84     13200
   macro avg       0.85      0.84      0.84     13200
weighted avg       0.86      0.84      0.85     13200

F1-Weighted: 0.8461
F1-Macro: 0.8400


In [25]:
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.neighbors import KNeighborsClassifier

def test_knn():
    model = KNeighborsClassifier(n_neighbors=5, weights='distance')

    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('rus', RandomUnderSampler(random_state=42)),
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print("【KNN 模型结果】")
    print(classification_report(y_test, y_pred))
    print(f"F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Macro: {f1_score(y_test, y_pred, average='macro'):.4f}")

# 调用测试
test_knn()


【KNN 模型结果】
              precision    recall  f1-score   support

           0       0.87      0.92      0.89      3660
           1       0.88      0.79      0.83      3984
           2       0.91      0.93      0.92      5556

    accuracy                           0.89     13200
   macro avg       0.88      0.88      0.88     13200
weighted avg       0.89      0.89      0.89     13200

F1-Weighted: 0.8850
F1-Macro: 0.8810


In [21]:
from imblearn.pipeline import Pipeline  as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

def test_xgboost():
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        eval_metric='mlogloss',
        use_label_encoder=False
    )
    params = {
        'model__learning_rate': [0.1, 0.05],
        'model__max_depth': [5, 7]
    }
    # 使用 imblearn 的 Pipeline 来包含采样步骤
    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('randomundersampler', RandomUnderSampler(random_state=42)),
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])
    search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=10,
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    print(f"XGBoost 最佳参数: {search.best_params_}")
    y_pred = best_model.predict(X_test)
    print("XGBoost模型结果：")
    print(classification_report(y_test, y_pred))
    print(f"F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Macro: {f1_score(y_test, y_pred, average='macro'):.4f}")


In [22]:
test_xgboost()

Parameters: { "use_label_encoder" } are not used.



XGBoost 最佳参数: {'model__max_depth': 7, 'model__learning_rate': 0.05}
XGBoost模型结果：
              precision    recall  f1-score   support

           0       0.89      0.92      0.90      3660
           1       0.86      0.86      0.86      3984
           2       0.94      0.92      0.93      5556

    accuracy                           0.90     13200
   macro avg       0.89      0.90      0.90     13200
weighted avg       0.90      0.90      0.90     13200

F1-Weighted: 0.8994
F1-Macro: 0.8960


In [24]:
from sklearn.ensemble import RandomForestClassifier

def test_random_forest():
    model = RandomForestClassifier(class_weight='balanced', random_state=42)

    params = {
        'model__n_estimators': [100, 150],
        'model__max_depth': [None, 15]
    }

    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('rus', RandomUnderSampler(random_state=42)),
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=4,  # 参数组合较少
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=42
    )

    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    print("【随机森林 模型】")
    print(f"RandomForest 最佳参数: {search.best_params_}")

    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(f"F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Macro: {f1_score(y_test, y_pred, average='macro'):.4f}")

# 调用测试
test_random_forest()


【随机森林 模型】
RandomForest 最佳参数: {'model__n_estimators': 150, 'model__max_depth': 15}
              precision    recall  f1-score   support

           0       0.89      0.92      0.91      3660
           1       0.86      0.86      0.86      3984
           2       0.93      0.92      0.93      5556

    accuracy                           0.90     13200
   macro avg       0.90      0.90      0.90     13200
weighted avg       0.90      0.90      0.90     13200

F1-Weighted: 0.9005
F1-Macro: 0.8974


In [18]:
unique, counts = np.unique(y, return_counts=True)
print(dict(zip(unique, counts)))
print(df)

{0: 1, 1: 1, 2: 1}
                                                 Example  \
0                       汪苏泷超话 求问 纪录片是啥 在哪看 打工的孩子2g冲浪了 ​   
1                                          璐璐公主（2g冲浪版） ​   
2      #周震南求生欲三连 到底是什么让南南如此害怕开口？VvvVvV周震南 带着#来看我们的演唱会...   
3                         以闪亮之名超话问问超话主持人是个啥事啊 2g冲浪不清楚 ​   
4             什么2G冲浪速度才知道内娱现役最爱的两个人昨晚坐一桌杨幂王鹤棣你俩啥时候合作一下 ​   
...                                                  ...   
65995                       #射雕疑似被恶意打分#啊对对对，全天下都是哥哥的黑粉 ​   
65996                                         极极批早该当黑粉打了   
65997  应该是宾度黑粉和它们正主一样的洗头佬基本盘的特性。上次见到这种生物还是这种基本盘凑堆侮辱珍古...   
65998  #春节档六部电影豆瓣开分#哈哈哈哈哈哈哈哈，某部电影给别人打一分的时候怎么不说黑粉，被路人盘...   
65999    #射雕豆瓣5.5#33万人打分，票房还才5.6亿，一群恶意低分！黑粉智商也不好啊，闹笑话了 ​   

                                            Cleaned_Text  \
0                          汪苏泷超话 求问 纪录片是啥 在哪看 打工的孩子2g冲浪了   
1                                              璐璐公主2g冲浪版   
2      周震南求生欲三连 到底是什么让南南如此害怕开口VvvVvV周震南 带着来看我们的演唱会 来做...   
3                   