# 基于机器学习的文本情感分类（如微博评论等）

> 语言推荐：python；设计组成：数据采集与分析、机器学习算法原理、详细的可视化结果

## 01 数据整理与清洗

对已有的微博数据集进行整理与清洗，并从中提取后续分析所需的有用特征。

In [1]:
import json

# Load the raw Weibo dump; the parsed top-level JSON value is kept in
# `user_weibo_data` for the extraction step that follows.
with open("data/1user_weibo_analysis_data.json", encoding="utf-8") as raw_file:
    user_weibo_data: dict = json.loads(raw_file.read())

In [2]:
# Header row: the attributes kept for every post. Data rows are appended below.
total_weibos = [("user_id", "user_name", "user_gender", "user_ip", "weibo_id", "text", "created_at", "likes", "comments", "reposts", "sentiment_type", "sentiment_intensity")]

for info in user_weibo_data.values():
    # User-level fields are repeated on every row built from this user's posts.
    user_id, user_name = info["uid"], info["screen_name"]
    user_gender, user_ip = info["gender"], info["IP"]
    for w in info["weibos"]:
        weibo_id, text, created_at = w["wid"], w["text"], w["created_at"]
        attitudes_count = w["attitudes_count"]
        comments_count = w["comments_count"]
        reposts_count = w["reposts_count"]

        analysis = w["sentiment_analysis"]
        overall = analysis["comprehensive_sentiment"]
        # Rename the "中性" label to "中立" and keep only the part before any '-'.
        sentiment_type = overall["type"].replace("中性", "中立").split('-')[0]
        sentiment_intensity = overall["intensity"]
        has_asp_sentiment = len(analysis["aspect_sentiment"]) > 0

        # Skip posts that carry aspect-level sentiment, have a "混合" label,
        # or whose text length is outside the open interval (3, 50).
        if has_asp_sentiment or "混合" in sentiment_type or not 3 < len(text) < 50:
            continue

        total_weibos.append((user_id, user_name, user_gender, user_ip, weibo_id, text, created_at, attitudes_count, comments_count, reposts_count, sentiment_type, sentiment_intensity))

# Row count, including the header row.
len(total_weibos)

367322

In [3]:
import pandas as pd

def convert_to_dataframe(data):
    """Build a pandas DataFrame from a row list whose first row is the header.

    Args:
        data (list): Rows of Weibo data. ``data[0]`` supplies the column
            names; every following element is one record.

    Returns:
        pd.DataFrame: A frame holding ``data[1:]`` under those column names.
    """
    header, records = data[0], data[1:]
    return pd.DataFrame(records, columns=header)

# Build the DataFrame from the collected rows (the first tuple is the header).
df = convert_to_dataframe(total_weibos)

# Bare expression so the notebook displays the summary statistics.
df.describe()

Unnamed: 0,likes,comments,reposts,sentiment_intensity
count,367321.0,367321.0,367321.0,367321.0
mean,2.223309,1.367727,0.06552,5.839786
std,17.460882,4.239247,2.54949,1.88466
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,5.0
50%,0.0,0.0,0.0,6.0
75%,2.0,2.0,0.0,7.0
max,6749.0,586.0,602.0,10.0


### 数据清洗

In [4]:
import re

def is_valid(text: str) -> bool:
    """
    判断该微博是否为有效微博，依据为广告、转载、营销相关的关键词
    如果微博中出现了关键词，则认为是无效微博，直接删除

    Args:
        text (str): 一条微博的文本内容

    Returns:
        bool: 微博是否是否有效
    """

    if len(text.strip()) <= 3:
        return False
    
    # 广告、转载相关关键词
    ad_keywords = [
        '点开红包', '现金红包', '好礼', '网页链接', 
        '我在参与', '连续签到', '粉打卡', '年度歌曲', 
        '免费围观', '关注超话', "蚂蚁庄园：", "森林驿站", 
        "头条文章", "注册微博", "注册微博", "闲鱼发布",
        "闲鱼号", "头像挂件"
    ]

    # 微博中含有任意一个关键词，则认为无效
    if any(kw in text for kw in ad_keywords):
        # print(f"微博“{text}”为抽奖、广告、转载等微博，删除")
        return False
    
    return True



# Patterns used by clean_text, compiled once at import time.
_WHITESPACE_RE = re.compile(r'\s+')
# "分享自 ..." (optionally preceded by an opening parenthesis) up to the end of
# the text; the lookahead keeps ordinary phrases such as "分享自己".
_SHARE_SOURCE_RE = re.compile(r'(?:[（(])?分享自(?!己).*$')
_SHARE_MEDIA_RE = re.compile(r'分享(图片|视频)')
_HASHTAG_RE = re.compile(r'#.*?#')
_MENTION_RE = re.compile(r'@[\w\-]+')
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Everything that is NOT a CJK ideograph, word character, whitespace or one of
# the listed punctuation marks. Triple-quoted so that both ASCII quote
# characters can sit inside the class: the previous single-quoted literal was
# split in two by the embedded '' and therefore never kept the apostrophe.
_DISALLOWED_RE = re.compile(r'''[^\u4e00-\u9fff\w\s，。！？；："'()（）【】]''')


def clean_text(text, user_name=''):
    """Clean the text of one post by removing boilerplate and noise.

    Steps, in order: normalise whitespace, drop the "分享自 ..." share-source
    tail, drop "分享图片"/"分享视频", drop #hashtags#, drop the
    "<user_name>的微博视频" video attribution, drop @mentions, drop URLs, drop
    every character outside the allowed set, then normalise whitespace again.

    Args:
        text (str): Raw text of the post.
        user_name (str, optional): Author's user name, used to recognise the
            video attribution suffix. Defaults to ''.

    Returns:
        str: The cleaned text, or '' when at most one non-whitespace
        character is left (e.g. an emoji-only post).
    """
    # Collapse whitespace first so no newline is left: the share-source
    # pattern relies on `.*$`, which cannot cross a newline, and hashtags may
    # span lines as well.
    text = _WHITESPACE_RE.sub(' ', text)

    text = _SHARE_SOURCE_RE.sub('', text)
    text = _SHARE_MEDIA_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)

    # The attribution depends on the author, so it is built per call; the
    # name is escaped because it may contain regex metacharacters.
    text = re.sub(rf'{re.escape(user_name)}的微博视频\s*', '', text)

    text = _MENTION_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _DISALLOWED_RE.sub('', text)

    # The removals above can leave runs of spaces behind (e.g. "a #tag# b"
    # becomes "a  b"), so collapse once more before trimming.
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # Only single spaces remain as whitespace at this point; a post with at
    # most one real character carries no usable content.
    if len(text.replace(' ', '')) <= 1:
        return ''

    return text

In [5]:
# Start from the header row, then add every post that survives cleaning.
weibo_after_dc = [total_weibos[0]]

for w in total_weibos[1:]:
    # Column 5 is the post text, column 1 the author's user name.
    text = clean_text(w[5], w[1])
    if not is_valid(text):
        continue
    # Same 12-column row, with the text column replaced by its cleaned form.
    weibo_after_dc.append((*w[:5], text, *w[6:12]))

# Row count, including the header row.
len(weibo_after_dc)

358585

In [6]:
# Wrap the cleaned rows in a DataFrame (the header row becomes the column names).
df_dc = convert_to_dataframe(weibo_after_dc)

# Bare expression so the notebook displays the frame.
df_dc

Unnamed: 0,user_id,user_name,user_gender,user_ip,weibo_id,text,created_at,likes,comments,reposts,sentiment_type,sentiment_intensity
0,1000129923,重铸无限城荣光,女,湖南,5146551031173596,我居然有两个隐身访问和删除记录访问欸！！！ 来看啥的呢,2025-03-21 08:27:27,2,0,0,中立,3
1,1000129923,重铸无限城荣光,女,湖南,5146658010826468,nnd最不会打这种了 每次遇到这种单边粉键单note单边粉键就框框漏,2025-03-21 15:32:33,2,2,0,消极,6
2,1000129923,重铸无限城荣光,女,湖南,5146660519280964,好想麦 这附近为什么没有麦啊,2025-03-21 15:42:30,1,0,0,消极,4
3,1000129923,重铸无限城荣光,女,湖南,5146805668416291,怎么天天不是左边太阳穴长痘就是右边太阳穴长痘 怎么天天太阳穴长痘啊一边消了另一边又起的）,2025-03-22 01:19:17,0,0,0,消极,4
4,1000129923,重铸无限城荣光,女,湖南,5147074517797068,不是） 我这笔也没摔啊 怎么就坏了 这可是17块的铅笔啊,2025-03-22 19:07:36,1,0,0,消极,3
...,...,...,...,...,...,...,...,...,...,...,...,...
358579,7967043072,瑞可犟,女,重庆,5150778986856825,怎么没人发这个到微博 金珉周口中那个以前会把想说的话埋在心里，现在不想搞砸的关系就会多沟通,2025-04-02 00:27:50,9,0,0,中立,4
358580,7967043072,瑞可犟,女,重庆,5150928627569969,正式开启大和抚子 高岭之花era,2025-04-02 10:22:27,6,0,2,积极,7
358581,7967043072,瑞可犟,女,重庆,5150985211349095,田力可可可可手包我,2025-04-02 14:07:18,2,0,1,积极,9
358582,7967043072,瑞可犟,女,重庆,5150988403475678,我刚刚走在路上的时候听见有人对我说你是人间四月天,2025-04-02 14:19:59,11,1,8,积极,8


In [7]:
def save_to_csv(df, file_path):
    """Write a DataFrame to a CSV file.

    Args:
        df (pd.DataFrame): The DataFrame to save.
        file_path (str): Destination path of the CSV file.
    """
    # No index column; utf-8-sig adds a BOM so Excel displays Chinese correctly.
    export_options = {"index": False, "encoding": "utf-8-sig"}
    df.to_csv(file_path, **export_options)

# Persist the cleaned dataset (relative path, i.e. the current working directory).
save_to_csv(df_dc, "weibo_after_dc.csv")

In [74]:
# Build the feature DataFrame with the shared helper instead of repeating its
# logic inline: the first row of `weibo_after_fe` holds the column names and
# the remaining rows are the data.
df = convert_to_dataframe(weibo_after_fe)

# Save through the shared helper as well (no index column, UTF-8 with BOM so
# Excel displays Chinese text correctly).
output_path = 'weibo_features.csv'
save_to_csv(df, output_path)
print(f'已保存到 {output_path}, 行数: {len(df)}')

已保存到 weibo_features.csv, 行数: 358584
