In [1]:
# Directory containing the user and Weibo CSV files read by load_and_preprocess_data().
# NOTE(review): the paths use backslash separators, so they only resolve as
# relative directories on Windows.
BASE_INFO_PATH = r"..\user_weibo_data_cleaned"
# Directory containing the ds_<n>.json sentiment-analysis files read by load_sentiment_analysis().
SENTI_ANALY_PATH = r"..\LLM_Analysis\result\ds_analysis"

# CSV file names, joined onto BASE_INFO_PATH when loading.
USER_INFO_FILE = "user_cleaned.csv"
WEIBO_INFO_FILE = "weibo_cleaned.csv"

In [2]:
import os
import pandas as pd
import json


def load_and_preprocess_data():
    """Read the user and Weibo CSV files, normalise key columns and drop error users.

    The files are located via the module constants ``BASE_INFO_PATH``,
    ``USER_INFO_FILE`` and ``WEIBO_INFO_FILE``.

    Returns:
        tuple: ``(df_user, df_weibo, error_user_ids)`` where both frames exclude
        every user whose ``error`` column equals 1, and ``error_user_ids`` is the
        list of the excluded users' ``uid`` values (as strings).
    """
    user_csv = os.path.join(BASE_INFO_PATH, USER_INFO_FILE)
    weibo_csv = os.path.join(BASE_INFO_PATH, WEIBO_INFO_FILE)
    users = pd.read_csv(user_csv)
    posts = pd.read_csv(weibo_csv)

    # Timestamps become datetimes; identifier columns become strings so they
    # can be matched against the string keys of the sentiment JSON.
    posts["created_at"] = pd.to_datetime(posts["created_at"])
    users["uid"] = users["uid"].astype("str")
    for id_column in ("uid", "wid"):
        posts[id_column] = posts[id_column].astype("str")

    # Collect the flagged users first, then remove them (and their posts).
    flagged = users["error"] == 1
    error_user_ids = users[flagged]["uid"].tolist()
    users = users[users["error"] != 1]
    posts = posts[~posts["uid"].isin(error_user_ids)]

    return users, posts, error_user_ids


def load_sentiment_analysis(error_user_ids):
    """Load and merge the per-user sentiment analysis results.

    Reads ``ds_1.json`` through ``ds_7.json`` from ``SENTI_ANALY_PATH`` and merges
    them into one dictionary; when the same key appears in several files, the
    entry from the later file wins.

    Args:
        error_user_ids (list): User IDs to exclude from the result.

    Returns:
        dict: Merged sentiment analysis results keyed by user ID, without the
        excluded users.
    """
    # A set gives O(1) membership tests; the original scanned the list once
    # per user while filtering.
    excluded_ids = set(error_user_ids)

    merged = {}
    for i in range(1, 8):
        file_path = os.path.join(SENTI_ANALY_PATH, f"ds_{i}.json")
        with open(file_path, 'r', encoding='utf-8') as f:
            merged.update(json.load(f))

    return {uid: info for uid, info in merged.items() if uid not in excluded_ids}


def process_sentiment_type(sentiment_analysis_dict: dict):
    """Split combined sentiment type strings into comprehensive and aspect parts.

    A ``comprehensive_sentiment['type']`` value that splits into exactly three
    pieces on ``'-'`` is then split on ``'/'``: the left half replaces the
    comprehensive type and the right half is added as a key of
    ``aspect_sentiment`` with the placeholder value ``"待定"``. All other
    records are left untouched. The input dictionary is modified in place.

    Args:
        sentiment_analysis_dict (dict): Mapping of user ID -> {post ID -> analysis dict}.

    Returns:
        dict: The same dictionary object, after in-place normalisation.
    """
    for info in sentiment_analysis_dict.values():
        for analysis in info.values():
            comp_sentiment = analysis.get('comprehensive_sentiment') or {}
            if not comp_sentiment:
                continue

            try:
                sentiment_type = comp_sentiment.get('type', '')
                if len(sentiment_type.split('-')) != 3:
                    continue

                # Raises ValueError (reported below) unless there is exactly one '/'.
                comp_type, aspect_key = sentiment_type.split('/')
            except (AttributeError, TypeError, ValueError) as e:
                print("Error parsing sentiment type:", e)
                continue

            # Create the aspect mapping when it is missing or not a dict, so the
            # record is never left half-updated (type rewritten, aspect lost).
            aspect_map = analysis.get('aspect_sentiment')
            if not isinstance(aspect_map, dict):
                aspect_map = {}
                analysis['aspect_sentiment'] = aspect_map

            comp_sentiment['type'] = comp_type
            aspect_map[aspect_key] = "待定"

    return sentiment_analysis_dict

In [3]:
class SentimentAnalysis:
    """Attribute-style view over one post's sentiment analysis dictionary."""

    # (attribute name, factory for the fallback value). A factory is used so
    # every instance gets its own fresh list/dict; ``None`` means "fall back to None".
    _FIELDS = (
        ("created_at", None),
        ("post_interval", None),
        ("mentions", list),
        ("topics", list),
        ("comprehensive_sentiment", dict),
        ("aspect_sentiment", dict),
        ("narrative_threads", list),
    )

    def __init__(self, analysis_dict):
        """Copy the known keys of ``analysis_dict`` onto the instance.

        Args:
            analysis_dict (dict): Raw analysis record; missing keys fall back to
                ``None`` (scalars), ``[]`` (lists) or ``{}`` (dicts).
        """
        for field_name, make_default in self._FIELDS:
            fallback = make_default() if make_default is not None else None
            setattr(self, field_name, analysis_dict.get(field_name, fallback))


class Weibo:
    """A single post together with its engagement counters and optional sentiment analysis."""

    def __init__(self, wid, uid, text, created_at, attitudes_count, comments_count, reposts_count, sentiment_analysis=None):
        """Store the post fields unchanged on the instance.

        Args:
            wid: Post identifier.
            uid: Identifier of the posting user.
            text: Post text.
            created_at: Creation timestamp.
            attitudes_count: Value of the ``attitudes_count`` column for this post.
            comments_count: Value of the ``comments_count`` column for this post.
            reposts_count: Value of the ``reposts_count`` column for this post.
            sentiment_analysis: Optional analysis object for this post; ``None`` when absent.
        """
        # Identity
        self.wid, self.uid = wid, uid
        # Content
        self.text, self.created_at = text, created_at
        # Engagement counters
        self.attitudes_count, self.comments_count, self.reposts_count = (
            attitudes_count,
            comments_count,
            reposts_count,
        )
        # Attached analysis (may be None)
        self.sentiment_analysis = sentiment_analysis

class User:
    """A user record plus the list of posts attached to it."""

    def __init__(self, uid, screen_name, gender, statuses_count, IP):
        """Store the user fields and start with an empty post list.

        Args:
            uid: User identifier.
            screen_name: Display name.
            gender: Gender value as stored in the user table.
            statuses_count: Value of the ``statuses_count`` column for this user.
            IP: Value of the ``IP`` column for this user.
        """
        self.uid, self.screen_name = uid, screen_name
        self.gender, self.statuses_count = gender, statuses_count
        self.IP = IP
        # Filled through add_weibo().
        self.weibos = list()

    def add_weibo(self, weibo):
        """Append ``weibo`` to this user's post list."""
        posts = self.weibos
        posts.append(weibo)


def build_user_objects(df_user, df_weibo, sentiment_analysis_dict):
    """Assemble ``User`` objects with their ``Weibo`` posts and sentiment analyses.

    Args:
        df_user: User table with columns ``uid``, ``screen_name``, ``gender``,
            ``statuses_count`` and ``IP``.
        df_weibo: Post table with columns ``wid``, ``uid``, ``text``,
            ``created_at``, ``attitudes_count``, ``comments_count`` and
            ``reposts_count``.
        sentiment_analysis_dict (dict): Mapping of user ID -> {post ID -> analysis dict}.

    Returns:
        dict: Mapping of ``str(uid)`` -> ``User``. Posts whose user is not in
        ``df_user`` are not attached to anyone.
    """
    # One User per row of the user table, keyed by the stringified uid.
    users_by_id = {}
    for _, user_row in df_user.iterrows():
        user_key = str(user_row['uid'])
        users_by_id[user_key] = User(
            uid=user_row['uid'],
            screen_name=user_row['screen_name'],
            gender=user_row['gender'],
            statuses_count=user_row['statuses_count'],
            IP=user_row['IP'],
        )

    # Wrap every raw analysis once up front so the post loop only does lookups.
    analyses_by_user = {
        uid: {
            wid: SentimentAnalysis(payload)
            for wid, payload in per_post.items()
        }
        for uid, per_post in sentiment_analysis_dict.items()
    }

    for _, post_row in df_weibo.iterrows():
        owner_id = str(post_row['uid'])
        post_id = str(post_row['wid'])

        # None when the user or the post has no analysis.
        analysis = analyses_by_user.get(owner_id, {}).get(post_id)

        post = Weibo(
            wid=post_row['wid'],
            uid=post_row['uid'],
            text=post_row['text'],
            created_at=post_row['created_at'],
            attitudes_count=post_row['attitudes_count'],
            comments_count=post_row['comments_count'],
            reposts_count=post_row['reposts_count'],
            sentiment_analysis=analysis,
        )

        owner = users_by_id.get(owner_id)
        if owner is not None:
            owner.add_weibo(post)

    return users_by_id

In [4]:
# Load the user and Weibo tables; users flagged as errors are filtered out
# and their IDs are returned separately.
df_user, df_weibo, error_user_ids = load_and_preprocess_data()

In [5]:
# Load the sentiment analysis results, excluding the error users
sentiment_analysis_dict = load_sentiment_analysis(error_user_ids)

# Normalise combined sentiment type strings (the dictionary is updated in place
# and the same object is returned)
sentiment_analysis_dict = process_sentiment_type(sentiment_analysis_dict)

In [6]:
# Build the User objects (with their posts and sentiment analyses attached)
user_dict = build_user_objects(df_user, df_weibo, sentiment_analysis_dict)

In [7]:
from collections import Counter

class UserProfile:
    """Profile of one user, derived from their posts and sentiment analyses.

    All profile sections are computed eagerly in the constructor and stored as
    plain dictionaries. ``user`` must expose ``uid``, ``screen_name``,
    ``gender``, ``statuses_count``, ``IP`` and ``weibos``; every post must expose
    ``wid``, ``text``, ``created_at`` (a datetime), the three engagement
    counters and ``sentiment_analysis`` (an analysis object or ``None``).

    Users without posts, and posts without sentiment analysis, are supported:
    the affected sections are empty or zero-valued instead of raising.
    """

    def __init__(self, user):
        self.user = user
        self.basic_info = self._analyze_basic_info()
        self.behavior_patterns = self._analyze_behavior_patterns()
        self.content_preferences = self._analyze_content_preferences()
        self.emotional_profile = self._analyze_emotional_profile()
        self.social_network = self._analyze_social_network()
        self.activity_patterns = self._analyze_activity_patterns()

    def _analyze_basic_info(self):
        """Return identity fields plus post volume statistics.

        ``avg_weibos_per_day`` is posts per *active* day (a day with at least
        one post) and is 0 when the user has no posts.
        """
        total_weibos = len(self.user.weibos)
        active_days = self._calculate_active_days()
        return {
            'uid': self.user.uid,
            'screen_name': self.user.screen_name,
            'gender': self.user.gender,
            'statuses_count': self.user.statuses_count,
            'IP_location': self.user.IP,
            'total_weibos': total_weibos,
            'active_days': active_days,
            # Guard: a user without posts has zero active days.
            'avg_weibos_per_day': round(total_weibos / active_days, 2) if active_days else 0
        }

    def _analyze_behavior_patterns(self):
        """Return posting-time statistics, or ``{}`` when the user has no posts."""
        if not self.user.weibos:
            return {}

        # Distribution of posting hour and weekday name.
        posting_hours = [weibo.created_at.hour for weibo in self.user.weibos]
        posting_days = [weibo.created_at.strftime('%A') for weibo in self.user.weibos]

        most_active_hour = Counter(posting_hours).most_common(1)[0]
        most_active_day = Counter(posting_days).most_common(1)[0]

        return {
            'most_active_hour': most_active_hour[0],
            'most_active_hour_ratio': round(most_active_hour[1] / len(posting_hours), 2),
            'most_active_day': most_active_day[0],
            'most_active_day_ratio': round(most_active_day[1] / len(posting_days), 2),
            'avg_posting_interval_hour': self._calculate_posting_interval(),
            'posting_regularity': self._calculate_posting_regularity(),
            'weekend_activity_ratio': self._calculate_weekend_activity(),
            'night_activity_ratio': self._calculate_night_activity()
        }

    def _analyze_content_preferences(self):
        """Return text-length and engagement statistics, or ``{}`` without posts."""
        weibos = self.user.weibos
        if not weibos:
            return {}

        text_lengths = [len(weibo.text) for weibo in weibos]

        total_attitudes = sum(weibo.attitudes_count for weibo in weibos)
        total_comments = sum(weibo.comments_count for weibo in weibos)
        total_reposts = sum(weibo.reposts_count for weibo in weibos)

        return {
            'avg_text_length': round(sum(text_lengths) / len(text_lengths), 2),
            'max_text_length': max(text_lengths),
            'min_text_length': min(text_lengths),
            'total_interactions': total_attitudes + total_comments + total_reposts,
            'avg_attitudes_per_weibo': round(total_attitudes / len(weibos), 2),
            'avg_comments_per_weibo': round(total_comments / len(weibos), 2),
            'avg_reposts_per_weibo': round(total_reposts / len(weibos), 2),
        }

    def _analyze_emotional_profile(self):
        """Return sentiment statistics over the posts that have usable sentiment data.

        Each post's sentiment value is ``5.0 + polarity * intensity * 0.5`` with
        polarity +1 for main type ``'积极'``, -1 for ``'消极'`` and 0 otherwise.
        Posts without analysis or with an empty ``comprehensive_sentiment`` are
        skipped; posts whose data cannot be parsed are reported and skipped.

        Returns:
            dict: The statistics, or ``{}`` when no post yields sentiment data.
        """
        sentiment_main_types = []
        sentiment_values = []
        event_categories = []

        for weibo in self.user.weibos:
            analysis = weibo.sentiment_analysis
            if not analysis:
                continue

            try:
                comp_sentiment = analysis.comprehensive_sentiment
                if not comp_sentiment:
                    # Nothing to read; previously this reused values left over
                    # from the preceding post (or failed on the first post).
                    continue

                sentiment_type = comp_sentiment.get('type', '')
                if '-' in sentiment_type:
                    # Exactly "main-sub" is expected; anything else raises
                    # ValueError and is reported below.
                    main_type, _sub_type = sentiment_type.split('-')
                else:
                    main_type = sentiment_type

                if main_type == '积极':
                    polarity = 1
                elif main_type == '消极':
                    polarity = -1
                else:
                    polarity = 0

                base_value = 5.0
                sentiment_intensity = comp_sentiment.get("intensity", 0)
                sentiment_value = base_value + polarity * sentiment_intensity * 0.5
                event_category = comp_sentiment.get('event_category', '')
            except (AttributeError, TypeError, ValueError) as e:
                uid = self.user.uid
                wid = weibo.wid
                print(f"解析用户 {uid} 的微博 {wid} 时出现错误：{e}")
                continue

            # Append only after the whole record parsed, so the lists stay aligned.
            sentiment_main_types.append(main_type)
            sentiment_values.append(sentiment_value)
            event_categories.append(event_category)

        if not sentiment_values:
            return {}

        top1_event_category, top1_event_category_ratio = self._get_top_items(event_categories, 1)[0]
        total = len(sentiment_main_types)

        return {
            'avg_sentiment_value': round(sum(sentiment_values) / len(sentiment_values), 2),
            'top1_event_category': top1_event_category,
            'top1_event_category_ratio': top1_event_category_ratio,
            'top_event_categories': self._get_top_items(event_categories, 5),
            'sentiment_stability': self._calculate_sentiment_stability(sentiment_main_types),
            'positive_ratio': round(sentiment_main_types.count("积极") / total, 2),
            'negative_ratio': round(sentiment_main_types.count("消极") / total, 2)
        }

    def _analyze_social_network(self):
        """Return engagement totals plus the mentions and topics found in the analyses.

        Posts without sentiment analysis contribute to the interaction totals
        but have no mentions or topics to collect.
        """
        weibos = self.user.weibos
        total_interactions = sum(
            weibo.attitudes_count + weibo.comments_count + weibo.reposts_count
            for weibo in weibos
        )

        mentions = []
        topics = []
        for weibo in weibos:
            analysis = weibo.sentiment_analysis
            if not analysis:
                continue
            # ``or []`` tolerates an explicit None stored for either field.
            mentions.extend(analysis.mentions or [])
            topics.extend(analysis.topics or [])

        return {
            'social_activity_level': self._categorize_social_level(total_interactions),
            'avg_interactions_per_weibo': round(total_interactions / len(weibos), 2) if weibos else 0,
            'mentions': mentions,
            'mention_count': len(mentions),
            'distinct_mention_count': len(set(mentions)),
            'topics': topics,
            'topic_count': len(topics),
            'distinct_topic_count': len(set(topics)),
        }

    def _analyze_activity_patterns(self):
        """Return month-level activity statistics, or ``{}`` without posts."""
        if not self.user.weibos:
            return {}

        # Posts per calendar month, keyed as 'YYYY-MM'.
        monthly_activity = Counter(
            weibo.created_at.strftime('%Y-%m') for weibo in self.user.weibos
        )

        return {
            'most_active_month': max(monthly_activity.items(), key=lambda x: x[1])[0] if monthly_activity else None,
            'activity_trend': self._calculate_activity_trend(monthly_activity),
            'consistency_score': self._calculate_consistency_score(monthly_activity)
        }

    # Helper methods
    def _calculate_active_days(self):
        """Return the number of distinct calendar days with at least one post."""
        return len({weibo.created_at.date() for weibo in self.user.weibos})

    def _posting_intervals_hours(self):
        """Return the gaps, in hours, between chronologically consecutive posts."""
        timestamps = sorted(weibo.created_at for weibo in self.user.weibos)
        return [
            (later - earlier).total_seconds() / 3600
            for earlier, later in zip(timestamps, timestamps[1:])
        ]

    def _calculate_posting_interval(self):
        """Return the mean gap between consecutive posts in hours (0 with fewer than two posts)."""
        intervals = self._posting_intervals_hours()
        return round(sum(intervals) / len(intervals), 2) if intervals else 0

    def _calculate_posting_regularity(self):
        """Return ``1 / (1 + variance of posting gaps)``; closer to 1 means more regular.

        Returns 0 when there are fewer than two posts.
        """
        intervals = self._posting_intervals_hours()
        if not intervals:
            return 0
        mean_interval = sum(intervals) / len(intervals)
        variance = sum((x - mean_interval) ** 2 for x in intervals) / len(intervals)
        return round(1 / (1 + variance), 5)

    def _calculate_weekend_activity(self):
        """Return the share of posts made on Saturday or Sunday (0 without posts)."""
        weibos = self.user.weibos
        if not weibos:
            return 0
        weekend_posts = sum(1 for weibo in weibos if weibo.created_at.weekday() >= 5)
        return round(weekend_posts / len(weibos), 2)

    def _calculate_night_activity(self):
        """Return the share of posts made between 22:00 and 06:00 (0 without posts)."""
        weibos = self.user.weibos
        if not weibos:
            return 0
        # ``< 6`` keeps the window at 22:00-05:59; ``<= 6`` also counted 06:00-06:59.
        night_posts = sum(1 for weibo in weibos
                          if weibo.created_at.hour >= 22 or weibo.created_at.hour < 6)
        return round(night_posts / len(weibos), 2)

    def _get_top_items(self, items, top_n):
        """Return the ``top_n`` most frequent items as ``(item, share)`` pairs.

        ``share`` is the item's count divided by ``len(items)``, rounded to two
        decimals. An empty ``items`` yields an empty list.
        """
        counter = Counter(items)
        return [(item, round(count / len(items), 2)) for item, count in counter.most_common(top_n)]

    def _calculate_sentiment_stability(self, sentiment_types):
        """Return 1 minus the share of consecutive entries that differ.

        Closer to 1 means the sentiment type changes less often from one entry
        to the next. With fewer than two entries no change can be observed, so
        1.0 is returned.
        """
        if len(sentiment_types) < 2:
            return 1.0
        changes = sum(1 for previous, current in zip(sentiment_types, sentiment_types[1:])
                      if current != previous)
        return round(1 - (changes / (len(sentiment_types) - 1)), 2)

    def _categorize_social_level(self, total_interactions):
        """Bucket the total interaction count: >= 1000 high, >= 100 medium, else low."""
        if total_interactions >= 1000:
            return "高活跃"
        elif total_interactions >= 100:
            return "中活跃"
        else:
            return "低活跃"

    def _calculate_activity_trend(self, monthly_activity):
        """Compare the last month's post count with the first month's.

        Returns "上升" when the last month exceeds 1.5x the first, "下降" when it
        is below 0.7x, and "稳定" otherwise (including fewer than two months).
        """
        if len(monthly_activity) < 2:
            return "稳定"

        months = sorted(monthly_activity.keys())
        first_month_count = monthly_activity[months[0]]
        last_month_count = monthly_activity[months[-1]]

        if last_month_count > first_month_count * 1.5:
            return "上升"
        elif last_month_count < first_month_count * 0.7:
            return "下降"
        else:
            return "稳定"

    def _calculate_consistency_score(self, monthly_activity):
        """Return ``1 / (1 + variance of monthly post counts)`` (0 for no data)."""
        if not monthly_activity:
            return 0

        values = list(monthly_activity.values())
        mean_value = sum(values) / len(values)
        variance = sum((x - mean_value) ** 2 for x in values) / len(values)
        return 1 / (1 + variance)

    def get_summary(self):
        """Return the profile sections as one nested dictionary.

        ``activity_patterns`` is computed but not included in the summary.
        """
        return {
            'basic_info': self.basic_info,
            'behavior_patterns': self.behavior_patterns,
            'content_preferences': self.content_preferences,
            'emotional_profile': self.emotional_profile,
            'social_network': self.social_network,
        }

    def get_user_type(self):
        """Return the list of type labels derived from the profile sections."""
        labels = []

        # Posting frequency
        if self.basic_info['avg_weibos_per_day'] > 2:
            labels.append("高频用户")
        elif self.basic_info['avg_weibos_per_day'] < 0.1:
            labels.append("低频用户")
        else:
            labels.append("中频用户")

        # Dominant sentiment (no label when neither side exceeds 60%)
        if self.emotional_profile.get('positive_ratio', 0) > 0.6:
            labels.append("积极用户")
        elif self.emotional_profile.get('negative_ratio', 0) > 0.6:
            labels.append("消极用户")

        # Social activity level
        labels.append(self.social_network['social_activity_level'])

        # Night-time posting habit
        if self.behavior_patterns.get('night_activity_ratio', 0) > 0.3:
            labels.append("夜猫子")

        return labels

In [8]:
# Build one UserProfile per user; every profile section is computed in the constructor.
profile_list = [UserProfile(user) for user in user_dict.values()]

In [9]:
import random

# Spot-check one arbitrary profile. No seed is set, so a different user may be
# shown on each run.
random.choice(profile_list).get_summary()

{'basic_info': {'uid': '2877415140',
  'screen_name': 'NeedUuuuuuuu',
  'gender': 'f',
  'statuses_count': 2452,
  'IP_location': '贵州',
  'total_weibos': 43,
  'active_days': 28,
  'avg_weibos_per_day': 1.54},
 'behavior_patterns': {'most_active_hour': 9,
  'most_active_hour_ratio': 0.19,
  'most_active_day': 'Sunday',
  'most_active_day_ratio': 0.21,
  'avg_posting_interval_hour': 97.52,
  'posting_regularity': 2e-05,
  'weekend_activity_ratio': 0.3,
  'night_activity_ratio': 0.21},
 'content_preferences': {'avg_text_length': 58.72,
  'max_text_length': 176,
  'min_text_length': 2,
  'total_interactions': 729,
  'avg_attitudes_per_weibo': 12.91,
  'avg_comments_per_weibo': 4.0,
  'avg_reposts_per_weibo': 0.05},
 'emotional_profile': {'avg_sentiment_value': 5.43,
  'top1_event_category': '旅游体验',
  'top1_event_category_ratio': 0.12,
  'top_event_categories': [('旅游体验', 0.12),
   ('娱乐活动', 0.07),
   ('人际关系', 0.07),
   ('日常记录', 0.05),
   ('个人成长', 0.05)],
  'sentiment_stability': 0.36,
  'po

In [10]:
# Flatten every profile summary into one DataFrame; nested keys become
# dotted column names such as "basic_info.uid".
profile_dict_list = [profile.get_summary() for profile in profile_list]
df_profile = pd.json_normalize(profile_dict_list)
df_profile.head()

Unnamed: 0,basic_info.uid,basic_info.screen_name,basic_info.gender,basic_info.statuses_count,basic_info.IP_location,basic_info.total_weibos,basic_info.active_days,basic_info.avg_weibos_per_day,behavior_patterns.most_active_hour,behavior_patterns.most_active_hour_ratio,...,emotional_profile.positive_ratio,emotional_profile.negative_ratio,social_network.social_activity_level,social_network.avg_interactions_per_weibo,social_network.mentions,social_network.mention_count,social_network.distinct_mention_count,social_network.topics,social_network.topic_count,social_network.distinct_topic_count
0,1000129923,重铸无限城荣光,f,7991,湖南,50,11,4.55,0,0.14,...,0.26,0.52,低活跃,1.98,[],0,0,[],0,0
1,1004171625,财欣欣666,f,536,四川,33,9,3.67,7,0.18,...,0.55,0.15,高活跃,137.64,[财宝宝],1,1,"[生活手记, 金价, 微博账号异常, 微博账号异常怎么解除, 生活手记, 热点, 生活手记,...",29,23
2,1005603481,青岛金笑笑,f,3867,山东,46,25,1.84,12,0.17,...,0.09,0.65,低活跃,1.22,[],0,0,[产后妈妈的缺觉困境],1,1
3,1006429701,彗星划过的一颗粉尘,f,3013,湖南,47,32,1.47,20,0.15,...,0.43,0.32,低活跃,0.45,[],0,0,[],0,0
4,1007473540,春稳王,m,521,北京,22,22,1.0,21,0.27,...,0.77,0.09,低活跃,2.18,"[崔珊珊__, 崔珊珊__, 崔珊珊__]",3,1,"[奥运会乒乓球混双决赛, 七夕, 中秋, 好东西, 胖东来攻入北京, 豆花, 秦炉, 元宵节...",9,9


In [11]:
# Report, for each profiled user, which other profiled users they mention.
screen_name_list = [profile.basic_info["screen_name"] for profile in profile_list]
# Set copy of the names: membership tests in the loop below are O(1) instead of
# scanning the whole list for every mention.
known_screen_names = set(screen_name_list)

for profile in profile_list:
    screen_name = profile.basic_info["screen_name"]
    mentions = profile.social_network.get("mentions", [])
    # Keep only mentions of other users in this dataset (self-mentions excluded).
    related_users = {
        mentioned for mentioned in mentions
        if mentioned in known_screen_names and mentioned != screen_name
    }
    if related_users:
        print(f"用户 {screen_name} 提及了其他用户：{related_users}")

用户 站在村口的乔治 提及了其他用户：{'一只小抱扑'}
用户 阿林林喜欢猫猫周 提及了其他用户：{'江湖路远--'}
用户 请叫我Ms睡不醒 提及了其他用户：{'就不摸绵羊'}
用户 屁嘞噗噗 提及了其他用户：{'周小超啦'}
用户 就不摸绵羊 提及了其他用户：{'请叫我Ms睡不醒'}
用户 Rebeca_Q 提及了其他用户：{'十日o3o'}
用户 白鹅余鹅记录簿 提及了其他用户：{'一罐起司糖'}
用户 夜雪潇月-何以倾城 提及了其他用户：{'米米妈育儿'}
用户 周小超啦 提及了其他用户：{'屁嘞噗噗'}
用户 京畿道桜季_婧候乔青羽版 提及了其他用户：{'那就拥抱橘子熊'}
用户 一罐起司糖 提及了其他用户：{'白鹅余鹅记录簿'}
用户 长空夕醉ing 提及了其他用户：{'砚徵书笙'}
用户 英国小伙互殴七年可以结婚 提及了其他用户：{'我只屬於你在這短暫的生命_'}
用户 美男v我50万 提及了其他用户：{'少女6只猫之宠物沟通师99解答1次'}
用户 不要温顺地走进那个良夜- 提及了其他用户：{'吒团一'}
用户 砚徵书笙 提及了其他用户：{'长空夕醉ing'}
用户 到底要睡多久才能不困- 提及了其他用户：{'泡芙羊o'}
用户 501碎 提及了其他用户：{'翎允曦'}
用户 财智贤 提及了其他用户：{'叮铃铃uu', '财甜呀'}
