In [44]:
BASE_INFO_PATH = r"..\user_weibo_data_cleaned"
SENTI_ANALY_PATH = r"..\LLM_Analysis\result\ds_analysis"

USER_INFO_FILE = "user_cleaned.csv"
WEIBO_INFO_FILE = "weibo_cleaned.csv"


In [54]:
import os
import pandas as pd
import json


def load_and_preprocess_data():
    """Load and preprocess user and Weibo data.

    Returns:
        tuple: A tuple containing the preprocessed user DataFrame, Weibo DataFrame, and a list of error user IDs.
    """
    # 读取数据
    df_user = pd.read_csv(os.path.join(BASE_INFO_PATH, USER_INFO_FILE))
    df_weibo = pd.read_csv(os.path.join(BASE_INFO_PATH, WEIBO_INFO_FILE))
    
    # 转换数据类型
    df_weibo["created_at"] = pd.to_datetime(df_weibo["created_at"])
    df_user["uid"] = df_user["uid"].astype("str")
    df_weibo["uid"] = df_weibo["uid"].astype("str")
    df_weibo["wid"] = df_weibo["wid"].astype("str")
    
    # 过滤ERROR用户
    error_user_ids = df_user[df_user["error"] == 1]["uid"].tolist()
    df_user = df_user[df_user["error"] != 1]
    df_weibo = df_weibo[~df_weibo["uid"].isin(error_user_ids)]
    
    return df_user, df_weibo, error_user_ids


def load_sentiment_analysis(error_user_ids):
    """Load sentiment analysis results.

    Args:
        error_user_ids (list): A list of user IDs that encountered errors.

    Returns:
        dict: A dictionary containing sentiment analysis results for each user.
    """
    sentiment_analysis_dict = {}
    for i in range(1, 8):
        file_name = f"ds_{i}.json"
        file_path = os.path.join(SENTI_ANALY_PATH, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            sentiment_analysis_dict.update(json.load(f))
    sentiment_analysis_dict = {uid: info for uid, info in sentiment_analysis_dict.items() if uid not in error_user_ids}
    return sentiment_analysis_dict

In [46]:
class SentimentAnalysis:
    def __init__(self, analysis_dict):
        self.created_at = analysis_dict.get("created_at")
        self.post_interval = analysis_dict.get("post_interval")
        self.mentions = analysis_dict.get("mentions", [])
        self.topics = analysis_dict.get("topics", [])
        self.comprehensive_sentiment = analysis_dict.get("comprehensive_sentiment", {})
        self.aspect_sentiment = analysis_dict.get("aspect_sentiment", {})
        self.narrative_threads = analysis_dict.get("narrative_threads", [])


class Weibo:
    def __init__(self, wid, uid, text, created_at, attitudes_count, comments_count, reposts_count, sentiment_analysis=None):
        self.wid = wid
        self.uid = uid
        self.text = text
        self.created_at = created_at
        self.attitudes_count = attitudes_count
        self.comments_count = comments_count
        self.reposts_count = reposts_count
        self.sentiment_analysis = sentiment_analysis

class User:
    def __init__(self, uid, screen_name, gender, statuses_count, IP):
        self.uid = uid
        self.screen_name = screen_name
        self.gender = gender
        self.statuses_count = statuses_count
        self.IP = IP
        self.weibos = []

    def add_weibo(self, weibo):
        self.weibos.append(weibo)


def build_user_objects(df_user, df_weibo, sentiment_analysis_dict):
    # 使用字典推导式快速构建user_dict
    user_dict = {
        str(row['uid']): User(
            uid=row['uid'],
            screen_name=row['screen_name'],
            gender=row['gender'],
            statuses_count=row['statuses_count'],
            IP=row['IP']
        )
        for _, row in df_user.iterrows()
    }
    
    # 预先创建情感分析查找字典，避免重复查找
    sentiment_lookup = {}
    for uid, user_sentiments in sentiment_analysis_dict.items():
        sentiment_lookup[uid] = {
            wid: SentimentAnalysis(sentiment_data)
            for wid, sentiment_data in user_sentiments.items()
        }
    
    # 批量处理微博数据
    for _, row in df_weibo.iterrows():
        uid = str(row['uid'])
        wid = str(row['wid'])
        
        # 快速查找情感分析
        sentiment = None
        if uid in sentiment_lookup and wid in sentiment_lookup[uid]:
            sentiment = sentiment_lookup[uid][wid]
        
        # 创建Weibo对象
        weibo = Weibo(
            wid=row['wid'],
            uid=row['uid'],
            text=row['text'],
            created_at=row['created_at'],
            attitudes_count=row['attitudes_count'],
            comments_count=row['comments_count'],
            reposts_count=row['reposts_count'],
            sentiment_analysis=sentiment
        )
        
        # 添加到用户
        if uid in user_dict:
            user_dict[uid].add_weibo(weibo)
    
    return user_dict

In [64]:
df_user, df_weibo, error_user_ids = load_and_preprocess_data()

In [65]:
sentiment_analysis_dict = load_sentiment_analysis(error_user_ids)

In [None]:
user_dict = build_user_objects(df_user, df_weibo, sentiment_analysis_dict)

In [16]:
from collections import Counter

class UserProfile:
    def __init__(self, user):
        self.user = user
        self.basic_info = self._analyze_basic_info()
        self.behavior_patterns = self._analyze_behavior_patterns()
        self.content_preferences = self._analyze_content_preferences()
        self.emotional_profile = self._analyze_emotional_profile()
        self.social_network = self._analyze_social_network()
        self.activity_patterns = self._analyze_activity_patterns()
    
    def _analyze_basic_info(self):
        """基础信息分析"""
        return {
            'uid': self.user.uid,
            'screen_name': self.user.screen_name,
            'gender': self.user.gender,
            'statuses_count': self.user.statuses_count,
            'IP_location': self.user.IP,
            'total_weibos': len(self.user.weibos),
            'active_days': self._calculate_active_days(),
            'avg_weibos_per_day': round(len(self.user.weibos) / self._calculate_active_days(), 2)
            # 'avg_weibos_per_day': round(len(self.user.weibos) / max(self._calculate_active_days()), 1)
        }
    
    def _analyze_behavior_patterns(self):
        """行为模式分析"""
        if not self.user.weibos:
            return {}
        
        # 发布时间分布
        posting_hours = [weibo.created_at.hour for weibo in self.user.weibos]
        posting_days = [weibo.created_at.strftime('%A') for weibo in self.user.weibos]

        peak_posting_hour = Counter(posting_hours).most_common(1)[0]
        peak_posting_day = Counter(posting_days).most_common(1)[0]
        
        return {
            'peak_posting_hour': (peak_posting_hour[0], peak_posting_hour[1] / len(posting_hours)),
            'peak_posting_day': (peak_posting_day[0], peak_posting_day[1] / len(posting_days)),
            'avg_posting_interval_hour': self._calculate_posting_interval(),
            'posting_regularity': self._calculate_posting_regularity(),
            'weekend_activity_ratio': self._calculate_weekend_activity(),
            'night_activity_ratio': self._calculate_night_activity()
        }
    
    def _analyze_content_preferences(self):
        """内容偏好分析"""
        if not self.user.weibos:
            return {}
        
        # 文本长度分析
        text_lengths = [len(weibo.text) for weibo in self.user.weibos]
        
        # 互动数据统计
        total_attitudes = sum(weibo.attitudes_count for weibo in self.user.weibos)
        total_comments = sum(weibo.comments_count for weibo in self.user.weibos)
        total_reposts = sum(weibo.reposts_count for weibo in self.user.weibos)
        
        return {
            'avg_text_length': round(sum(text_lengths) / len(text_lengths), 2),
            'max_text_length': max(text_lengths),
            'min_text_length': min(text_lengths),
            'total_interactions': total_attitudes + total_comments + total_reposts,
            'avg_attitudes_per_weibo': round(total_attitudes / len(self.user.weibos), 2),
            'avg_comments_per_weibo': round(total_comments / len(self.user.weibos), 2),
            'avg_reposts_per_weibo': round(total_reposts / len(self.user.weibos), 2),
            'interaction_ratio': round((total_attitudes + total_comments + total_reposts) / len(self.user.weibos), 2)
        }
    
    def _analyze_emotional_profile(self):
        """情感画像分析"""
        # 统计有情感分析的微博
        sentiment_weibos = [w for w in self.user.weibos if w.sentiment_analysis]
        
        
        # 情感类型统计
        sentiment_main_types = []
        sentiment_sub_types = []
        # sentiment_intensities = []
        sentiment_values = []
        event_categories = []
        keywords = []
        
        for weibo in sentiment_weibos:
            comp_sentiment = weibo.sentiment_analysis.comprehensive_sentiment
            if comp_sentiment:
                sentiment_type = comp_sentiment.get('type', '')
                try:
                    if '-' in sentiment_type:
                        main_type, sub_type = sentiment_type.split('-')
                    else:
                        main_type = sentiment_type
                        sub_type = ''
                except Exception as e:
                    print("Error parsing sentiment type:", e)

                if main_type == '积极':
                    sentiment_value = 1
                elif main_type == '消极':
                    sentiment_value = -1
                else:
                    sentiment_value = 0
                
                base_value = 5.0
                sentiment_intensity = comp_sentiment.get("intensity", 0)
                sentiment_value = base_value + sentiment_value * sentiment_intensity

                sentiment_main_types.append(main_type)
                sentiment_sub_types.append(sub_type)
                sentiment_values.append(sentiment_value)
                
                # sentiment_types.append(comp_sentiment.get('type', ''))
                # sentiment_intensities.append(comp_sentiment.get('intensity', 0))
                event_categories.append(comp_sentiment.get('event_category', ''))
                keywords.extend(comp_sentiment.get('keywords', []))
        
        return {
            'has_sentiment_data': True,
            # 'dominant_sentiment': max(set(sentiment_types), key=sentiment_types.count) if sentiment_types else None,
            'avg_sentiment_value': round(sum(sentiment_values) / len(sentiment_values), 2),
            'top_event_categories': self._get_top_items(event_categories, 5),
            'top_keywords': self._get_top_items(keywords, 10),
            'sentiment_stability': self._calculate_sentiment_stability(sentiment_main_types),
            'positive_ratio': sentiment_main_types.count("积极") / len(sentiment_main_types),
            'negative_ratio': sentiment_main_types.count("消极") / len(sentiment_main_types) 
            # 'neutral_ratio': sentiment_main_types.count("中性") / len(sentiment_main_types),
        }
    
    def _analyze_social_network(self):
        """社交网络分析"""
        # 基于互动数据推断社交活跃度
        total_interactions = sum(
            weibo.attitudes_count + weibo.comments_count + weibo.reposts_count 
            for weibo in self.user.weibos
        )
        
        return {
            'social_activity_level': self._categorize_social_level(total_interactions),
            'avg_interactions_per_weibo': total_interactions / len(self.user.weibos) if self.user.weibos else 0,
            'engagement_rate': total_interactions / (len(self.user.weibos) * 100) if self.user.weibos else 0  # 假设平均曝光100次
        }
    
    def _analyze_activity_patterns(self):
        """活动模式分析"""
        if not self.user.weibos:
            return {}
        
        # 按月份统计活动
        monthly_activity = {}
        for weibo in self.user.weibos:
            month = weibo.created_at.strftime('%Y-%m')
            monthly_activity[month] = monthly_activity.get(month, 0) + 1
        
        return {
            'most_active_month': max(monthly_activity.items(), key=lambda x: x[1])[0] if monthly_activity else None,
            'activity_trend': self._calculate_activity_trend(monthly_activity),
            'consistency_score': self._calculate_consistency_score(monthly_activity)
        }
    
    # 辅助方法
    def _calculate_active_days(self):
        """计算活跃天数"""
        if not self.user.weibos:
            return 0
        unique_days = set(weibo.created_at.date() for weibo in self.user.weibos)
        return len(unique_days)
    
    def _calculate_posting_interval(self):
        """计算平均发帖间隔（小时）"""
        if len(self.user.weibos) < 2:
            return 0
        
        sorted_weibos = sorted(self.user.weibos, key=lambda x: x.created_at)
        intervals = []
        for i in range(1, len(sorted_weibos)):
            interval = (sorted_weibos[i].created_at - sorted_weibos[i-1].created_at).total_seconds() / 3600  # 小时
            intervals.append(interval)
        
        return round(sum(intervals) / len(intervals), 2) if intervals else 0
    
    def _calculate_posting_regularity(self):
        """计算发帖规律性，若发帖间隔均匀，则认为发帖规律，值越接近1则发帖越规律"""
        
        intervals = []
        sorted_weibos = sorted(self.user.weibos, key=lambda x: x.created_at)
        for i in range(1, len(sorted_weibos)):
            interval = (sorted_weibos[i].created_at - sorted_weibos[i-1].created_at).total_seconds() / 3600
            intervals.append(interval)
        
        # 计算间隔的标准差，越小越规律
        if not intervals:
            return 0
        mean_interval = sum(intervals) / len(intervals)
        variance = sum((x - mean_interval) ** 2 for x in intervals) / len(intervals)
        return round(1 / (1 + variance), 5)  # 归一化，值越大越规律
    
    def _calculate_weekend_activity(self):
        """计算周末活动比例"""
        weekend_posts = sum(1 for weibo in self.user.weibos if weibo.created_at.weekday() >= 5)
        return round(weekend_posts / len(self.user.weibos), 2)
    
    def _calculate_night_activity(self):
        """计算夜间活动比例（22:00-06:00）"""
        night_posts = sum(1 for weibo in self.user.weibos 
                         if weibo.created_at.hour >= 22 or weibo.created_at.hour <= 6)
        return round(night_posts / len(self.user.weibos), 2)
    
    def _get_top_items(self, items, top_n):
        """获取出现频率最高的前N项"""
        from collections import Counter
        counter = Counter(items)
        return [(item, round(count / len(items), 2)) for item, count in counter.most_common(top_n)]
    
    def _calculate_sentiment_stability(self, sentiment_types):
        """计算情感稳定性，值越接近1则情感越稳定"""      
        # 计算情感类型变化频率
        changes = sum(1 for i in range(1, len(sentiment_types)) 
                    if sentiment_types[i] != sentiment_types[i-1])
        return round(1 - (changes / (len(sentiment_types) - 1)), 2)
    
    def _categorize_social_level(self, total_interactions):
        """社交活跃度分类"""
        if total_interactions >= 1000:
            return "高活跃"
        elif total_interactions >= 100:
            return "中活跃"
        else:
            return "低活跃"
    
    def _calculate_activity_trend(self, monthly_activity):
        """计算活动趋势"""
        if len(monthly_activity) < 2:
            return "稳定"
        
        months = sorted(monthly_activity.keys())
        first_month_count = monthly_activity[months[0]]
        last_month_count = monthly_activity[months[-1]]
        
        if last_month_count > first_month_count * 1.5:
            return "上升"
        elif last_month_count < first_month_count * 0.7:
            return "下降"
        else:
            return "稳定"
    
    def _calculate_consistency_score(self, monthly_activity):
        """计算一致性得分"""
        if not monthly_activity:
            return 0
        
        values = list(monthly_activity.values())
        mean_value = sum(values) / len(values)
        variance = sum((x - mean_value) ** 2 for x in values) / len(values)
        return 1 / (1 + variance)  # 归一化
    
    def get_summary(self):
        """获取用户画像摘要"""
        return {
            'basic_info': self.basic_info,
            'behavior_patterns': self.behavior_patterns,
            'content_preferences': self.content_preferences,
            'emotional_profile': self.emotional_profile,
            'social_network': self.social_network,
            # 'activity_patterns': self.activity_patterns
        }
    
    def get_user_type(self):
        """获取用户类型标签"""
        labels = []
        
        # 基于活跃度
        if self.basic_info['avg_weibos_per_day'] > 2:
            labels.append("高频用户")
        elif self.basic_info['avg_weibos_per_day'] < 0.1:
            labels.append("低频用户")
        else:
            labels.append("中频用户")
        
        # 基于情感特征
        if self.emotional_profile.get('positive_ratio', 0) > 0.6:
            labels.append("积极用户")
        elif self.emotional_profile.get('negative_ratio', 0) > 0.6:
            labels.append("消极用户")
        
        # 基于社交活跃度
        labels.append(self.social_network['social_activity_level'])
        
        # 基于活动时间
        if self.behavior_patterns.get('night_activity_ratio', 0) > 0.3:
            labels.append("夜猫子")
        
        return labels

In [17]:
profile_list = [UserProfile(user) for user in user_dict.values()]

Error parsing sentiment type: too many values to unpack (expected 2)
Error parsing sentiment type: too many values to unpack (expected 2)
Error parsing sentiment type: too many values to unpack (expected 2)


UnboundLocalError: cannot access local variable 'main_type' where it is not associated with a value

In [8]:
from collections import Counter
counter = Counter([weibo.created_at.strftime('%A') for weibo in user_dict["1000129923"].weibos])

len(counter)

7

In [66]:
empty_user_ids = set()

for uid, analysis in sentiment_analysis_dict.items():
    for wid, info in analysis.items():
        comprehensive_sentiment = info["comprehensive_sentiment"]
        if comprehensive_sentiment:
            try:
                sentiment_type = comprehensive_sentiment["type"]
                if '/' in sentiment_type:
                    print(f"{uid} 的 {wid} 微博情感分析结果为：{sentiment_type}")
                elif '-' in sentiment_type:
                    main_type, sub_type = sentiment_type.split('-')
                else:
                    main_type = sentiment_type
                    sub_type = ''
            except Exception as e:
                print(f"！{uid} 的 {wid} 的情感种类分析结果：{sentiment_type}")

        else:
            print(f"{uid} 的 {wid} 缺少情感分析数据")
            empty_user_ids.add(uid)

1221391872 的 5149955321496830 微博情感分析结果为：积极-喜悦/消极-不适
1221391872 的 5151118159512679 微博情感分析结果为：消极-焦虑/积极-乐观
1428514052 的 5045670788467661 微博情感分析结果为：积极-支持/消极-批评
1428514052 的 5046036165560158 微博情感分析结果为：积极-喜悦/中立-分析
1428514052 的 5065991728398678 微博情感分析结果为：消极-厌恶/积极-祝贺
1428514052 的 5129274820923698 微博情感分析结果为：中立-思考/消极-哀愁
1576755003 的 5134967474488203 微博情感分析结果为：混合-喜悦/抱怨
1576755003 的 5136174091865568 微博情感分析结果为：混合-疲惫/满足
1576755003 的 5136984051286492 微博情感分析结果为：混合-快乐/不适
1576755003 的 5137316345551657 微博情感分析结果为：混合-满足/失望
1576755003 的 5140411103054209 微博情感分析结果为：混合-无奈/喜爱
1585893207 的 5146766991951950 微博情感分析结果为：积极-喜爱/消极-愤怒
1612997921 的 5147490429699824 微博情感分析结果为：消极-哀愁/积极-喜悦
1707143432 的 5138967180611963 微博情感分析结果为：消极-愤怒/积极-满足
1707143432 的 5140522952819371 微博情感分析结果为：消极-沮丧/积极-满足
1707143432 的 5142749108506889 微博情感分析结果为：消极-无奈/积极-幽默
1710096575 的 5135856215525979 微博情感分析结果为：积极-喜悦/消极-哀愁
1710096575 的 5135881862382985 微博情感分析结果为：中立-恢复/消极-痛苦
1710096575 的 5136973920731294 微博情感分析结果为：积极-喜悦/消极-担忧
1710096575 的 51376942195720

In [63]:
error_user_ids.extend(empty_user_ids)

error_user_ids, len(error_user_ids)

(['1001430012',
  '1459678264',
  '1503453404',
  '1559322502',
  '1568298501',
  '1613106074',
  '1616697935',
  '1632664400',
  '1711730301',
  '1740694410',
  '1790654275',
  '1801338763',
  '1825825585',
  '1834416354',
  '1874193997',
  '1924798183',
  '1950063621',
  '1959239071',
  '2132576657',
  '2146496333',
  '2248101503',
  '2262471222',
  '2456280585',
  '2543209313',
  '2675302121',
  '2698237391',
  '2718070573',
  '2898049334',
  '2963046343',
  '2977278001',
  '2982804325',
  '2983965043',
  '3033830921',
  '3121541957',
  '3172291161',
  '3199686834',
  '3205121724',
  '3219416377',
  '3303375507',
  '3392500842',
  '3495177675',
  '3544134360',
  '3631024543',
  '3730739530',
  '3733806227',
  '3859337249',
  '3913469681',
  '5034739591',
  '5119132479',
  '5126203675',
  '5135867091',
  '5144443546',
  '5175857751',
  '5180815777',
  '5219300080',
  '5242886516',
  '5244152631',
  '5272845265',
  '5297008194',
  '5301613713',
  '5312784372',
  '5332922696',
  '53556