In [14]:
import json
import pymysql

# Load the notebook configuration (the "database" section is consumed by
# execute_sql). The encoding is pinned so that parsing does not depend on the
# platform's default locale encoding.
with open('weibo_config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

def execute_sql(sql, params=None):
    """Execute one SQL statement and return every row the cursor fetched.

    A new connection is opened on each call using the keyword arguments stored
    under ``config['database']``.

    Args:
        sql: The statement text handed to ``cursor.execute``.
        params: Optional parameters forwarded unchanged to ``cursor.execute``.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the statement.

    Raises:
        Exception: Any error raised while connecting or executing is printed
            to stdout and then re-raised unchanged.
    """
    try:
        # NOTE(review): what leaving the connection's ``with`` block does
        # (close vs. commit) depends on the installed PyMySQL version - confirm.
        with pymysql.connect(**config['database']) as connection, connection.cursor() as cur:
            cur.execute(sql, params)
            return cur.fetchall()
    except Exception as err:
        print(f"执行SQL语句时出现错误：{err}")
        raise

# Query selecting wid/text/created_at from weibo_cleaned for rows with
# done != 1 whose uid matches one user_cleaned row with done = 0.
# The extra derived table (AS temp) is presumably there to allow LIMIT inside
# the IN subquery - TODO confirm against the MySQL version in use.
# NOTE(review): this string is overwritten by the next assignment before it is
# ever passed to execute_sql, so it currently has no effect.
sql = """
SELECT wid, text, created_at 
FROM weibo_cleaned
WHERE done != 1 AND uid IN (
    SELECT uid FROM (
        SELECT uid FROM user_cleaned WHERE done = 0 LIMIT 1
    ) AS temp
)
"""

# Query that is actually executed: formats the created_at value of one row of
# the weibo table with DATE_FORMAT.
sql = """
SELECT DATE_FORMAT(created_at, '%Y-%m-%dT%H:%i:%S') FROM weibo LIMIT 1;
"""

# Bare last expression, so the notebook displays the fetched rows.
execute_sql(sql)

(('2024-06-01T00:00:06',),)

In [3]:
import json

file_name = "test100cleaned.json"

# Parse the whole JSON file into memory; only the values of the top-level
# mapping are consumed below.
with open(file_name, 'r', encoding='utf-8') as f:
    total_dict: dict = json.load(f)

# One record per user with two entries:
#   "元数据"   -> (user_id, screen_name, IP, "男" when sex == 'm', otherwise "女")
#   "微博序列" -> one (text, created_at, (attitudes, comments, reposts)) tuple
#                per entry of the user's "weibo" list
user_list = []
for user_info in total_dict.values():
    user = {
        "元数据": (
            user_info["user_id"],
            user_info["screen_name"],
            user_info["IP"],
            "男" if user_info["sex"] == 'm' else "女",
        ),
        "微博序列": [
            (
                w["text"],
                w["created_at"],
                (w["attitudes_count"], w["comments_count"], w["reposts_count"]),
            )
            for w in user_info["weibo"]
        ],
    }
    user_list.append(user)



In [4]:
# Per-user text volume: (screen name, total characters, post count, average
# characters per post), sorted by total characters in descending order.
user_weibo_len_list = []
for user in user_list:
    # Bind the name before entering the try block: if the metadata lookup
    # itself fails, the error message must not reference an unbound name or
    # the previous user's name.
    user_name = "<unknown>"
    try:
        user_name = user["元数据"][1]
        total_weibo_len = sum(len(w[0]) for w in user["微博序列"])
        weibo_count = len(user["微博序列"])
        # Raises ZeroDivisionError for a user without posts; that user is
        # reported below and left out of the list.
        avg_weibo_len = total_weibo_len / weibo_count
        user_weibo_len_list.append((user_name, total_weibo_len, weibo_count, avg_weibo_len))
    except Exception as e:
        # Best effort: report the failing user and continue with the rest.
        print(f"处理{user_name}时出现错误：{e}")
user_weibo_len_list.sort(key=lambda x: x[1], reverse=True)

user_weibo_len_list

[('宁港升腾', 56389, 50, 1127.78),
 ('作家戴明', 49159, 50, 983.18),
 ('姑苏三先生', 47567, 50, 951.34),
 ('曦月儿', 34297, 49, 699.9387755102041),
 ('云杉413', 34296, 50, 685.92),
 ('心理咨询师查尔斯', 30125, 50, 602.5),
 ('蔚蓝尔蓝', 25493, 39, 653.6666666666666),
 ('buorong', 18845, 35, 538.4285714285714),
 ('dj热情三分爱七分', 17617, 50, 352.34),
 ('布衣锦时', 16810, 50, 336.2),
 ('林中跬步', 15423, 50, 308.46),
 ('Wqlmlmj', 15344, 37, 414.7027027027027),
 ('我是快乐神', 12954, 50, 259.08),
 ('cailiannv12', 12548, 50, 250.96),
 ('二十三荳', 12427, 49, 253.6122448979592),
 ('新镇闻杰', 12388, 50, 247.76),
 ('吾东隅已逝', 11971, 50, 239.42),
 ('果然有才', 11865, 50, 237.3),
 ('言一声医事Y1314', 9105, 50, 182.1),
 ('耗子小笨笨', 8729, 42, 207.83333333333334),
 ('因圆天地间', 8304, 50, 166.08),
 ('倾山啊倾山', 7627, 48, 158.89583333333334),
 ('smile__Juan', 7585, 34, 223.08823529411765),
 ('anuan668_238', 6934, 49, 141.51020408163265),
 ('人生本由我', 6884, 50, 137.68),
 ('香草与巧克力zy', 6397, 50, 127.94),
 ('哈吉艾打嗝', 6326, 50, 126.52),
 ('V8纯电动九菜合子特别有理想', 6204, 48, 129.25),
 ('子绿

In [5]:
# Display how many user records user_list holds.
len(user_list)

97

In [8]:
# Maximum number of characters allowed in one user's merged post text.
max_length = 1000
# NOTE(review): placeholder only; this name is rebound later in this cell to
# the return value of hybrid_select_weibos.
user_selected_content = {}

def hybrid_select_weibos(
    weibo_content: dict,
    max_length: int = 1000,
    important_ratio: float = 0.8,
) -> dict:
    """
    Condense each user's posts into one "|"-joined string of bounded length.

    Users whose posts already fit into ``max_length`` (separators included)
    are joined unchanged. For everyone else the character budget is split in
    two parts:

    1. ``important_ratio`` of the budget holds whole posts, picked in
       descending order of a length-based importance score.
    2. The rest of the budget holds a proportionally truncated prefix of
       every post that was not picked in step 1, in original order.

    Args:
        weibo_content: Mapping of user id to a list of post texts.
        max_length: Maximum number of characters of each merged string.
        important_ratio: Fraction (0.0 to 1.0) of the budget reserved for
            whole "important" posts. Defaults to 0.8.

    Returns:
        Mapping of user id to the merged text.

    Raises:
        ValueError: If ``important_ratio`` is outside the range 0.0 to 1.0.
    """
    if not 0.0 <= important_ratio <= 1.0:
        raise ValueError("important_ratio must be between 0.0 and 1.0")

    separator = "|"

    def get_importance_score(weibo: str) -> float:
        """
        Score a post by its length only.

        - shorter than 50 characters: score grows linearly with length
        - longer than 300 characters: score shrinks as length grows
        - otherwise: 1.0
        """
        length = len(weibo)
        if length < 50:
            return length / 50
        if length > 300:
            return 300 / length
        return 1.0

    def select_important_weibos(weibos: list, target_length: int) -> list:
        """
        Pick whole posts in descending score order while they fit the budget.

        A post that does not fit is skipped rather than ending the selection,
        so lower-ranked posts can still use the space that is left.
        """
        chosen = []
        used = 0
        # sorted() is stable, so posts with equal scores keep their input order.
        for weibo in sorted(weibos, key=get_importance_score, reverse=True):
            cost = len(weibo) + len(separator)
            if used + cost < target_length:
                chosen.append(weibo)
                used += cost
        return chosen

    def sample_remaining_weibos(weibos: list, target_length: int) -> list:
        """
        Keep the posts in order, truncating each to the same share of its length.
        """
        if not weibos:
            return []

        separators = len(weibos) - 1
        total_chars = sum(len(w) for w in weibos)
        if total_chars + separators <= target_length:
            return weibos

        # Characters left for text once the separators are paid for.
        budget = target_length - separators
        if budget <= 0 or total_chars == 0:
            # Nothing can be kept; this also avoids dividing by zero when
            # every remaining post is empty.
            return []

        result = []
        current_length = 0
        for weibo in weibos:
            # Integer arithmetic keeps the proportional cut exact (no float
            # rounding surprises at integer boundaries).
            keep_length = len(weibo) * budget // total_chars
            if keep_length > 0:
                result.append(weibo[:keep_length])
                current_length += keep_length + 1
                if current_length >= target_length:
                    break

        return result

    selected_content = {}

    for user_id, weibos in weibo_content.items():
        # Total length including the separators between posts.
        total_length = sum(len(weibo) for weibo in weibos) + (len(weibos) - 1)

        # Everything fits: keep the posts untouched.
        if total_length <= max_length:
            selected_content[user_id] = separator.join(weibos)
            continue

        # Reserve two separator characters for the final merge.
        effective_length = max_length - 2 * len(separator)
        important_length = int(effective_length * important_ratio)
        remaining_length = effective_length - important_length

        important_weibos = select_important_weibos(weibos, important_length)

        # Exclude picked posts by value; a set makes each lookup O(1).
        picked = set(important_weibos)
        remaining_weibos = [w for w in weibos if w not in picked]
        sampled_weibos = sample_remaining_weibos(remaining_weibos, remaining_length)

        final_text = separator.join(important_weibos + sampled_weibos)
        if len(final_text) > max_length:
            if max_length > 3:
                final_text = final_text[:max_length - 3] + "..."
            else:
                # Too short for an ellipsis; fall back to a plain cut.
                final_text = final_text[:max(max_length, 0)]

        selected_content[user_id] = final_text

    return selected_content

# Demo input: the post texts of one hard-coded user taken from total_dict.
sample_user_id = "1100949262"
weibo_content = {sample_user_id: [w["text"] for w in total_dict[sample_user_id]["weibo"]]}

# Usage example: condense the posts of every user present in weibo_content
# (currently only the single sample user above).
user_selected_content = hybrid_select_weibos(weibo_content, max_length)

# Sanity check: for up to five users, print the character count before and
# after selection together with the merged text.
for user_id, content in list(user_selected_content.items())[:5]:
    before = sum(len(w) for w in weibo_content[user_id])
    after = len(content)
    print(f"用户 {user_id} 的原微博内容长度：{before}，采样后内容长度: {after}")
    print(f"内容：{content}")

用户 1100949262 的原微博内容长度：56389，采样后内容长度: 540
内容：每日健康问答|不同年龄段人群所需睡眠时长是多少？ 全国爱卫办近日发布《睡眠健康核心信息及释义》，推荐成年人晚上10至11点入睡，早晨6至7点起床，其中老年人可早晨5至6点起床。 此信息及释义提出，不同年龄段人群所需睡眠时长不同，且因人而异。一般来说，学龄前儿童每天需要10至13小时，中小学生8至10小时，成年人7至8小时，老年人6至7小时。 根据此信息及释义，良好睡眠质量通常表现为：入睡时间在30分钟以内（6岁以下儿童在20分钟以内）；夜间醒来的次数不超过3次，且醒来后能在20分钟内再次入睡；醒来后感到精神饱满、心情愉悦、精力充沛、注意力集中。 根据此信息及释义，规律作息、劳逸结合、适量运动以及安静舒适的睡眠环境有助于保持良好睡眠。应避免熬夜、睡前饮酒喝茶，晚餐不宜过饱过晚，睡前尽量不要刷手机。（记者董瑞丰 李恒）|每|每日|每日|每日健康|每|每日健康|每日健|每日|每日健|每日|每日|每日|每|每日健康|#妇女|每日健|每日健|每日健|每日|每日健|每日健康|每日健|每日|每日|每日健康问|每日健|每日|每日健康问|每|每|每日健康|每日健|每日|每日健|每日健康|每日健|每|每日|每日健康|每日|每日|每日|每日|每日健|每日健|每日|每日健|每日健|每日健


In [1]:
import os

from openai import OpenAI

# SECURITY: the API key is read from the environment so that it is never
# stored in the notebook or its version history. Any key that was previously
# embedded in this cell must be treated as leaked and rotated.
api_key = os.environ.get("DEEPSEEK_API_KEY")
if not api_key:
    raise RuntimeError("Set the DEEPSEEK_API_KEY environment variable before running this cell.")

client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

# Single non-streaming chat completion: the system prompt asks for
# Chinese-to-English translation of the user message.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": "你是一个专业的英语翻译官，请将我给出的中文翻译成英文"},
        {"role": "user", "content": "念天地之悠悠，独怆然而涕下"},
    ],
    stream=False
)

print(response.choices[0].message.content)

I find no man in the world who knows me; Oh where can I find a friend to know my heart?
