In [1]:
import pandas as pd

file_path = '事件筛选2023.csv'
df = pd.read_csv(file_path)

# 处理缺失值
df['大事件'].fillna('未知', inplace=True)  # 填充缺失值
df['风险类型'].fillna('未知', inplace=True)  # 填充缺失值

# 确保时间列的数据类型
df['监测起始时间'] = pd.to_datetime(df['监测起始时间'], errors='coerce')
df['监测结束时间'] = pd.to_datetime(df['监测结束时间'], errors='coerce')
df['首曝时间'] = pd.to_datetime(df['首曝时间'], errors='coerce')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   事件名称    1000 non-null   object        
 1   大事件     1000 non-null   object        
 2   事件背景    1000 non-null   object        
 3   主关键词    1000 non-null   object        
 4   监测起始时间  1000 non-null   datetime64[ns]
 5   监测结束时间  1000 non-null   datetime64[ns]
 6   首曝时间    1000 non-null   datetime64[ns]
 7   地域      1000 non-null   object        
 8   倾向      1000 non-null   object        
 9   话题领域    1000 non-null   object        
 10  涉事主体    1000 non-null   object        
 11  风险类型    1000 non-null   object        
 12  传播量     1000 non-null   float64       
dtypes: datetime64[ns](3), float64(1), object(9)
memory usage: 101.7+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['大事件'].fillna('未知', inplace=True)  # 填充缺失值
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['风险类型'].fillna('未知', inplace=True)  # 填充缺失值


In [2]:
import numpy as np
import random

# 设置传播量上限
propagation_min = 70000000
propagation_max = 500000000# 根据需要设定

def select(population, fitness_scores):
        
    # 将适应度分数转换为概率分布
    fitness_total = sum(fitness_scores)
    if fitness_total == 0:
        probabilities = np.full(len(population), 1/len(population))
    else:
        probabilities = [score / fitness_total for score in fitness_scores]
    
    # 根据概率选择个体
    selected_index = np.random.choice(len(population), p=probabilities)
    return population[selected_index]
    
# 计算多样性和均衡性方差的函数
def calculate_diversity_and_variance(df, topic_col='话题领域', subject_col='涉事主体', risk_col='风险类型'):
    unique_topics_ratio = df[topic_col].nunique() / 102
    unique_subjects_ratio = df[subject_col].nunique() / 123
    unique_risks_ratio = df[risk_col].nunique() / 19
    
    diversity = unique_topics_ratio * unique_subjects_ratio * unique_risks_ratio
    
    std_topics = df[topic_col].value_counts(normalize=True).std()
    std_subjects = df[subject_col].value_counts(normalize=True).std()
    std_risks = df[risk_col].value_counts(normalize=True).std()
    std_spread = df['传播量'].std()
    
    variance_diversity = std_topics * std_subjects * std_risks * std_spread
    
    objective = diversity / variance_diversity
    
    return objective

# 评估个体适应度的函数
def evaluate(individual, df, propagation_min, propagation_max, topic_col='话题领域', subject_col='涉事主体', risk_col='风险类型'):
    selected_events = individual
    propagation_sum = (selected_events['传播量'] * 0.4 * 0.1).sum()  # 调整传播量的比例

    if propagation_sum < propagation_min or propagation_sum > propagation_max:
        return (0, )
    
    diversity = calculate_diversity_and_variance(selected_events, topic_col, subject_col, risk_col)

    # 倾向权重
    tendency_weight = selected_events['倾向'].isin(['负面', '争议']).mean()
    
    # 大事件覆盖率
    big_events_count = selected_events['大事件'].notnull().sum()
    total_events_count = len(selected_events)
    big_events_coverage = big_events_count / total_events_count if total_events_count > 0 else 0
    
    # 大事件多样性
    big_events = selected_events[selected_events['大事件'].notnull()]
    big_events_diversity = big_events[topic_col].nunique()

    # 综合适应度
    fitness = diversity * tendency_weight * big_events_coverage * big_events_diversity
    return fitness

# 贪心算法生成初始种群，确保不重复选择事件
def generate_initial_population(events_df, max_spread, population_size=10):
    initial_population = []
    
    # 根据传播量排序，选择传播量较小的事件优先
    sorted_events = events_df.sort_values(by="传播量")
    
    for _ in range(population_size):
        selected_events = []  # 用于存储当前组合中的事件
        selected_event_ids = set()  # 用于追踪已选择的事件ID
        spread_sum = 0  # 当前组合的传播量和

        # 按顺序选择事件，确保不重复选择
        for _, event in sorted_events.iterrows():
            # 检查事件是否已被选择，且是否满足传播量限制
            if event['事件名称'] not in selected_event_ids and spread_sum + event['传播量'] <= propagation_max:
                selected_events.append(event)  # 将事件加入组合
                selected_event_ids.add(event['事件名称'])  # 记录事件ID，防止重复
                spread_sum += event['传播量']  # 更新传播量和

                # 确保每个个体包含约100个事件
                if len(selected_events) >= 100:
                    break
        
        # 如果事件数量不足100，继续选择直到达到100个
        while len(selected_events) < 100 and spread_sum < propagation_max:
            for _, event in sorted_events.iterrows():
                if event['事件名称'] not in selected_event_ids and spread_sum + event['传播量'] <= propagation_max:
                    selected_events.append(event)
                    selected_event_ids.add(event['事件名称'])
                    spread_sum += event['传播量']
                    if len(selected_events) >= 100:
                        break
        
        # 将选择的事件集转化为 DataFrame 并加入初始种群
        selected_events_df = pd.DataFrame(selected_events)
        
        # 去重：确保每个“大事件”只保留一个事件
        big_events = selected_events_df[selected_events_df['大事件'].notnull()]
        if len(big_events) > 1:
            big_events = big_events.drop_duplicates(subset=['事件名称'], keep='first')
            selected_events_df = pd.concat([selected_events_df[~selected_events_df['事件名称'].isin(big_events['事件名称'])], big_events])
        
        # 将选择的事件集转化为DataFrame并加入初始种群
        initial_population.append(pd.DataFrame(selected_events_df))

    return initial_population

# 交叉操作，确保生成的新个体不包含重复事件
def crossover(parent1, parent2):
    # 随机选择交叉点
    cross_point = random.randint(0, len(parent1) - 1)
    
    # 使用交叉点前后的事件组合创建子代，并去重
    child1_events = pd.concat([parent1.iloc[:cross_point], parent2.iloc[cross_point:]]).drop_duplicates(subset='事件名称')
    child2_events = pd.concat([parent2.iloc[:cross_point], parent1.iloc[cross_point:]]).drop_duplicates(subset='事件名称')
    
    # 去重：确保每个“大事件”只保留一个事件
    child1_big_events = child1_events[child1_events['大事件'].notnull()]
    if len(child1_big_events) > 1:
        child1_big_events = child1_big_events.drop_duplicates(subset=['事件名称'], keep='first')
        child1_events = pd.concat([child1_events[~child1_events['事件名称'].isin(child1_big_events['事件名称'])], child1_big_events])
    
    child2_big_events = child2_events[child2_events['大事件'].notnull()]
    if len(child2_big_events) > 1:
        child2_big_events = child2_big_events.drop_duplicates(subset=['事件名称'], keep='first')
        child2_events = pd.concat([child2_events[~child2_events['事件名称'].isin(child2_big_events['事件名称'])], child2_big_events])
    
    # 返回两个子代
    return child1_events, child2_events

# 变异操作，确保新的事件不会重复选择
def mutate(individual, events_df, mutation_rate=0.1):
    selected_event_ids = set(individual['事件名称'])  # 获取当前个体中的事件ID
    for i in range(len(individual)):
        if random.random() < mutation_rate:
            # 从整个事件集随机选择一个未被选择的事件
            new_event = events_df[~events_df['事件名称'].isin(selected_event_ids)].sample(1).iloc[0]
            individual.iloc[i] = new_event  # 将当前事件替换为新事件
            selected_event_ids.add(new_event['事件名称'])  # 更新选定事件的ID集合

    # 去重：确保每个“大事件”只保留一个事件
    big_events = individual[individual['大事件'].notnull()]
    if len(big_events) > 1:
        big_events = big_events.drop_duplicates(subset=['事件名称'], keep='first')
        individual = pd.concat([individual[~individual['事件名称'].isin(big_events['事件名称'])], big_events])
    return individual

def elitist_selection(population, fitness_scores, elite_size):
    # 将个体与其适应度分数配对
    paired_population = list(zip(population, fitness_scores))
    # 按适应度分数排序
    paired_population.sort(key=lambda x: x[1], reverse=True)
    # 选择适应度最高的elite_size个个体
    elite_individuals = [individual for individual, score in paired_population[:elite_size]]
    return elite_individuals

# 遗传算法主函数
def genetic_algorithm(events_df, propagation_min, propagation_max, generations=100, population_size=200, mutation_rate=0.1, elite_size = 10):
    # 初始种群生成
    population = generate_initial_population(events_df, propagation_max, population_size)
    
    for generation in range(generations):
        # 计算每个个体的适应度分数
        fitness_scores = [evaluate(individual, events_df, propagation_min, propagation_max) for individual in population]
        fitness_scores = [score if isinstance(score, (int, float)) else score[0] for score in fitness_scores]

        if len(fitness_scores) != len(population):
            raise ValueError("Fitness scores length does not match population length")

        # 精英选择策略
        elite_individuals = elitist_selection(population, fitness_scores, elite_size)
        
        new_population = elite_individuals.copy()
        
        # 生成下一代个体
        while len(new_population) < population_size:
            # 选择父代
            parent1 = select(population, fitness_scores)
            parent2 = select(population, fitness_scores)
            
            # 交叉操作生成子代，并确保去重
            child1, child2 = crossover(parent1, parent2)
            
            # 变异操作，确保事件唯一性
            child1 = mutate(child1, events_df, mutation_rate)
            child2 = mutate(child2, events_df, mutation_rate)
            
            new_population.extend([child1, child2])  # 添加新个体至种群
        
        # 更新种群
        population = new_population[:population_size]
    
    # 找到适应度最高的个体
    fitness_scores = [evaluate(individual, events_df, propagation_min, propagation_max) for individual in population]
    best_individual = population[fitness_scores.index(max(fitness_scores))]
    
    return best_individual

# 执行遗传算法，输出最优组合
best_dataset = genetic_algorithm(df, propagation_min, propagation_max, elite_size = 10)

# 计算筛选后的多样性和传播量
final_diversity = calculate_diversity_and_variance(best_dataset, topic_col='话题领域', subject_col='涉事主体', risk_col='风险类型')
final_propagation_sum = (best_dataset['传播量'] * 0.4 * 0.1).sum()

# 计算大事件的个数
big_events_count = best_dataset[best_dataset['大事件'].notnull()].shape[0]

# 计算三种多样性数值
unique_topics_diversity = best_dataset['话题领域'].nunique() / 102
unique_subjects_diversity = best_dataset['涉事主体'].nunique() / 123
unique_risks_diversity = best_dataset['风险类型'].nunique() / 19

# 计算总事件数
total_events_count = best_dataset.shape[0]

# 计算倾向中各类别的占比
tendency_counts = best_dataset['倾向'].value_counts(normalize=True)

# 输出筛选后的Dataset及其相关统计信息
print("筛选后的Dataset:")
print(best_dataset)
print("多样性 (Diversity):", final_diversity)
print("传播量 (Propagation Sum):", final_propagation_sum)
print("大事件个数:", big_events_count)
print("话题领域多样性:", unique_topics_diversity)
print("涉事主体多样性:", unique_subjects_diversity)
print("风险类型多样性:", unique_risks_diversity)
print("总事件数:", total_events_count)
print("倾向占比:")
print(tendency_counts)

# 保存筛选后的数据集为 Excel 文件
output_file_path = '筛选后的事件组合2023.xlsx'
best_dataset.to_excel(output_file_path, index=False)

# 输出筛选结果和文件保存路径
print("筛选后的Dataset已保存为Excel文件：", output_file_path)
print("多样性 (Diversity):", final_diversity)
print("传播量 (Propagation Sum):", final_propagation_sum)

筛选后的Dataset:
                         事件名称   大事件  \
999         洪都拉斯寻求与中国正式建立外交关系    未知   
998           我国自主研发的新一代CPU发布     　   
997            杭州亚运会中国体育代表团成立    未知   
996       印度“月船3号”月球探测器着陆月球表面    未知   
995     十三届全国人大常委会第三十九次会议在京举行    未知   
..                        ...   ...   
904  习近平主席出席金砖国家领导人巴以问题特别视频峰会  巴以冲突   
903              华为15亿成立房地产公司    未知   
902       侯友宜获提名参选2024台湾地区领导人    未知   
901           一中国香港籍货轮在长崎近海沉没    未知   
900             山西安泽施工事故致7人遇难     　   

                                                  事件背景  \
999           3月15日，洪都拉斯总统发文称，已指示外长寻求与中华人民共和国建立正式外交关系。   
998  11月28日，2023龙芯产品发布暨用户大会在北京举行。大会发布了新一代通用处理器龙芯3A6...   
997  杭州第19届亚运会中国体育代表团成立大会12日在北京举行。出征杭州亚运会的中国体育代表团共有...   
996   8月23日新德里消息：印度空间研究组织当地时间23日宣布，“月船3号”月球探测器已着陆月球表面。   
995  2月23日上午，十三届全国人大常委会第三十九次会议在北京人民大会堂举行第一次全体会议。经审查...   
..                                                 ...   
904  11月21日晚，国家主席习近平出席金砖国家领导人巴以问题特别视频峰会并发表题为《推动停火止战...   
903  据国家企业信用信息系统显示，8月3日，华为投资控股有限公司出资成立东莞棠雅实业投资有限公司，...