In [156]:
import polars as pl


# Read the combined papers table and shuffle every row with a fixed seed,
# so the row order is the same on each run.
wide_df = pl.read_parquet("website/combined_papers.parquet").sample(
    fraction=1.0, shuffle=True, seed=42
)
# Print the schema and then the column names, each on its own line.
print(wide_df.schema, wide_df.columns, sep="\n")

Schema({'title': String, 'authors': List(String), 'institution': List(String), 'problem_background': String, 'method': String, 'experiment': String, 'one_sentence_summary': String, 'slug': String, 'keywords': List(String), 'further_thoughts': String, 'model': String, 'temperature': Float64, 'top_p': Float64, 'lang': String, 'id': String, 'preference': String, 'summary_time': String, 'score': Float64, 'abstract': String, 'categories': List(String), 'created': String, 'updated': String, 'source_file': String})
['title', 'authors', 'institution', 'problem_background', 'method', 'experiment', 'one_sentence_summary', 'slug', 'keywords', 'further_thoughts', 'model', 'temperature', 'top_p', 'lang', 'id', 'preference', 'summary_time', 'score', 'abstract', 'categories', 'created', 'updated', 'source_file']


In [157]:
# ASCII unit-separator control character, used to glue list items into one string.
SAFE_DELIMITER = "\x1f"

# Switch to the lazy API. The name is rebound on purpose: a later cell calls
# `.collect()` on `wide_df`.
wide_df = wide_df.lazy()

# Turn the list and float columns into strings before unpivoting:
#   * list columns are joined on SAFE_DELIMITER;
#   * float columns are rounded to 6 decimal places and cast to string, so any
#     digits beyond the 6th decimal are dropped in the long table.
format_df = wide_df.with_columns(
    [
        pl.col(list_col).list.join(SAFE_DELIMITER)
        for list_col in ("authors", "institution", "keywords", "categories")
    ]
).with_columns(
    [
        pl.col(float_col).round(6).cast(pl.Utf8)
        for float_col in ("score", "top_p", "temperature")
    ]
)

# Long layout keyed by `id`: one row per (id, column) pair.
long_df = format_df.unpivot(index="id").collect()
long_df.write_csv("raw.csv")
long_df

id,variable,value
str,str,str
"""2502.14866""","""title""","""LServe: Efficient Long-sequenc…"
"""2503.08727""","""title""","""Training Plug-n-Play Knowledge…"
"""2504.21018""","""title""","""HYPEROFA: Expanding LLM Vocabu…"
"""2502.05945""","""title""","""HSI: Head-Specific Interventio…"
"""2504.02263""","""title""","""MegaScale-Infer: Serving Mixtu…"
…,…,…
"""2503.23798""","""source_file""","""2503.23798.json"""
"""2504.18413""","""source_file""","""2504.18413.json"""
"""2504.21801""","""source_file""","""2504.21801.json"""
"""2503.01713""","""source_file""","""2503.01713.json"""


In [158]:
# Delimiter used when the list columns were joined into single strings.
SAFE_DELIMITER = "\x1f"
# Names of the columns that have to be converted back to lists.
list_cols_to_revert = ["authors", "institution", "keywords", "categories"]
# Names of the columns that have to be converted back to floats.
float_cols_to_revert = ["score", "top_p", "temperature"]

# Long -> wide: one row per `id`, one column per distinct `variable`,
# with the cells filled from `value`.
pivoted_df = long_df.pivot("variable", index="id", values="value")
pivoted_df

id,title,authors,institution,problem_background,method,experiment,one_sentence_summary,slug,keywords,further_thoughts,model,temperature,top_p,lang,preference,summary_time,score,abstract,categories,created,updated,source_file
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""2502.14866""","""LServe: Efficient Long-sequenc…","""Shang YangJunxian GuoHaotian…","""MITShanghai Jiao Tong Univers…","""大型语言模型（LLMs）在处理长序列和复杂推理任务时表现出色…","""* **核心思想：** LServe 通过统一块稀疏注意力框…","""* **实验设置：** 本文使用Llama-3-8B、Min…","""本文提出LServe系统，通过统一块稀疏注意力机制结合静态和…","""lserve-sparse-attention""","""LLMSparse AttentionBlock Spa…","""这项工作突显了稀疏注意力的潜力，不仅可以扩展到多模态模型（如…","""grok-3-mini-latest""","""0.5""","""0.7""","""zh""","""dislike""","""2025-05-04T08:26:52.160873+00:…","""0.830707""","""Large language models (LLMs) h…","""cs.CLcs.AIcs.DCcs.LGcs.PF""","""2025-04-21""","""2025-04-22""","""2502.14866.json"""
"""2503.08727""","""Training Plug-n-Play Knowledge…","""Lucas CacciaAlan AnsellEdoar…","""Microsoft Research MontrealUn…","""大型语言模型（LLM）在海量语料上预训练后，能够捕获广泛的语…","""* **核心思想：** 本文提出深度上下文蒸馏（Deep…","""* **数据集和模型：** 实验使用 QuALITY（多…","""本文提出使用深度上下文蒸馏训练可插拔知识模块的方法，能够在低…","""plug-and-play-knowledge-module…","""LLMKnowledge ModuleDeep Cont…","""这个模块化方法强调了知识注入的灵活性，可能在隐私保护和高效推…","""grok-3-mini-latest""","""0.5""","""0.7""","""zh""","""unknown""","""2025-05-04T08:28:09.998715+00:…","""0.690604""","""Dynamically integrating new or…","""cs.LGcs.AI""","""2025-04-29""","""2025-04-30""","""2503.08727.json"""
"""2504.21018""","""HYPEROFA: Expanding LLM Vocabu…","""Enes ÖzerenYihong LiuHinrich…","""LMU MunichMunich Center for M…","""多语言预训练语言模型（PLMs）在中等和低资源语言上的性能 …","""*核心思想:* 使用超网络学习从外部多语言词向量空间到PLM…","""*实验设置:* 在RoBERTa和XLM-R上扩展词汇，比较…","""本文提出基于超网络的HYPEROFA方法，用于初始化新语言令…","""hyperofa-embedding-initializat…","""HypernetworkEmbedding Initial…","""HYPEROFA的方法突显了超网络在嵌入初始化中的灵活性，可…","""grok-3-mini-latest""","""0.5""","""0.7""","""zh""","""unknown""","""2025-05-04T08:33:03.377385+00:…","""0.629196""","""Many pre-trained language mode…","""cs.CLcs.LG""","""2025-04-21""","""2025-05-01""","""2504.21018.json"""
"""2502.05945""","""HSI: Head-Specific Interventio…","""Paul DarmAnnalisa Riccardi""","""University of Strathclyde""","""大型语言模型（LLMs）在各种领域的广泛应用使得安全对齐变得…","""* **核心思想:** 通过在模型的注意力头级别进行细粒度激…","""* **实验设置:** 使用Llama 2 7b模型和Ant…","""本文提出Head-Specific Intervention…","""head-specific-intervention-hsi""","""LLMAttention HeadsActivation…","""本文揭示了注意力头激活的线性可分性，这可能启发更细粒度的模型…","""grok-3-mini-latest""","""0.5""","""0.7""","""zh""","""unknown""","""2025-05-04T08:27:44.573145+00:…","""0.78968""","""Robust alignment guardrails fo…","""cs.CLcs.AI""","""2025-05-01""","""2025-05-02""","""2502.05945.json"""
"""2504.02263""","""MegaScale-Infer: Serving Mixtu…","""Ruidong ZhuZiheng JiangChao …","""ByteDance SeedPeking Universi…","""混合专家（MoE）模型在扩展大型语言模型（LLM）时展示了巨…","""* **核心思想：** 通过分离注意力模块和FFN模块（即专…","""* **实验设置：** 使用Mixtral-8×22B、DB…","""本文提出MegaScale-Infer系统，通过分离注意力模…","""megascale-infer-disaggregated-…","""LLMMixture-Of-ExpertsDisaggr…","""这项工作突出了资源分离在AI推理中的潜力，或许可以扩展到其他…","""grok-3-mini-latest""","""0.5""","""0.7""","""zh""","""unknown""","""2025-05-04T08:30:03.321848+00:…","""0.706508""","""Mixture-of-Experts (MoE) showc…","""cs.DCcs.LG""","""2025-04-23""","""2025-04-24""","""2504.02263.json"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2503.23798""","""Adaptive Layer-skipping in Pre…","""Xuan LuoWeizhi WangXifeng Ya…","""University of California, Sant…","""大型语言模型（LLMs）在生成每个token时通常进行完整的…","""*核心思想:* FlexiDepth是一种插件式方法，旨在在…","""*实验设置:* 本文在Llama-3-8B-Instruct…","""本文提出FlexiDepth方法，通过插件式路由器和适配器实…","""adaptive-layer-skipping-llms""","""Large Language ModelsLayer Sk…","""FlexiDepth的层跳过机制启发我们思考LLM内部计算的…","""grok-3-mini-latest""","""0.5""","""0.7""","""zh""","""unknown""","""2025-05-04T08:28:28.330351+00:…","""0.625531""","""Various layer-skipping methods…","""cs.CLcs.AI""","""2025-04-17""","""2025-04-21""","""2503.23798.json"""
"""2504.18413""","""An Empirical Study of Evaluati…","""Ning XianYixing FanRuqing Zh…","""Institute of Computing Technol…","""长形式问题回答（LFQA）旨在为复杂问题生成长篇答案，随着大…","""本文采用实证研究方法，核心思想是通过元评估（meta-eva…","""实验使用ASQA（歧义事实QA）、ANTIQUE（非事实开放…","""本文实证研究了长形式问题回答的自动评估指标，证明了基于LLM…","""empirical-study-lfqa-evaluatio…","""LLMEvaluation MetricsQuestio…","""本文的研究强调了LLM在评估中的潜力，但也揭示了偏差问题，这…","""grok-3-mini-latest""","""0.5""","""0.7""","""zh""","""unknown""","""2025-05-04T08:31:29.169100+00:…","""0.557759""","""\Ac{LFQA} aims to generate len…","""cs.IR""","""2025-04-25""","""2025-04-28""","""2504.18413.json"""
"""2504.21801""","""DeepSeek-Prover-V2: Advancing …","""Z. Z. RenZhihong ShaoJunxiao…","""DeepSeek-AI""","""大型语言模型（LLMs）在非正式数学推理中表现出色，能够处理…","""* **核心思想：** 通过将复杂定理分解为子目标，并结合强…","""* **数据集和实验设置：** 本文在多个基准上评估模型，包…","""本文提出DeepSeek-Prover-V2，通过子目标分解…","""deepseek-prover-v2""","""Formal Theorem ProvingReinfor…","""本文的方法展示了如何通过强化学习和子目标分解来桥接非正式和正…","""grok-3-mini-latest""","""0.5""","""0.7""","""zh""","""unknown""","""2025-05-04T08:32:56.817665+00:…","""0.569836""","""We introduce DeepSeek-Prover-V…","""cs.CLcs.AI""","""2025-04-30""","""2025-05-01""","""2504.21801.json"""
"""2503.01713""","""SAGE: A Framework of Precise R…","""Jintao ZhangGuoliang LiJinya…","""Tsinghua University""","""检索增强生成（RAG）技术在特定语料库上的问答（QA）任务中…","""* **核心思想：** SAGE框架旨在通过改进检索阶段…","""* **数据集和设置：** 使用NarrativeQA、…","""本文提出SAGE框架，通过语义分割、基于梯度的块选择和LLM…","""sage-precise-retrieval-for-rag""","""RAGSemantic SegmentationChun…","""SAGE框架的语义分割和动态块选择机制可能扩展到多模态检索领…","""grok-3-mini-latest""","""0.5""","""0.7""","""zh""","""unknown""","""2025-05-04T08:27:34.874065+00:…","""0.641307""","""Retrieval-augmented generation…","""cs.LGcs.AIcs.DBcs.IR""","""2025-04-30""","""2025-05-01""","""2503.01713.json"""


In [159]:
# Undo the string encoding column by column. Columns that are missing from
# `pivoted_df` are skipped.
# NOTE(review): a list that was empty before joining probably comes back as
# [""] after the split, not [] -- confirm against the data.
restored_df = pivoted_df.with_columns(
    # Split the delimiter-joined strings back into lists.
    *(
        pl.col(list_col).str.split(SAFE_DELIMITER)
        for list_col in list_cols_to_revert
        if list_col in pivoted_df.columns
    ),
    # Parse the numeric strings back into Float64.
    *(
        pl.col(float_col).cast(pl.Float64)
        for float_col in float_cols_to_revert
        if float_col in pivoted_df.columns
    ),
)
restored_df.schema

Schema([('id', String),
        ('title', String),
        ('authors', List(String)),
        ('institution', List(String)),
        ('problem_background', String),
        ('method', String),
        ('experiment', String),
        ('one_sentence_summary', String),
        ('slug', String),
        ('keywords', List(String)),
        ('further_thoughts', String),
        ('model', String),
        ('temperature', Float64),
        ('top_p', Float64),
        ('lang', String),
        ('preference', String),
        ('summary_time', String),
        ('score', Float64),
        ('abstract', String),
        ('categories', List(String)),
        ('created', String),
        ('updated', String),
        ('source_file', String)])

In [160]:
# Round-trip sanity check: the restored frame must list the same ids, in the
# same row order, as the source frame.
# `.lazy()` before `.collect()` makes this work whether `wide_df` is currently
# a DataFrame or a LazyFrame, so the cell does not depend on an earlier cell
# having rebound the name.
# Only the `id` column is compared here; the other columns are not checked.
id_matches = wide_df.lazy().select("id").collect() == restored_df.select("id")
# The displayed frame is truncated, so assert over all rows explicitly.
assert id_matches["id"].all(), "id order changed during the unpivot/pivot round trip"
id_matches

id
bool
true
true
true
true
true
…
true
true
true
true
