In [3]:
import pandas as pd

# Read the source CSV.
# NOTE(review): without header=None the first CSV row becomes the column
# labels, and those labels are discarded below by ignore_index=True, so that
# row never reaches the output. A later cell reads the same file with
# header=None -- confirm which behaviour is intended.
df = pd.read_csv('vega-lite_examples.csv', encoding='iso-8859-1')
# Number of rows in the original table.
H = len(df)

# Number of columns in the original table (columns are processed two at a time).
N = len(df.columns)

# Transpose each group of two columns. The pieces are collected in a list and
# concatenated once, instead of re-concatenating a growing frame on every
# iteration (which copies all previous rows each time).
transposed_pairs = [df.iloc[:, i:i + 2].transpose() for i in range(0, N, 2)]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# input has no columns.
if transposed_pairs:
    new_df = pd.concat(transposed_pairs, ignore_index=True)
else:
    new_df = pd.DataFrame()

# Save to a new CSV file.
# NOTE: a later cell writes to this same path, so this output is overwritten
# when that cell runs.
new_df.to_csv('transformed_vega-lite_examples.csv', index=False)

In [5]:
import pandas as pd

# Read the source CSV; header=None keeps the first row as data.
input_file = 'vega-lite_examples.csv'
df = pd.read_csv(input_file, header=None, encoding='iso-8859-1')

# Dimensions of the raw table. Columns are consumed in (name, url) pairs, so
# with an odd column count the last column is ignored by the integer division.
H, num_columns = df.shape
N = num_columns // 2

# Build one (name, url) frame per column pair. The frames are gathered in a
# list and concatenated once, rather than concatenating onto a growing frame
# inside the loop (quadratic copying, and concatenating with an empty frame).
pair_frames = []
for i in range(N):
    pair_df = pd.DataFrame({
        'name': df.iloc[:, 2 * i],
        'url': df.iloc[:, 2 * i + 1],
    })
    # Drop rows where either field is missing.
    pair_frames.append(pair_df.dropna(subset=['name', 'url']))

# pd.concat raises on an empty list; keep the original result (an empty frame
# with the two expected columns) when there are no column pairs.
if pair_frames:
    expanded_df = pd.concat(pair_frames, ignore_index=True)
else:
    expanded_df = pd.DataFrame(columns=['name', 'url'])

# Save the result as a new CSV file.
output_file = 'transformed_vega-lite_examples.csv'
expanded_df.to_csv(output_file, index=False)

print(f"转换完成。结果已保存到 {output_file}")

转换完成。结果已保存到 transformed_vega-lite_examples.csv


In [1]:
import html
import json
import os
import re

import pandas as pd
import requests
from tqdm import tqdm

# Input CSV with `name` / `url` columns, and the JSON file that accumulates
# the scraped results.
input_file = 'transformed_vega-lite_examples.csv'  # replace with the path to your CSV file
output_file = 'examples.json'
df = pd.read_csv(input_file)

# Captures the `content` attribute of the page's <meta name="description"> tag.
description_pattern = re.compile(
    r'<meta\s+name="description"\s+content="([^"]*)"\s*/>',
    flags=re.IGNORECASE,
)

# Captures the text inside <pre class="example-spec"><code class="language-json">.
code_pattern = re.compile(
    r'<pre\s+class="example-spec"><code\s+class="language-json">([^<]*)</code></pre>',
    flags=re.IGNORECASE | re.DOTALL,
)

def load_existing_data(file_path):
    """Load previously saved results from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list built from the decoded JSON value. An empty list is returned
        when the file does not exist or is zero bytes long (for example after
        a write that was interrupted right after the file was truncated).

    Raises:
        json.JSONDecodeError: If the file is non-empty but not valid JSON.
    """
    # A missing file and a zero-byte file both mean "nothing saved yet";
    # json.load would raise on the latter.
    if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
        return []
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return list(data)

# Resume from whatever an earlier run already saved.
results = load_existing_data(output_file)

# Skip rows whose name has already been scraped; a set keeps the membership
# test cheap.
scraped_names = {item['name'] for item in results}
df_filtered = df[~df['name'].isin(scraped_names)]
print(f"已有{len(results)}条，待爬{len(df_filtered)}条")

# Visit every remaining row. `total` is passed explicitly because iterrows()
# is a generator with no length, so tqdm cannot otherwise show a progress bar.
for _, row in tqdm(df_filtered.iterrows(), total=len(df_filtered)):
    name = row['name']
    url = row['url']

    try:
        # Fetch the page; raise on HTTP error statuses.
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        html_content = response.text

        # Extract the description. The regex captures raw HTML, so character
        # references (&amp;, &#39;, ...) are decoded to the text they stand for.
        description_match = description_pattern.search(html_content)
        description = html.unescape(description_match.group(1)) if description_match else ""

        # Extract the example spec. Text inside <code> has `<`, `>` and `&`
        # written as entities in the page source; decode them so the stored
        # spec is the actual JSON text.
        code_match = code_pattern.search(html_content)
        code = html.unescape(code_match.group(1).strip()) if code_match else ""

        # Record the extracted fields.
        results.append({
            'name': name,
            'description': description,
            'code': code
        })
        print(f"name:{name}\n\ndescription:{description}\n\ncode:{code}\n\n\n")

        # Checkpoint after every page. Write to a temporary file and swap it
        # into place so an interruption mid-write cannot leave a truncated
        # results file behind.
        tmp_output_file = f"{output_file}.tmp"
        with open(tmp_output_file, 'w') as f:
            json.dump(results, f, indent=4)
        os.replace(tmp_output_file, output_file)

    except Exception as e:
        # Best-effort scraping: report the failure and continue with the next
        # URL; the row stays unscraped and is retried on the next run.
        print(f"Error processing URL {url}: {e}")


已有188条，待爬1条


1it [00:03,  3.98s/it]

name:Connected Scatterplot (Lines with Custom Paths)

description:A connected scatterplot can be created by customizing line order and adding point overlay in the line mark definition.

code:{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "data": {"url": "data/driving.json"},
  "mark": {"type": "line", "point": true},
  "encoding": {
    "x": {
      "field": "miles", "type": "quantitative",
      "scale": {"zero": false}
    },
    "y": {
      "field": "gas", "type": "quantitative",
      "scale": {"zero": false}
    },
    "order": {"field": "year"}
  }
}






