In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import os

# 保存先ディレクトリ
os.makedirs("logs", exist_ok=True)

# F-Droid の index.xml を取得
url = "https://f-droid.org/repo/index.xml"
print("Downloading:", url)
response = requests.get(url)
response.raise_for_status()

root = ET.fromstring(response.content)

data = []

for app in root.findall("application"):
    package_id = app.findtext("id")
    name = app.findtext("name")
    summary = app.findtext("summary")
    source = app.findtext("source")
    web = app.findtext("web")
    category = app.findtext("category")

    data.append({
        "package_id": package_id,
        "name": name,
        "summary": summary,
        "category": category,
        "source": source,
        "web": web
    })

df = pd.DataFrame(data).drop_duplicates(subset=["package_id"])

# CSV 出力
output_path = "logs/fdroid_all_apps.csv"
df.to_csv(output_path, index=False, encoding="utf-8")

print(f"抽出完了: {len(df)} 件")

Downloading: https://f-droid.org/repo/index.xml
抽出完了: 3938 件


In [7]:
# Join with F-Droid mapping (GitHub ↔ package_id)

import requests
import xml.etree.ElementTree as ET
import re
import pandas as pd

# F-Droid index.xml を取得
url = "https://f-droid.org/repo/index.xml"
response = requests.get(url)
root = ET.fromstring(response.content)

data = []

for app in root.findall("application"):
    package_id = app.find("id").text if app.find("id") is not None else None
    source = app.find("source").text if app.find("source") is not None else None

    if source and "github.com" in source:
        # GitHub の repo 部分を抽出 (例: https://github.com/signalapp/Signal-Android → signalapp/Signal-Android)
        match = re.search(r"github\.com/([^/]+/[^/]+)/?", source)
        if match:
            repo = match.group(1).strip()
            data.append({"repo": repo, "package_id": package_id})

df = pd.DataFrame(data).drop_duplicates()
df.to_csv("logs/repo_package_map.csv", index=False)

print("抽出完了:", len(df), "件")

抽出完了: 3121 件


In [8]:
# リポジトリ名の正規化
from tqdm import tqdm
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../../")))
import project_config as config


def resolve_repo(repo):
    """
    GitHubのリポジトリ名を正規化する。
    - 存在すればそのまま返す
    - 移転されていれば moved_to の full_name を返す
    """
    url = f"https://api.github.com/repos/{repo}"
    r = requests.get(url, headers={"Authorization": f"token {config.GITHUB_AUTH_TOKEN}"})
    
    if r.status_code == 200:
        data = r.json()
        if "moved_to" in data and data["moved_to"]:
            return data["moved_to"]["full_name"]
        else:
            return data["full_name"]  # 正規表現済みの名前を返す
    elif r.status_code == 404:
        return None  # 存在しない（削除済み or private）
    else:
        print(f"Error {r.status_code} for {repo}")
        return None

# マッピングデータを読み込み
repo_map = pd.read_csv("logs/repo_package_map.csv") 

normalized = []
for i, row in tqdm(repo_map.iterrows(), total=len(repo_map)):
    repo = row["repo"]
    fixed = resolve_repo(repo)
    normalized.append({"original": repo, "normalized": fixed})

normalized_df = pd.DataFrame(normalized).drop_duplicates()
normalized_df.to_csv("logs/normalized_repos.csv", index=False)
print("正規化完了:", len(normalized_df), "件")


100%|██████████| 3121/3121 [20:22<00:00,  2.55it/s]

正規化完了: 3001 件





In [9]:
import pandas as pd

normalized_df = pd.read_csv("logs/normalized_repos.csv")  # 正規化済みリポジトリ名
mapping = pd.read_csv("logs/repo_package_map.csv")  # F-Droidから抽出したマップ
seart = pd.read_csv("logs/seart_dataset.csv")  # SEART データ

merged_id = normalized_df.merge(mapping, left_on="original", right_on="repo", how="inner").drop(columns=["original"])
merged_seart = seart.merge(merged_id, left_on="name", right_on="normalized", how="inner").drop(columns=["normalized", "repo", "labels", "topics"])
merged_seart["app_store"] = "https://play.google.com/store/apps/details?id=" + merged_seart["package_id"] + "&hl=en"
print("Google Playと突合できた件数:", len(merged_seart))
merged_seart.to_csv("logs/seart_with_package.csv", index=False)

Google Playと突合できた件数: 62
