Data Collect

In [1]:
import requests
import json
import time
from datetime import datetime

# =========================
# CONFIGURATION
# =========================

TOKEN = "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"
URL_GRAPHQL = "https://api.github.com/graphql"

HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Content-Type": "application/json"
}

OUTPUT_FILE = "github_data_raw.jsonl"

# =========================
# GRAPHQL QUERY
# =========================

QUERY = """
query ($queryString: String!, $cursor: String) {
  search(query: $queryString, type: REPOSITORY, first: 50, after: $cursor) {
    pageInfo {
      hasNextPage
      endCursor
    }
    edges {
      node {
        ... on Repository {
          databaseId
          nameWithOwner
          url
          description
          stargazerCount
          forkCount
          pushedAt
          languages(first: 5, orderBy: {field: SIZE, direction: DESC}) {
            nodes { name }
          }
          repositoryTopics(first: 10) {
            nodes { topic { name } }
          }
          object(expression: "HEAD:README.md") {
            ... on Blob { text }
          }
        }
      }
    }
  }
}
"""

# =========================
# SAFE GRAPHQL REQUEST
# =========================

def safe_graphql_request(query, variables, retries=3, sleep=5):
    for attempt in range(retries):
        try:
            response = requests.post(
                URL_GRAPHQL,
                json={"query": query, "variables": variables},
                headers=HEADERS,
                timeout=15
            )

            if response.status_code != 200:
                raise Exception(response.text)

            data = response.json()

            if "errors" in data:
                raise Exception(data["errors"])

            return data["data"]

        except Exception as e:
            print(f"⚠️ Erreur GraphQL ({attempt+1}/{retries}) : {e}")
            time.sleep(sleep)

    return None

# =========================
# SAVE REPO (JSONL)
# =========================

def save_repo(repo):
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(repo, ensure_ascii=False) + "\n")

# =========================
# FETCH REPOSITORIES
# =========================

def fetch_repos(query_str, domain, limit=500):
    repos_count = 0
    cursor = None

    while repos_count < limit:
        variables = {
            "queryString": query_str,
            "cursor": cursor
        }

        data = safe_graphql_request(QUERY, variables)

        if data is None:
            print("❌ Abandon de ce domaine (trop d’erreurs)")
            break

        search = data["search"]

        for edge in search["edges"]:
            repo = edge["node"]
            repo["collected_at"] = datetime.utcnow().isoformat()
            repo["target_domain"] = domain  # ✅ Assignation correcte ici
            save_repo(repo)
            repos_count += 1

            if repos_count >= limit:
                break

        print(f"Collectés : {repos_count}...")

        if not search["pageInfo"]["hasNextPage"]:
            break

        cursor = search["pageInfo"]["endCursor"]
        time.sleep(1.2)  # rate limit safe

    return repos_count

# =========================
# DOMAIN COLLECTION
# =========================

domains = [
    "machine-learning",
    "data-engineering",
    "deep-learning",
    "devops",
    "cloud",
    "blockchain",
    "cybersecurity",
    "natural-language-processing",
    "computer-vision",
    "internet-of-things",
    "automation",
    "business-intelligence",
    "big-data",
    "recommendation-systems",
    "fraud-detection",
    "game-development",
    "scientific-computing",
    "web-development",
    "mobile-development",
    "iot-smart-devices"
]

def collect_by_domains(domain_list, limit_per_domain=500):
    total = 0

    for domain in domain_list:
        print(f"\n--- Collecte du domaine : {domain} ---")

        query_string = (
            f"{domain} in:name,description,readme "
            f"fork:false stars:>20 "
            f"-library -framework -boilerplate"
        )

        collected = fetch_repos(query_string, domain, limit=limit_per_domain)
        total += collected

    print(f"\n✅ Collecte terminée : {total} repositories")

# =========================
# MAIN
# =========================

if __name__ == "__main__":
    collect_by_domains(domains, limit_per_domain=750)



--- Collecte du domaine : machine-learning ---


  repo["collected_at"] = datetime.utcnow().isoformat()


Collectés : 50...
Collectés : 100...
Collectés : 150...
Collectés : 200...
Collectés : 250...
Collectés : 300...
Collectés : 350...
Collectés : 400...
Collectés : 450...
Collectés : 455...

--- Collecte du domaine : data-engineering ---
Collectés : 50...
Collectés : 88...

--- Collecte du domaine : deep-learning ---
Collectés : 50...
Collectés : 100...
Collectés : 150...
Collectés : 200...
Collectés : 250...
Collectés : 300...
Collectés : 301...

--- Collecte du domaine : devops ---
Collectés : 50...
Collectés : 100...
Collectés : 150...
Collectés : 200...
Collectés : 250...
Collectés : 300...
Collectés : 325...

--- Collecte du domaine : cloud ---
Collectés : 50...
Collectés : 100...
⚠️ Erreur GraphQL (1/3) : ('Connection broken: IncompleteRead(1501 bytes read, 8739 more expected)', IncompleteRead(1501 bytes read, 8739 more expected))
Collectés : 150...
Collectés : 200...
Collectés : 250...
Collectés : 300...
Collectés : 350...
Collectés : 400...
Collectés : 450...
Collectés : 500...


Data Preprocessing

In [12]:
import pandas as pd
import numpy as np
import json
import re
from datetime import datetime, timezone

# =========================
# CONFIG
# =========================
INPUT_FILE = "github_data_raw.jsonl"  # your raw JSONL file
OUTPUT_FILE = "github_data_cleaned.jsonl"  # cleaned dataset

# =========================
# FUNCTION TO CLEAN TEXT
# =========================
def clean_text(text, max_len=3000):
    if not text:
        return ""
    text = text.lower()  # lowercase
    # Remove Markdown, badges, URLs, multiple spaces
    text = re.sub(r"#|\*|`|>|-|\!\[.*?\]\(.*?\)", " ", text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text[:max_len]

# =========================
# LOAD RAW DATA
# =========================
repos = []
with open(INPUT_FILE, encoding="utf-8") as f:
    for line in f:
        repos.append(json.loads(line))

df = pd.DataFrame(repos)
print(f"Raw repos count: {len(df)}")

# =========================
# DROP DUPLICATES
# =========================
df = df.drop_duplicates(subset=['databaseId'])
print(f"After deduplication: {len(df)}")

# =========================
# CLEAN DESCRIPTION AND README
# =========================
def get_readme_text(obj):
    if obj and 'text' in obj:
        return obj['text']
    return ""

df['description_clean'] = df['description'].apply(clean_text)
df['readme_clean'] = df['object'].apply(lambda x: clean_text(get_readme_text(x)))

# =========================
# EXTRACT LANGUAGES AND TOPICS
# =========================
df['languages_list'] = df['languages'].apply(
    lambda x: [l['name'] for l in x['nodes']] if x and 'nodes' in x and x['nodes'] else []
)
df['topics_list'] = df['repositoryTopics'].apply(
    lambda x: [t['topic']['name'] for t in x['nodes']] if x and 'nodes' in x and x['nodes'] else []
)

# =========================
# CREATE FULL_TEXT FOR EMBEDDINGS
# =========================
df['full_text'] = (
    df['description_clean'] + " " +
    df['readme_clean'] + " " +
    df['topics_list'].apply(lambda x: " ".join(x)) + " " +
    df['languages_list'].apply(lambda x: " ".join(x))
)

# Drop repos with empty full_text
df = df[df['full_text'].str.strip() != ""]
print(f"After removing repos without text: {len(df)}")

# =========================
# NUMERIC FEATURES
# =========================
df['log_stars'] = np.log1p(df['stargazerCount'])
df['fork_ratio'] = df['forkCount'] / (df['stargazerCount'] + 1)

df['pushedAt'] = pd.to_datetime(df['pushedAt'])
df['recency_days'] = (datetime.now(timezone.utc) - df['pushedAt']).dt.days

# =========================
# KEEP ONLY USEFUL COLUMNS
# =========================
final_cols = [
    'databaseId', 'nameWithOwner', 'url', 'target_domain',
    'full_text', 'languages_list', 'topics_list',
    'stargazerCount', 'forkCount', 'log_stars', 'fork_ratio', 'recency_days'
]
df_final = df[final_cols]

# =========================
# SAVE CLEANED DATA
# =========================
df_final.to_json(OUTPUT_FILE, orient='records', lines=True, force_ascii=False)
print(f"✅ Preprocessing finished, saved to: {OUTPUT_FILE}")


Raw repos count: 4462
After deduplication: 1350
After removing repos without text: 1349
✅ Preprocessing finished, saved to: github_data_cleaned.jsonl


Enrechissement

In [3]:
from keybert import KeyBERT
import pandas as pd
from sentence_transformers import SentenceTransformer
import swifter  # pip install swifter pour parallélisation

# --- 1️⃣ Détecter le GPU et charger le modèle dessus ---
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device utilisé :", device)

# Charger le modèle MiniLM sur GPU si dispo
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
kw_model = KeyBERT(model)

# --- 2️⃣ Fonction d'enrichissement ---
def enrich_repo_metadata(df):
    """
    Enrichit les colonnes tools_list et topics_list à partir de full_text
    et des informations existantes (languages_list et repositoryTopics)
    """

    # --- Construire sets uniques des langages et topics déjà présents ---
    all_languages = set()
    for langs in df['languages_list']:
        all_languages.update(langs)

    all_topics = set()
    for topics in df['topics_list']:
        all_topics.update(topics)

    # --- Fonction pour extraire tools automatiquement ---
    def extract_tools(text):
        tools = set()
        text_lower = text.lower()

        # Ajouter langages connus
        for lang in all_languages:
            if lang.lower() in text_lower:
                tools.add(lang)

        # Ajouter topics existants
        for topic in all_topics:
            if topic.lower() in text_lower:
                tools.add(topic)

        # Extraire keywords du texte (frameworks, libs, etc.)
        kws = kw_model.extract_keywords(text, keyphrase_ngram_range=(1,2), top_n=10)
        for kw in kws:
            tools.add(kw[0])
        return list(tools)

    # --- Fonction pour extraire topics automatiquement ---
    def extract_topics(text, top_n=10):
        if not text.strip():
            return []
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1,2),
            stop_words='english',
            top_n=top_n
        )
        return [kw[0] for kw in keywords]

    # --- 3️⃣ Appliquer sur le dataframe avec parallélisation swifter ---
    df['tools_list'] = df['full_text'].swifter.apply(extract_tools)
    df['topics_list'] = df['full_text'].swifter.apply(lambda x: extract_topics(x, top_n=10))

    return df

# --- 4️⃣ Charger le fichier JSON, enrichir et sauvegarder ---
df = pd.read_json("github_data_cleaned.jsonl", lines=True)
df = enrich_repo_metadata(df)
df.to_json("github_data_enriched.jsonl", orient='records', lines=True, force_ascii=False)

print("✅ Enrichissement terminé sur GPU si disponible")


Device utilisé : cuda


Pandas Apply:   0%|          | 0/1349 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1349 [00:00<?, ?it/s]

✅ Enrichissement terminé sur GPU si disponible


In [8]:
import pandas as pd

# Charger le fichier enrichi
df = pd.read_json("github_data_enriched.jsonl", lines=True)

# --- Stoplist : mots génériques à ignorer ---
stop_words = {
    'list', 'software', 'course', 'blog', 'request', 'new', 'it', 'professional',
    'free', 'books', 'learning', 'data', 'resource', 'awesome', 'curated', 'news'
}

# --- Fonction de nettoyage ---
def clean_tools(tools):
    if not tools:
        return []
    # 1️⃣ minuscules
    tools = [t.lower() for t in tools]
    # 2️⃣ supprimer doublons
    tools = list(set(tools))
    # 3️⃣ supprimer stopwords
    tools = [t for t in tools if t not in stop_words]
    # 4️⃣ tri alphabétique
    tools.sort()
    return tools

# --- Appliquer le nettoyage ---
df['tools_list'] = df['tools_list'].apply(clean_tools)

# --- Supprimer la colonne languages_list ---
if 'languages_list' in df.columns:
    df.drop(columns=['languages_list'], inplace=True)

# --- Sauvegarder le résultat ---
df.to_json("github_data_cleaned_tools.jsonl", orient='records', lines=True, force_ascii=False)

print("✅ Nettoyage terminé et languages_list supprimée")


✅ Nettoyage terminé et languages_list supprimée


In [10]:
import pandas as pd

# Charger le fichier enrichi
df = pd.read_json("github_data_cleaned_tools.jsonl", lines=True)
# afficher le premier enregistrement
print(df.iloc[0])

databaseId                                                 21872392
nameWithOwner                 josephmisiti/awesome-machine-learning
url               https://github.com/josephmisiti/awesome-machin...
target_domain                                      machine-learning
full_text         a curated list of awesome machine learning fra...
topics_list       [learning curated, learning frameworks, learni...
stargazerCount                                                71548
forkCount                                                     15288
log_stars                                                 11.178138
fork_ratio                                                 0.213672
recency_days                                                      2
tools_list        [ai, ast, awesome machine, blogs, book, c, c++...
Name: 0, dtype: object
