In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import base64
import time
import random

In [2]:
GITHUB_TOKEN = 'git_access_token'
TARGET_REPO="microsoft/autogen"

In [3]:
import re


def get_github_api_data(url):
    """Make GitHub API request with authentication"""
    headers = {
        'Authorization': f'token {GITHUB_TOKEN}',
        'Accept': 'application/vnd.github.v3+json'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json()
    print(f"API Error: {response.status_code} for URL: {url}")
    return None

def clean_text(text):
    """for CSV storage"""
    if not text:
        return None
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = ' '.join(text.split())
    return text

def get_readme(owner, repo):
    """Get repository README content"""
    readme_url = f"https://api.github.com/repos/{owner}/{repo}/readme"
    readme_data = get_github_api_data(readme_url)
    if readme_data and 'content' in readme_data:
        try:
            content = base64.b64decode(readme_data['content']).decode('utf-8')
            return clean_text(content)
        except:
            return None
    return None

def get_user_details(username):
    """Get detailed user information"""
    user_data = get_github_api_data(f"https://api.github.com/users/{username}")
    if user_data:
        return {
            'name': user_data.get('name'),
            'email': user_data.get('email'),
            'blog': user_data.get('blog'),
            'twitter_username': user_data.get('twitter_username'),
            'company': user_data.get('company'),
            'location': user_data.get('location'),
            'bio': clean_text(user_data.get('bio')),
            'public_repos': user_data.get('public_repos'),
            'followers': user_data.get('followers'),
            'following': user_data.get('following'),
            'created_at': user_data.get('created_at')
        }
    return None

In [4]:
def get_initial_dependents(repo):
    """Get list of all dependent repositories with stars"""
    url = f'https://github.com/{repo}/network/dependents'
    nextExists = True
    result = []
    page = 1
    
    while nextExists:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        print(f"Processing page {page}...")
        
        for box in soup.findAll("div", {"class": "Box-row"}):
            try:
                owner = box.find('a', {"data-repository-hovercards-enabled": ""}).text.strip()
                repo_name = box.find('a', {"data-hovercard-type": "repository"}).text.strip()
                
                stars_element = box.find("svg", {"class": "octicon-star"})
                stars = 0
                if stars_element and stars_element.parent:
                    stars_text = stars_element.parent.text.strip().replace(',', '')
                    try:
                        stars = int(stars_text)
                    except:
                        pass
                        
                result.append((owner, repo_name, stars))
                print(f"Found: {owner}/{repo_name} ({stars} stars)")
                
            except Exception as e:
                print(f"Error processing repository: {str(e)}")
                continue
        
        nextExists = False
        paginate_container = soup.find("div", {"class": "paginate-container"})
        if paginate_container:
            for u in paginate_container.findAll('a'):
                if u.text == "Next":
                    nextExists = True
                    url = u["href"]
                    if not url.startswith('http'):
                        url = f"https://github.com{url}"
                    page += 1
                    time.sleep(1) 
                    break
    
    print(f"\nTotal dependents found: {len(result)}")
    return result

In [5]:
dependents_list = get_initial_dependents(TARGET_REPO)
print(f"Found {len(dependents_list)} dependents to analyze")

Collecting top 10 dependents for microsoft/autogen...
Processing page 1...
Found: Zabiullahkhan/Chat_Bot_RIVA (0 stars)
Found: linmou/AgentSims2 (0 stars)
Found: sleek-aryan/Data-Analysis-Team-Autogen- (0 stars)
Found: mondweep/GenAI_CyberSecurity (0 stars)
Found: JapiKredi/My_first_agents-main (0 stars)
Found: lucasraniere/autogen-test (0 stars)
Found: manshikakhera/ChainGenie-Chainlit-Chatbot-with-Autogen (0 stars)
Found: kagHarsh/Agentic-AI-Travel-Planner (0 stars)
Found: Drlordbasil/autogen_only_flow (0 stars)
Found: afnanenayet/regulAItor (2 stars)
Found: varunraghav/Asset_Analysis_AI_Agents (0 stars)
Found: microsoft/ACV (8 stars)
Found: tosin2013/persona-driven-prompt-generator (0 stars)
Found: carsonbring/vid-traffic-sim (0 stars)
Found: ddiddi/try-streamlit (0 stars)
Found: minhnhat2001vt/Code-Validating-Multi-Agent-LLMs-with-Autogen (1 stars)
Found: IsmailKonak/aisuite-chatbotui (2 stars)
Found: daniel-dqsdatalabs/multi-agent-postgres-data-analytics (0 stars)
Found: sanggi-wj

In [6]:
all_data = []
for idx, (owner, repo, stars) in enumerate(dependents_list, 1):
    
    owner_info = get_user_details(owner)
    repo_info = get_github_api_data(f"https://api.github.com/repos/{owner}/{repo}")
    readme = get_readme(owner, repo)
    
    dependent_info = {
        'dependent_owner': owner,
        'dependent_repo': repo,
        'dependent_stars': stars,
        'readme_text': readme,
        'owner_name': owner_info.get('name') if owner_info else None,
        'owner_email': owner_info.get('email') if owner_info else None,
        'owner_blog': owner_info.get('blog') if owner_info else None,
        'owner_twitter': owner_info.get('twitter_username') if owner_info else None,
        'owner_company': owner_info.get('company') if owner_info else None,
        'owner_location': owner_info.get('location') if owner_info else None,
        'owner_bio': owner_info.get('bio') if owner_info else None,
        'owner_public_repos': owner_info.get('public_repos') if owner_info else None,
        'owner_followers': owner_info.get('followers') if owner_info else None,
        'owner_following': owner_info.get('following') if owner_info else None,
        'owner_created_at': owner_info.get('created_at') if owner_info else None,
        'repo_description': clean_text(repo_info.get('description')) if repo_info else None,
        'repo_language': repo_info.get('language') if repo_info else None,
        'repo_created_at': repo_info.get('created_at') if repo_info else None,
        'repo_updated_at': repo_info.get('updated_at') if repo_info else None,
        'repo_homepage': repo_info.get('homepage') if repo_info else None,
        'repo_forks': repo_info.get('forks_count') if repo_info else None,
        'repo_watchers': repo_info.get('watchers_count') if repo_info else None,
        'repo_open_issues': repo_info.get('open_issues_count') if repo_info else None
    }
    
    all_data.append(dependent_info)
    time.sleep(1)


Processing 1/10: Zabiullahkhan/Chat_Bot_RIVA

Processing 2/10: linmou/AgentSims2

Processing 3/10: sleek-aryan/Data-Analysis-Team-Autogen-

Processing 4/10: mondweep/GenAI_CyberSecurity
API Error: 404 for URL: https://api.github.com/repos/mondweep/GenAI_CyberSecurity/readme

Processing 5/10: JapiKredi/My_first_agents-main

Processing 6/10: lucasraniere/autogen-test

Processing 7/10: manshikakhera/ChainGenie-Chainlit-Chatbot-with-Autogen

Processing 8/10: kagHarsh/Agentic-AI-Travel-Planner

Processing 9/10: Drlordbasil/autogen_only_flow

Processing 10/10: afnanenayet/regulAItor

Processing 11/10: varunraghav/Asset_Analysis_AI_Agents

Processing 12/10: microsoft/ACV

Processing 13/10: tosin2013/persona-driven-prompt-generator

Processing 14/10: carsonbring/vid-traffic-sim

Processing 15/10: ddiddi/try-streamlit
API Error: 404 for URL: https://api.github.com/repos/ddiddi/try-streamlit/readme

Processing 16/10: minhnhat2001vt/Code-Validating-Multi-Agent-LLMs-with-Autogen

Processing 17/10

In [53]:
print(all_data)

[{'dependent_owner': 'Rizzi66', 'dependent_repo': 'ToDoListBackend', 'dependent_stars': 0, 'readme_text': "# Type ## Installation ```bash npm install npm run dev # Démarrage en mode développement npm run build # Compilation TypeScript npm start # Démarrage en production ``` ## Variables d'Environnement - `PORT`: Port sur lequel le serveur écoute (défaut: 3000) - `NODE_ENV`: Environnement d'exécution Voici les commandes `curl` pour tester les deux API. --- ### **API avec Interface (Port 3000)** 1. **Obtenir tous les utilisateurs** ```bash curl -X GET http://localhost:3001/users ``` 2. **Obtenir un utilisateur par ID (par exemple, ID = 1)** ```bash curl -X GET http://localhost:3001/users/1 ``` 3. **Obtenir un utilisateur par ID non existant (par exemple, ID = 99)** ```bash curl -X GET http://localhost:3001/users/99 ``` 4. **Obtenir la liste des administrateurs** ```bash curl -X GET http://localhost:3001/admin-users ``` --- ### **API avec Type (Port 3001)** 1. **Obtenir tous les utilisate

In [7]:
df = pd.DataFrame(all_data)

print("\nCollected Repositories:")
summary_columns = ['dependent_owner', 'dependent_repo', 'dependent_stars', 
                  'owner_email', 'owner_twitter', 'repo_language']
display(df)

csv_filename = f"{TARGET_REPO.replace('/', '_')}dependents_final.csv"
df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
print(f"\nResults saved to {csv_filename}")


Collected Repositories:

Results saved to microsoft_autogendependents_final.csv
