# Scraping GitHub

### Importation des bibliothèques

In [28]:
from dotenv import load_dotenv
import os
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import concurrent.futures
from datetime import datetime, timedelta
from tqdm import tqdm
import csv

In [31]:
# Chargement des variables d'environnement depuis le fichier .env
load_dotenv('.env')

# URL de base de l'API GitHub
base_url = 'https://api.github.com'

# Nom d'utilisateur GitHub
username = os.getenv('USERNAME')

# Token d'accès GitHub
access_token = os.getenv('ACCESS_TOKEN')

# Nombre de dépôts par jour
repositories_per_day = 500

# Dates de début et de fin du scraping
start_date = datetime(2023, 1, 1).date()
end_date = datetime(2023, 6, 15).date()

# Nom du fichier de sortie CSV
output_file = 'repositories.csv'

# Calcul du nombre de jours à scraper
days_to_scrape = (end_date - start_date).days + 1

# Liste des dépôts
repositories = []

# Mécanisme de reprise pour les requêtes API
retry_strategy = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)

# Initialisation de la barre de progression
progress = tqdm(total=repositories_per_day * days_to_scrape, unit='repo')

# Fonction pour récupérer les informations des dépôts
def fetch_repository_info(item):
    repository_url = item['url']
    headers = {'Authorization': f'token {access_token}'} if access_token else {}
    response = http.get(repository_url, headers=headers)
    details_data = response.json()

    issues_count = details_data.get('open_issues_count', 0)

    return issues_count

# Scraping des dépôts
for day in range(days_to_scrape):
    current_date = start_date + timedelta(days=day)
    formatted_date = current_date.strftime('%Y-%m-%d')

    # Récupération des dépôts en utilisant la pagination
    page = 1
    while len(repositories) < repositories_per_day * (day + 1):
        # Création de l'URL de l'API pour récupérer les dépôts créés le jour courant et la page spécifique
        url = f'{base_url}/search/repositories?q=created:{formatted_date}&sort=stars&order=desc&per_page=100&page={page}'

        # Exécution de la requête API avec la logique de reprise
        headers = {'Authorization': f'token {access_token}'} if access_token else {}
        response = http.get(url, headers=headers)
        data = response.json()

        if 'items' in data:
            # Récupération des informations supplémentaires des dépôts
            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = [executor.submit(fetch_repository_info, item) for item in data['items']]
                results = [future.result() for future in futures]

            for item, issues_count in zip(data['items'], results):
                repository = {
                    'full_name': item['full_name'],
                    'topics': item.get('topics', []),
                    'stars': item['stargazers_count'],
                    'forks': item['forks'],
                    'description': item['description'],
                    'language': item.get('language', ''),
                    'creation_date': item['created_at'],
                    'last_updated': item['updated_at'],
                    'url': item['html_url'],
                    'issues': issues_count
                }

                repositories.append(repository)
                progress.update(1)

        page += 1

        if 'next' not in response.links:
            break

# Fermeture de la barre de progression
progress.close()

# Écriture des données des dépôts dans un fichier CSV
with open(output_file, 'w', newline='', encoding='utf-8') as file:
    fieldnames = ['full_name', 'topics', 'stars', 'forks', 'description', 'language', 'creation_date', 'last_updated', 'url', 'issues']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(repositories)

# Nombre de dépôts récupérés
num_repos_scraped = len(repositories)
print(f'Nombre de dépôts récupérés : {num_repos_scraped}')

print('Scraping des dépôts terminé avec succès !')


83096repo [1:06:51, 20.72repo/s]                                                                                       


Nombre de dépôts récupérés : 83096
Scraping des dépôts terminé avec succès !


In [34]:
import pandas as pd
data = pd.read_csv('repositories.csv')
data

Unnamed: 0,full_name,topics,stars,forks,description,language,creation_date,last_updated,url,issues
0,sumn2u/learn-javascript,"['beginner', 'beginner-friendly', 'book', 'col...",619,29,A book that teaches JavaScript,HTML,2023-01-01T15:16:26Z,2023-06-16T03:16:58Z,https://github.com/sumn2u/learn-javascript,0
1,bianchenglequ/NetCodeTop,"['csharp', 'dotnet', 'net', 'netcore']",567,106,收集GitHub上有关.Net、.NetCore有趣、有用、热门的开源项目。,,2023-01-01T16:52:06Z,2023-06-16T04:21:24Z,https://github.com/bianchenglequ/NetCodeTop,0
2,rupali-codes/LinksHub,"['beginner-friendly', 'developers', 'links', '...",343,270,LinksHub aims to provide developers with acces...,TypeScript,2023-01-01T18:55:44Z,2023-06-16T15:30:32Z,https://github.com/rupali-codes/LinksHub,0
3,LondheShubham153/90DaysOfDevOps,"['devops', 'devops-tools', 'docker', 'grafana'...",336,2173,This repository is a Challenge for the DevOps ...,Python,2023-01-01T11:41:21Z,2023-06-13T09:47:39Z,https://github.com/LondheShubham153/90DaysOfDe...,0
4,jahidulislamzim/JavaScriptCodingChallenges,"['coding', 'coding-challenge', 'coding-challen...",221,40,Hello JavaScript code newbie! In this reposito...,,2023-01-01T14:04:36Z,2023-05-28T14:24:58Z,https://github.com/jahidulislamzim/JavaScriptC...,0
...,...,...,...,...,...,...,...,...,...,...
83091,jdev1022/laravel-rest-autotask,[],4,0,,Blade,2023-06-15T02:34:57Z,2023-06-15T06:08:00Z,https://github.com/jdev1022/laravel-rest-autotask,0
83092,wuxiaofei883/wuxiaofei,[],4,0,,,2023-06-15T02:55:49Z,2023-06-15T03:03:16Z,https://github.com/wuxiaofei883/wuxiaofei,0
83093,Sandy857/Hi-Matee,[],4,0,,,2023-06-15T07:56:57Z,2023-06-15T11:07:37Z,https://github.com/Sandy857/Hi-Matee,0
83094,Archi69/Archiman2,[],4,0,Archiman2 is a designer in the world,,2023-06-15T19:07:22Z,2023-06-15T19:08:34Z,https://github.com/Archi69/Archiman2,0


In [37]:
data.shape

(83096, 10)

In [38]:
data.duplicated().sum()

130

In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83096 entries, 0 to 83095
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   full_name      83096 non-null  object
 1   topics         83096 non-null  object
 2   stars          83096 non-null  int64 
 3   forks          83096 non-null  int64 
 4   description    61801 non-null  object
 5   language       71467 non-null  object
 6   creation_date  83096 non-null  object
 7   last_updated   83096 non-null  object
 8   url            83096 non-null  object
 9   issues         83096 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 6.3+ MB
