In [1]:
'''
Install libraries that will be crutial for web scraping
'''

!pip install requests --upgrade --quiet
!pip install beautifulsoup4 --upgrade --quiet

In [10]:
'''
Step 1: requests will make a request to the server.

Step 2: BeautifulSoup will be used to parse through the HTML response we get
        by using requests.
        
Step 3: After we have parsed the HTML file and collected the desired data in a
        dictionary, we will use the csv module to export the data to a .csv file.
'''

import requests
from bs4 import BeautifulSoup
import csv

In [2]:
def github_repo_scraper():
    
    '''
    Interactively scrape the top GitHub repositories of a topic and save them as a .csv file.

    Steps:
    1. Ask the user for a topic name.
    2. Fetch the topic page with fetch_page().
    3. Collect the tags that hold repository data with tags_parser().
    4. Convert every tag to a dictionary with repo_parser().
    5. Write the dictionaries to "<topic>_repo.csv" with write_repo_csv().

    Returns None; the outcome is reported with print().
    '''
    
    # Surrounding whitespace would otherwise end up in both the URL and the file name.
    topic = input("Please enter a topic :").strip()
    if not topic:
        print("No topic entered, nothing to scrape")
        return None
    
    # Pass the value the user typed, not the literal string 'topic'.
    doc = fetch_page(topic)
    
    doc_tags = tags_parser(doc)
    
    top_repositories = [repo_parser(tag) for tag in doc_tags]
    
    # Without any parsed repository there are no rows (and no headers) to write.
    if not top_repositories:
        print("No repositories found for topic", topic)
        return None
    
    write_repo_csv(top_repositories,topic)
    
    print("Top repositories saved as .csv in directory")
    return None

In [8]:
# All the functions called by github_repo_scraper()

def fetch_page(topic):
    
    '''
    fetch_page() downloads the GitHub topic page for the given topic name and
    returns it parsed as a BeautifulSoup document.

    topic: topic name that is appended to 'https://github.com/topics/'.

    Raises Exception when the response status code is not 200.
    requests may also raise its own exceptions (for example on a timeout).
    '''
    
    base_url = 'https://github.com/topics/'
    topic_url = base_url + topic
    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    response = requests.get(topic_url, timeout=30)
    if response.status_code != 200:
        # Build one message string; passing several arguments makes the
        # exception print as a tuple instead of a readable sentence.
        raise Exception("Not able to fetch page " + topic_url + " please check the input again")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed, and no warning is emitted.
    return BeautifulSoup(response.text, 'html.parser')

def tags_parser(doc):
    
    '''
    tags_parser() looks up every <article> element of the parsed page whose class
    attribute marks it as a repository card and returns the matches, ready to be
    iterated over.
    '''
    
    repo_card_classes = 'border rounded color-shadow-small color-bg-subtle my-4'
    repo_articles = doc.find_all('article', class_=repo_card_classes)
    return repo_articles

def repo_parser(tag):
    
    '''
    repo_parser() reads a single repository tag and returns a dictionary with the
    repository name, the owner's username, the star count and the repository url.
    '''
    
    # The <h3> heading holds two links: the owner first, the repository second.
    heading_links = tag.find('h3').find_all('a')
    
    owner = heading_links[0].text.strip()
    
    project_link = heading_links[1]
    project = project_link.text.strip()
    
    # The href is relative to the site root, so prefix it with the host.
    project_url = 'https://github.com' + project_link.attrs['href'].strip()
    
    star_text = tag.find('span', id='repo-stars-counter-star').text.strip()
    star_count = parse_star(star_text)
    
    repo_record = {
        'repository_name': project,
        'owner_username': owner,
        'stars': star_count,
        'repository_url': project_url,
    }
    return repo_record

def parse_star(star_string):
    
    '''
    parse_star() takes the string value of a star counter as input and returns
    the star count as an integer.

    Accepted forms: plain integers ("123"), integers with thousands separators
    ("1,234") and abbreviated counts with a "k" (thousand) or "m" (million)
    suffix in either case ("1.2k", "3K").

    Raises ValueError when the string is empty or is not a number.
    '''
    
    # Normalise first: trim whitespace, drop thousands separators, ignore suffix case.
    cleaned = star_string.strip().replace(',', '').lower()
    if not cleaned:
        raise ValueError("star count string is empty")
    
    multipliers = {'k': 1000, 'm': 1000000}
    multiplier = multipliers.get(cleaned[-1])
    if multiplier is None:
        return int(cleaned)
    
    # round() rather than a bare int(): float multiplication can land just below
    # the intended value (1.001 * 1000 == 1000.9999999999999) and int() truncates.
    return int(round(float(cleaned[:-1]) * multiplier))
    
def write_repo_csv(dic_output,topic): 
    
    '''
    Using the csv module, write_repo_csv() writes the list of repository
    dictionaries to the file "<topic>_repo.csv" in the current working directory.

    dic_output: non-empty list of dictionaries that all share the same keys;
                the keys of the first dictionary become the column headers.
    topic:      topic name, used as the prefix of the file name.

    Raises ValueError when dic_output is empty, because the headers cannot be derived.
    '''
    
    if not dic_output:
        raise ValueError("No repositories to write for topic " + topic)
    
    # Take the headers from the first row so a single-row result works as well.
    headers = list(dic_output[0].keys())
    file_name = topic+"_repo.csv"
    # newline='' is what the csv module expects; without it every row is
    # followed by a blank line on Windows. The encoding is pinned so the
    # output does not depend on the platform default.
    with open(file_name,'w',newline='',encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile,fieldnames=headers)
        writer.writeheader()
        writer.writerows(dic_output)

In [11]:
# Run the scraper: it prompts for a topic and saves the result as "<topic>_repo.csv".
github_repo_scraper()

Please enter a topic :python
Top repositories saved as .csv in directory
