# Top Repositories for GitHub Projects

In [1]:
import requests
from bs4 import BeautifulSoup 
import pandas as pd

In [2]:
topics_url = 'https://github.com/topics'
response = requests.get(topics_url)
response.status_code

200

In [3]:
len(response.text)

170789

In [4]:
#Let's see how the content of page look like
page_content = response.text
page_content[:1000]

'\n\n<!DOCTYPE html>\n<html\n  lang="en"\n  \n  data-color-mode="auto" data-light-theme="light" data-dark-theme="dark"\n  data-a11y-animated-images="system" data-a11y-link-underlines="true"\n  >\n\n\n\n\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n  \n\n  <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/light-38f1bf52eeeb.css" /><link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/dark-56010aa53a8f.css" /><link data-color-theme="dark_dimmed" crossor

In [5]:
#Let's save the html content to see it 
with open('githubtopics.html', 'w', encoding='utf-8') as f:
    f.write(page_content)

In [6]:
doc = BeautifulSoup(page_content, 'html.parser')

In [7]:
type(doc)

bs4.BeautifulSoup

In [8]:
selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
topic_title_tags = doc.find_all('p', {'class': selection_class})
len(topic_title_tags)

30

In [9]:
topic_title_tags[:5]

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>]

In [10]:
desc_selector = 'f5 color-fg-muted mb-0 mt-1'
topic_desc_tags = doc.find_all('p', {'class': desc_selector})
len(topic_desc_tags)

30

In [11]:
topic_desc_tags[:5]

[<p class="f5 color-fg-muted mb-0 mt-1">
           3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Ajax is a technique for creating interactive web applications.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Algorithms are self-contained sequences that carry out a variety of tasks.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Amp is a non-blocking concurrency library for PHP.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Android is an operating system built by Google designed for mobile devices.
         </p>]

In [12]:
topic_title_tag0 = topic_title_tags[0]

In [13]:
topic_title_tag0.parent

<a class="no-underline flex-1 d-flex flex-column" href="/topics/3d">
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>
<p class="f5 color-fg-muted mb-0 mt-1">
          3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.
        </p>
</a>

In [14]:
topic_link_tags = doc.find_all('a', {'class': 'no-underline flex-grow-0'})
len(topic_link_tags)

30

In [15]:
#Let's check that we are getting correct result 

topic0_url = "https://github.com" + topic_link_tags[0]['href']
print(topic0_url)

https://github.com/topics/3d


In [16]:
# Pull the visible text out of every topic-title <p> tag, in page order.
topic_titles = [title_tag.text for title_tag in topic_title_tags]

print(topic_titles)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']


In [17]:
# The description text is padded with whitespace in the markup, so strip each one.
topic_descriptions = [desc_tag.text.strip() for desc_tag in topic_desc_tags]

topic_descriptions[:5]

['3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency library for PHP.',
 'Android is an operating system built by Google designed for mobile devices.']

In [18]:
# The hrefs on the topics page are site-relative (e.g. /topics/3d), so prefix the GitHub origin.
base_url = "https://github.com"
topic_urls = [base_url + link_tag['href'] for link_tag in topic_link_tags]

topic_urls[:5]

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android']

In [19]:
# Combine the three parallel lists into one table with a row per topic.
topics_dict = dict(
    topic_title=topic_titles,
    description=topic_descriptions,
    url=topic_urls,
)

topics_df = pd.DataFrame(topics_dict)
topics_df

Unnamed: 0,topic_title,description,url
0,3D,3D refers to the use of three-dimensional grap...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source platform for buildin...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [20]:
#Let's create  a csv file 

topics_df.to_csv('github_topics.csv', index=False)

## Getting information out of a topic page

In [21]:
topic_page_url = topic_urls[0]
topic_page_url

'https://github.com/topics/3d'

In [22]:
response = requests.get(topic_page_url)
response

<Response [200]>

In [23]:
len(response.text)

488728

In [24]:
topic_doc = BeautifulSoup(response.text, 'html.parser')

In [25]:
h3_class = 'f3 color-fg-muted text-normal lh-condensed'
repo_tags = topic_doc.find_all('h3', {'class': h3_class})
repo_tags

[<h3 class="f3 color-fg-muted text-normal lh-condensed">
 <a class="Link" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="c72fbd5c69a8ee7c9c53a4e65de2b93c8fc7552dd793945819639bc165c0f0ba" data-turbo="false" data-view-component="true" href="/mrdoob">
             mrdoob
 </a>          /
           <a class="Link text-bold wb-break-word" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4a2667db3d63a1739c412e059e5da95afe419df83f70949b5d59dc3478f5c79a" data-turbo="false" data-view-component="true"

In [26]:
a_tags = repo_tags[0].find_all('a')
print(a_tags[0].text.strip())              #Giving us user name
print(a_tags[1].text.strip())              #Giving us repository name for that user

mrdoob
three.js


In [27]:
# The second <a> in the heading links to the repository; prefix its site-relative href
# with base_url to get the full repository URL.
# NOTE(review): despite the plural name, repo_urls holds a single URL string here.
repo_urls = base_url + a_tags[1]['href']
repo_urls

'https://github.com/mrdoob/three.js'

In [28]:
star_tags = topic_doc.find_all('span', {'class': 'Counter js-social-count'})

In [29]:
star_tags[0].text.strip()

'96.6k'

In [30]:
# Convert the abbreviated star count shown on the page (e.g. '96.6k') to an integer.
def parse_star_count(stars_str):
    """Convert a GitHub star-count label into an integer.

    Accepts plain integers ('512'), thousands separators ('1,234') and the
    abbreviated forms with a 'k' (thousand) or 'm' (million) suffix in either
    case ('96.6k', '1.2M'). Surrounding whitespace is ignored.

    Args:
        stars_str: The text of the star counter.

    Returns:
        The star count as an int.

    Raises:
        ValueError: If the text is empty or is not a recognisable number.
    """
    text = stars_str.strip().replace(',', '').lower()
    if not text:
        raise ValueError('Empty star count: {!r}'.format(stars_str))

    multipliers = {'k': 1000, 'm': 1000000}
    suffix = text[-1]
    if suffix in multipliers:
        # round() instead of int(): the product can land just below a whole number
        # in binary floating point (32.3 * 1000 == 32299.999999999996), and int()
        # would truncate that to one star too few.
        return int(round(float(text[:-1]) * multipliers[suffix]))
    return int(text)

In [31]:
parse_star_count(star_tags[0].text.strip())

96600

In [32]:
def get_repo_info(h3_tag, star_tag):
    """Extract the details of one repository from its heading and star counter.

    Args:
        h3_tag: Heading tag whose first <a> names the owner and whose second
            <a> names (and links to) the repository.
        star_tag: Tag whose text is the star count label.

    Returns:
        A tuple (username, repo_name, stars, repo_url).
    """
    links = h3_tag.find_all('a')
    owner = links[0].text.strip()
    repo_link = links[1]
    name = repo_link.text.strip()
    # The href is site-relative, so prefix the module-level base_url.
    url = base_url + repo_link['href']
    star_count = parse_star_count(star_tag.text.strip())
    return owner, name, star_count, url

In [33]:
print(get_repo_info(repo_tags[0], star_tags[0]))
print(get_repo_info(repo_tags[1], star_tags[1]))


('mrdoob', 'three.js', 96600, 'https://github.com/mrdoob/three.js')
('pmndrs', 'react-three-fiber', 24800, 'https://github.com/pmndrs/react-three-fiber')


In [34]:
# One (username, repo_name, stars, repo_url) tuple per repository heading; each heading
# is paired with the star counter at the same position on the page.
repo_rows = [
    get_repo_info(repo_tags[idx], star_tags[idx])
    for idx in range(len(repo_tags))
]

# Transpose the row tuples into column lists, ready for pd.DataFrame.
topic_repos_dict = {
    'username': [row[0] for row in repo_rows],
    'repo_name': [row[1] for row in repo_rows],
    'stars': [row[2] for row in repo_rows],
    'repo_url': [row[3] for row in repo_rows],
}

In [35]:
topic_repos_df = pd.DataFrame(topic_repos_dict)
topic_repos_df.head()

Unnamed: 0,username,repo_name,stars,repo_url
0,mrdoob,three.js,96600,https://github.com/mrdoob/three.js
1,pmndrs,react-three-fiber,24800,https://github.com/pmndrs/react-three-fiber
2,libgdx,libgdx,22300,https://github.com/libgdx/libgdx
3,BabylonJS,Babylon.js,21800,https://github.com/BabylonJS/Babylon.js
4,ssloy,tinyrenderer,18600,https://github.com/ssloy/tinyrenderer


## Final code

In [71]:
#Now Let's do it for all topics (till now we are just working for 3D which is 1st topic) and combine everything in one cell
import os

def get_topic_page(topic_url, timeout=30):
    """Download a topic page and return it parsed as a BeautifulSoup document.

    Args:
        topic_url: URL of the topic page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so one stalled connection
            would hang the whole scrape.

    Returns:
        The page parsed with BeautifulSoup's 'html.parser'.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = requests.get(topic_url, timeout=timeout)
    # Anything other than a plain 200 is treated as a failed download.
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topic_url))
    return BeautifulSoup(response.text, 'html.parser')


def get_repo_info(h3_tag, star_tag):
    """Extract the details of one repository from its heading and star counter.

    Args:
        h3_tag: Heading tag whose first <a> names the owner and whose second
            <a> names (and links to) the repository.
        star_tag: Tag whose text is the star count label.

    Returns:
        A tuple (username, repo_name, stars, repo_url).
    """
    links = h3_tag.find_all('a')
    owner = links[0].text.strip()
    repo_link = links[1]
    name = repo_link.text.strip()
    # The href is site-relative, so prefix the module-level base_url.
    url = base_url + repo_link['href']
    star_count = parse_star_count(star_tag.text.strip())
    return owner, name, star_count, url


def get_topic_repos(topic_doc):
    """Build a table of the repositories listed on a parsed topic page.

    Args:
        topic_doc: Parsed topic page (anything exposing BeautifulSoup's find_all).

    Returns:
        A pandas DataFrame with the columns username, repo_name, stars and
        repo_url, one row per repository heading found on the page.
    """
    # <h3> headings hold the owner link and the repository link.
    heading_class = 'f3 color-fg-muted text-normal lh-condensed'
    headings = topic_doc.find_all('h3', {'class': heading_class})
    # <span> counters hold the star-count labels.
    star_counters = topic_doc.find_all('span', {'class': 'Counter js-social-count'})

    # Headings and counters are paired purely by their position on the page.
    rows = [
        get_repo_info(headings[pos], star_counters[pos])
        for pos in range(len(headings))
    ]

    columns = ('username', 'repo_name', 'stars', 'repo_url')
    return pd.DataFrame(
        {name: [row[col] for row in rows] for col, name in enumerate(columns)}
    )
 
def scrape_topic(topic_url, path):
    """Scrape one topic page and save its repositories as a CSV file.

    Args:
        topic_url: URL of the topic page to scrape.
        path: Full path of the CSV file to write, including the extension.

    Returns:
        None. If a file already exists at `path`, a message is printed and the
        topic is skipped without downloading anything.
    """
    if os.path.exists(path):
        print("The file {} already exists. Skipping...".format(path))
        return
    topic_df = get_topic_repos(get_topic_page(topic_url))
    # Write to exactly the path that was checked above. Appending '.csv' here
    # produced '<name>.csv.csv' for callers that already pass the extension,
    # so the existence check never matched on a re-run.
    topic_df.to_csv(path, index=False)

In [37]:
#Let's try one more example using created function 

get_topic_repos(get_topic_page(topic_urls[7]))

Unnamed: 0,username,repo_name,stars,repo_url
0,public-apis,public-apis,273000,https://github.com/public-apis/public-apis
1,neovim,neovim,72200,https://github.com/neovim/neovim
2,tiangolo,fastapi,66500,https://github.com/tiangolo/fastapi
3,strapi,strapi,58100,https://github.com/strapi/strapi
4,hoppscotch,hoppscotch,57700,https://github.com/hoppscotch/hoppscotch
5,ocornut,imgui,52800,https://github.com/ocornut/imgui
6,slatedocs,slate,35600,https://github.com/slatedocs/slate
7,Kong,insomnia,31600,https://github.com/Kong/insomnia
8,trpc,trpc,30600,https://github.com/trpc/trpc
9,httpie,cli,30600,https://github.com/httpie/cli


In [38]:
#Let's try another example
topic_urls[6]

'https://github.com/topics/ansible'

In [39]:
#Let's see the output and save it to the csv file 
get_topic_repos(get_topic_page(topic_urls[6])).to_csv('githubansible.csv', index=False)

Write a single function that:

    1. Gets the list of topics from the topics page
    2. Gets the list of top repos from each individual topic page
    3. Creates, for each topic, a CSV file of that topic's top repos

In [75]:
def get_topic_titles(doc):
    """Return the text of every topic-title <p> tag in the parsed topics page.

    The text is returned as-is (not stripped), in page order.
    """
    title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    return [title_tag.text for title_tag in doc.find_all('p', {'class': title_class})]
  
def get_topic_descs(doc):
    """Return the whitespace-stripped text of every topic-description <p> tag.

    Descriptions are returned in page order.
    """
    description_class = 'f5 color-fg-muted mb-0 mt-1'
    description_tags = doc.find_all('p', {'class': description_class})
    return [description_tag.text.strip() for description_tag in description_tags]

def get_topic_urls(doc):
    """Return the absolute URL of every topic linked from the parsed topics page.

    The site-relative href of each matching <a> tag is prefixed with the
    GitHub origin; URLs are returned in page order.
    """
    origin = "https://github.com"
    link_tags = doc.find_all('a', {'class': 'no-underline flex-grow-0'})
    return [origin + link_tag['href'] for link_tag in link_tags]
    
def scrape_topics(timeout=30):
    """Download the GitHub topics page and return its topics as a DataFrame.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        A pandas DataFrame with the columns topic_title, description and url.

    Raises:
        Exception: If the response status code is not 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url, timeout=timeout)
    # Check for a successful response.
    if response.status_code != 200:
        # Format topics_url here: the previous message used the undefined name
        # topic_url, so a failed download surfaced as a NameError instead.
        raise Exception('Failed to load page {}'.format(topics_url))
    doc = BeautifulSoup(response.text, 'html.parser')

    topics_dict = {
        'topic_title': get_topic_titles(doc),
        'description': get_topic_descs(doc),
        'url': get_topic_urls(doc)
    }
    return pd.DataFrame(topics_dict)



In [57]:
scrape_topics()

Unnamed: 0,topic_title,description,url
0,3D,3D refers to the use of three-dimensional grap...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source platform for buildin...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [73]:
def scrape_topics_repos():
    """Scrape every topic on the topics page into its own CSV file.

    Fetches the topic list via scrape_topics(), ensures the 'Github_topics'
    directory exists, then calls scrape_topic() once per topic with the target
    path 'Github_topics/<topic title>.csv'. Progress is printed as it goes.
    """
    print('Scraping list of topics')
    topics = scrape_topics()

    output_dir = 'Github_topics'
    os.makedirs(output_dir, exist_ok=True)
    for _, topic in topics.iterrows():
        title = topic['topic_title']
        print('Scraping top repositories for "{}"'.format(title))
        scrape_topic(topic['url'], '{}/{}.csv'.format(output_dir, title))

In [74]:
scrape_topics_repos()

Scraping list of topics
Scraping top repositories for "3D"
Scraping top repositories for "Ajax"
Scraping top repositories for "Algorithm"
Scraping top repositories for "Amp"
Scraping top repositories for "Android"
Scraping top repositories for "Angular"
Scraping top repositories for "Ansible"
Scraping top repositories for "API"
Scraping top repositories for "Arduino"
Scraping top repositories for "ASP.NET"
Scraping top repositories for "Atom"
Scraping top repositories for "Awesome Lists"
Scraping top repositories for "Amazon Web Services"
Scraping top repositories for "Azure"
Scraping top repositories for "Babel"
Scraping top repositories for "Bash"
Scraping top repositories for "Bitcoin"
Scraping top repositories for "Bootstrap"
Scraping top repositories for "Bot"
Scraping top repositories for "C"
Scraping top repositories for "Chrome"
Scraping top repositories for "Chrome extension"
Scraping top repositories for "Command line interface"
Scraping top repositories for "Clojure"
Scrapin

In [61]:
scrape_topics_repos()

Scraping list of topics
Scraping top repositories for "3D"
The file 3D.csv already exists. Skipping...
Scraping top repositories for "Ajax"
The file Ajax.csv already exists. Skipping...
Scraping top repositories for "Algorithm"
The file Algorithm.csv already exists. Skipping...
Scraping top repositories for "Amp"
The file Amp.csv already exists. Skipping...
Scraping top repositories for "Android"
The file Android.csv already exists. Skipping...
Scraping top repositories for "Angular"
The file Angular.csv already exists. Skipping...
Scraping top repositories for "Ansible"
The file Ansible.csv already exists. Skipping...
Scraping top repositories for "API"
The file API.csv already exists. Skipping...
Scraping top repositories for "Arduino"
The file Arduino.csv already exists. Skipping...
Scraping top repositories for "ASP.NET"
The file ASP.NET.csv already exists. Skipping...
Scraping top repositories for "Atom"
The file Atom.csv already exists. Skipping...
Scraping top repositories for "