In [1]:
# import libraries
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the raw HTML of the GitHub topics listing page.
# The timeout stops the cell from hanging indefinitely if the server never answers.
page = requests.get('https://github.com/topics', timeout=30).text

In [3]:
# Number of characters in the downloaded HTML
len(page)

174512

In [4]:
#page[:3000]

In [5]:
doc = BeautifulSoup(page,'html.parser')  # parse the content of the page using Beautiful Soup

In [6]:
# Show the page's <title> tag (first match returned by find)
doc.find('title')

<title>Topics on GitHub · GitHub</title>

In [7]:
## Let's put all of the above steps into a function
def get_page(url, timeout=30):
    """Download a web page and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block indefinitely on an unresponsive host.

    Returns:
        A BeautifulSoup document built from the response text with the
        'html.parser' parser.

    Raises:
        Exception: If the response status code is not 200.
    """
    # Download the page
    response = requests.get(url, timeout=timeout)

    # Check if the download was successful
    if response.status_code != 200:
        raise Exception('Unable to download page {}'.format(url))

    # Parse the page HTML into a bs4 document
    return BeautifulSoup(response.text, 'html.parser')

# Extract topic names, descriptions and URLs from page

In [8]:
def get_topic_titles(doc):
    """Return the title text of every topic found in a parsed topics page.

    Args:
        doc: Parsed page exposing ``find_all`` (e.g. the BeautifulSoup
            document returned by ``get_page``).

    Returns:
        A list of title strings, one per matching ``<p>`` tag, in document
        order and with surrounding whitespace removed. Empty if nothing matches.
    """
    # Class attribute carried by the <p> tags that hold topic titles
    selector = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    p_tags = doc.find_all('p', {'class': selector})
    # Strip whitespace so titles are cleaned the same way as descriptions
    return [tag.text.strip() for tag in p_tags]

In [9]:
# Extract the topic titles from the parsed page and print how many were found
titles = get_topic_titles(doc)
print(len(titles))

30


In [10]:
def get_topic_descriptions(doc):
    """Collect the description text of each topic found in a parsed page.

    Args:
        doc: Parsed page exposing ``find_all``.

    Returns:
        A list with the whitespace-stripped text of every matching ``<p>``
        tag, in the order ``find_all`` returned them.
    """
    # Class attribute carried by the <p> tags that hold topic descriptions
    description_class = 'f5 color-fg-muted mb-0 mt-1'

    cleaned = []
    for paragraph in doc.find_all('p', attrs={'class': description_class}):
        cleaned.append(paragraph.text.strip())
    return cleaned

In [11]:
# Extract the topic descriptions from the parsed page and show how many were found
descriptions = get_topic_descriptions(doc)
len(descriptions)

30

In [12]:
# Site root that topic links are resolved against
base_url = 'https://github.com'

def get_topic_urls(doc):
    """Return the absolute URL of every topic link found in a parsed page.

    Args:
        doc: Parsed page exposing ``find_all``; each returned tag must
            support ``tag['href']``.

    Returns:
        A list of URL strings, one per matching ``<a>`` tag, in document order.

    Raises:
        KeyError: If a matching tag has no ``href`` attribute.
    """
    # Class attribute carried by the <a> tags that link to topic pages
    selector = 'no-underline flex-grow-0'
    a_tags = doc.find_all('a', {'class': selector})
    # urljoin resolves site-relative hrefs against base_url and leaves
    # already-absolute hrefs untouched; plain concatenation would corrupt those.
    return [urljoin(base_url, tag['href']) for tag in a_tags]
        

        

In [13]:
# Extract the topic page URLs from the parsed page
topic_urls = get_topic_urls(doc)

In [14]:
# Number of topic URLs that were found
len(topic_urls)

30

In [15]:
# Column name -> list of values, ready to be turned into a DataFrame
topics_data = dict(title=titles, description=descriptions, url=topic_urls)

In [16]:
# Display the scraped topics as a table
pd.DataFrame(topics_data)


Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [17]:
### Extract data from multiple pages

In [18]:
def get_page_topics(page_number):
    """Scrape the titles, descriptions and URLs from one topics listing page.

    Args:
        page_number: Page index; it is appended to the ``?page=`` query
            parameter of the topics URL.

    Returns:
        A ``(titles, descriptions, urls)`` tuple of equal-length lists.

    Raises:
        ValueError: If the three lists differ in length.
        Exception: Propagated from ``get_page`` when the download fails.
    """
    url = 'https://github.com/topics?page=' + str(page_number)
    doc = get_page(url)
    titles = get_topic_titles(doc)
    descriptions = get_topic_descriptions(doc)
    urls = get_topic_urls(doc)

    # The three lists are scraped independently and later combined by
    # position; a length mismatch would silently pair a title with the wrong
    # description or URL, so fail loudly here instead.
    if not (len(titles) == len(descriptions) == len(urls)):
        raise ValueError(
            'Mismatched topic data on page {}: {} titles, {} descriptions, {} urls'.format(
                page_number, len(titles), len(descriptions), len(urls)
            )
        )
    return titles, descriptions, urls

In [19]:
from time import sleep

In [20]:
# Accumulators for the data gathered across all pages
all_titles, all_descriptions, all_urls = [], [], []

# Fetch pages 1 through 4 of the topics listing
for page_number in range(1, 5):
    print('Downloading page number', page_number)
    # Page-specific names keep the `titles` / `descriptions` globals set by
    # earlier cells from being overwritten with the last page's data.
    page_titles, page_descriptions, page_urls = get_page_topics(page_number)
    all_titles.extend(page_titles)
    all_descriptions.extend(page_descriptions)
    all_urls.extend(page_urls)
    # Pause between requests to avoid hammering the server
    sleep(1)

Downloading page number 1
Downloading page number 2
Downloading page number 3
Downloading page number 4


In [21]:
# Column name -> list of values collected across every downloaded page
topics_all_pages = dict(title=all_titles, description=all_descriptions, url=all_urls)

In [22]:
# Build one table with a row per topic from the accumulated columns
dataframe = pd.DataFrame(topics_all_pages)

In [23]:
# Write the table to disk; index=False leaves the row index out of the CSV
dataframe.to_csv('topics.csv', index=False)