# Use the requests library to download web pages

In [1]:
!pip install requests --upgrade --quiet


In [2]:
import requests

In [3]:
topics_url = 'https://github.com/topics' # topics_url holds the URL of the page that lists the topics

In [4]:
response = requests.get(topics_url)

In [5]:
response.status_code

200

In [9]:
page_contents=response.text #page_contents contains the information in html format

In [10]:
with open('webpage.html','w',encoding="utf-8") as f:
    f.write(page_contents)

In [11]:
len(page_contents)

139333

# Use Beautiful Soup to parse and extract information

In [12]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [13]:
from bs4 import BeautifulSoup

In [14]:
doc = BeautifulSoup(page_contents, 'html.parser') #doc contains the parsed information of that page

In [30]:
doc


<!DOCTYPE html>

<html data-color-mode="auto" data-dark-theme="dark" data-light-theme="light" lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="" href="https://github.githubassets.com" rel="preconnect"/>
<link href="https://avatars.githubusercontent.com" rel="preconnect"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/light-92c7d381038e.css" integrity="sha512-ksfTgQOOnE+FFXf+yNfVjKSlEckJAdufFIYGK7ZjRhWcZgzAGcmZqqArTgMLpu90FwthqcCX4ldDgKXbmVMeuQ==" media="all" rel="stylesheet"><link crossorigin="anonymous" href="https://github.githubassets.com/assets/dark-d4a90c367f0c.css" integrity="sha512-1KkMNn8M/al/dtzBLupRwkIOgnA9MWkm8oxS+solP87jByEvY/g4BmoxLihRogKcX1obPnf4Yp7

In [12]:
p_tags=doc.find_all('p') # p_tags holds every <p> (paragraph) tag found in the page

In [13]:
len(p_tags)

67

In [14]:
p_tags[:5]

[<p class="f4 color-fg-muted col-md-6 mx-auto">Browse popular topics on GitHub.</p>,
 <p class="f3 lh-condensed text-center Link--primary mb-0 mt-1">
         COVID-19
       </p>,
 <p class="f5 color-fg-muted text-center mb-0 mt-1">The coronavirus disease 2019 (COVID-19) is an infectious disease caused by SARS-CoV-2.</p>,
 <p class="f3 lh-condensed text-center Link--primary mb-0 mt-1">
         React
       </p>,
 <p class="f5 color-fg-muted text-center mb-0 mt-1">React is an open source JavaScript library used for designing user interfaces.</p>]

In [17]:
selection_class='f3 lh-condensed mb-0 mt-1 Link--primary'
topic_tags=doc.find_all('p',{'class':selection_class}) 

# topic_tags holds all the <p> tags with the specified class (the topic titles)

In [18]:
len(topic_tags)

30

In [19]:
topic_tags[:5]


[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>]

In [20]:

selection_class="f5 color-fg-muted mb-0 mt-1"
desc_tags=doc.find_all('p',{'class':selection_class})

# desc_tags holds all the <p> tags with the specified class (the topic descriptions)

In [21]:
desc_tags[:5]

[<p class="f5 color-fg-muted mb-0 mt-1">
           3D modeling is the process of virtually developing the surface and structure of a 3D object.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Ajax is a technique for creating interactive web applications.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Algorithms are self-contained sequences that carry out a variety of tasks.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Amp is a non-blocking concurrency library for PHP.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Android is an operating system built by Google designed for mobile devices.
         </p>]

In [22]:
len(desc_tags)

30

In [24]:
selection_class="no-underline flex-1 d-flex flex-column"
link_tags=doc.find_all('a',{'class':selection_class})

# link_tags holds the <a> tags with the specified class; each one carries a topic's
# relative URL in its href and wraps the <p> tags for the topic title and description

In [25]:
len(link_tags)

30

In [26]:
link_tags[:]

[<a class="no-underline flex-1 d-flex flex-column" href="/topics/3d">
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>
 <p class="f5 color-fg-muted mb-0 mt-1">
           3D modeling is the process of virtually developing the surface and structure of a 3D object.
         </p>
 </a>,
 <a class="no-underline flex-1 d-flex flex-column" href="/topics/ajax">
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>
 <p class="f5 color-fg-muted mb-0 mt-1">
           Ajax is a technique for creating interactive web applications.
         </p>
 </a>,
 <a class="no-underline flex-1 d-flex flex-column" href="/topics/algorithm">
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>
 <p class="f5 color-fg-muted mb-0 mt-1">
           Algorithms are self-contained sequences that carry out a variety of tasks.
         </p>
 </a>,
 <a class="no-underline flex-1 d-flex flex-column" href="/topics/amphp">
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>
 <p class="f

In [27]:
link_tags[0]['href'] 

'/topics/3d'

In [28]:
topic0url="https://github.com"+link_tags[0]['href'] #generate the url by combining the linktags_href with the base url
topic0url

'https://github.com/topics/3d'

In [31]:
topic_tags[0].text

'3D'

In [32]:
# Build the list of topic titles from the text of every topic tag
title_list=[tag.text for tag in topic_tags]
print(title_list)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']


In [33]:
# Build the list of descriptions, trimming the whitespace that surrounds each tag's text
desc_list=[tag.text.strip() for tag in desc_tags]
desc_list[:5]

['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency library for PHP.',
 'Android is an operating system built by Google designed for mobile devices.']

In [34]:
# Build the list of absolute topic URLs by prefixing each relative href with the site root
base_url='https://github.com'
link_list=[base_url+tag['href'] for tag in link_tags]
link_list[:5]

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android']

In [35]:
!pip install pandas --quiet

In [36]:
import pandas as pd

In [43]:
#creating the dictionary contains the 3 lists

topics_dict={ 
    "titles":title_list,
    "description":desc_list,
    "url":link_list
}

In [44]:
topics_dict


{'titles': ['3D',
  'Ajax',
  'Algorithm',
  'Amp',
  'Android',
  'Angular',
  'Ansible',
  'API',
  'Arduino',
  'ASP.NET',
  'Atom',
  'Awesome Lists',
  'Amazon Web Services',
  'Azure',
  'Babel',
  'Bash',
  'Bitcoin',
  'Bootstrap',
  'Bot',
  'C',
  'Chrome',
  'Chrome extension',
  'Command line interface',
  'Clojure',
  'Code quality',
  'Code review',
  'Compiler',
  'Continuous integration',
  'COVID-19',
  'C++'],
 'description': ['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
  'Ajax is a technique for creating interactive web applications.',
  'Algorithms are self-contained sequences that carry out a variety of tasks.',
  'Amp is a non-blocking concurrency library for PHP.',
  'Android is an operating system built by Google designed for mobile devices.',
  'Angular is an open source web application platform.',
  'Ansible is a simple and powerful automation engine.',
  'An API (Application Programming Interface) is a colle

In [41]:
#creating the dataframe for the dictionary

import pandas as pd
topics_df=pd.DataFrame(topics_dict)
topics_df

Unnamed: 0,titles,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


# Create a CSV file with the extracted information

In [42]:
#load the dataframe into csv file

topics_df.to_csv('topic.csv',index=None)

# Getting information out of a topic page

In [45]:
topic_page_url=link_list[0] # topic page url --select one url from the link list
topic_page_url

'https://github.com/topics/3d'

In [46]:
# Download the topic page chosen in topic_page_url and stop if the request failed.
# (A bare `response.status_code` in the middle of a cell is evaluated and discarded,
# so it neither displays nor checks anything.)
response=requests.get(topic_page_url)
if response.status_code!=200:
    raise Exception('Failed to load page {}'.format(topic_page_url))
pages=response.text

In [47]:
doc2=BeautifulSoup(pages,'html.parser')


In [48]:
len(pages)

633746

In [49]:
#repo tags contains the list of tags that contains the username and the repository name

repo_tags=doc2.find_all('h3',{ 
    'class':"f3 color-fg-muted text-normal lh-condensed"
})

In [50]:
len(repo_tags)

30

In [68]:
repo_tags[:2]

[<h3 class="f3 color-fg-muted text-normal lh-condensed">
 <a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-turbo="false" data-view-component="true" href="/mrdoob">
   
             mrdoob
 
   
 </a>          /
           <a class="text-bold wb-break-word" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-h

In [53]:
repo_tags[0]

<h3 class="f3 color-fg-muted text-normal lh-condensed">
<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-turbo="false" data-view-component="true" href="/mrdoob">
  
            mrdoob

  
</a>          /
          <a class="text-bold wb-break-word" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-cli

In [54]:
# Extract the <a> tags out of the first repo tag:
# a_tags[0] is the username link and a_tags[1] is the repository-name link
a_tags=repo_tags[0].find_all('a')
a_tags[0]

<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-turbo="false" data-view-component="true" href="/mrdoob">
  
            mrdoob

  
</a>

In [55]:
a_tags

[<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-turbo="false" data-view-component="true" href="/mrdoob">
   
             mrdoob
 
   
 </a>,
 <a class="text-bold wb-break-word" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d89752156923904a4238816bc9b51ab7772f3e3644ce897d8d

In [57]:
a_tags[0].text.strip()

'mrdoob'

In [58]:
base_url

'https://github.com'

In [61]:
a_tags[1].text.strip()


'three.js'

In [60]:
a_tags[1]['href'] #extracting the href value from the a_tags[1]

'/mrdoob/three.js'

In [69]:
base_url+a_tags[1]['href'] #generate the repository url

'https://github.com/mrdoob/three.js'

In [70]:
#extract the span tags for a specific class to get the star_tags

star_tags=doc2.find_all('span',{
    'class':"Counter js-social-count"
})
len(star_tags)

30

In [71]:
star_tags[0].text.strip()

'80.8k'

In [66]:
stars_str='80.4k'
stars_str[-1]

'k'

In [67]:
int(float(stars_str[:-1])*1000)

80400

# Defining Functions

In [73]:
#function for returning starvalues

def parse_star_count(stars_str):
    """Convert a star-count label such as '80.8k' into an integer.

    Args:
        stars_str: the label text. Surrounding whitespace and thousands
            separators (',') are ignored. A trailing 'k' or 'm' (either case)
            multiplies the number by 1,000 or 1,000,000 respectively.

    Returns:
        The star count as an int.

    Raises:
        ValueError: if the text is empty or is not a number.
    """
    multipliers={'k':1000,'m':1000000}
    text=stars_str.strip().replace(',','')
    # Slicing (instead of indexing) keeps an empty string from raising IndexError;
    # it falls through to int('') below, which raises ValueError.
    suffix=text[-1:].lower()
    if suffix in multipliers:
        # round() rather than int(): binary floats can land just below the exact
        # product (e.g. 32.3 * 1000), and int() would then truncate to 32299.
        return int(round(float(text[:-1])*multipliers[suffix]))
    return int(text)

In [74]:
parse_star_count(star_tags[0].text.strip())

80800

In [76]:
def get_repo_info(h3_tag,star_tag):
    """Pull the owner, repository name, star count and URL out of one repository card.

    Args:
        h3_tag: tag whose first <a> holds the username and whose second <a>
            holds the repository name and its relative link.
        star_tag: tag whose text is the star count (for example '80.8k').

    Returns:
        A (username, repo_name, stars, repo_url) tuple.
    """
    links=h3_tag.find_all('a')
    owner_link,repo_link=links[0],links[1]
    username=owner_link.text.strip()
    repo_name=repo_link.text.strip()
    # The href is site-relative, so it is prefixed with the notebook-level base_url
    repo_url=base_url+repo_link['href']
    return username,repo_name,parse_star_count(star_tag.text.strip()),repo_url

In [77]:
repo_tags[:2]


[<h3 class="f3 color-fg-muted text-normal lh-condensed">
 <a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-turbo="false" data-view-component="true" href="/mrdoob">
   
             mrdoob
 
   
 </a>          /
           <a class="text-bold wb-break-word" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-h

In [78]:
get_repo_info(repo_tags[0],star_tags[0])

('mrdoob', 'three.js', 80800, 'https://github.com/mrdoob/three.js')

In [79]:
# Gather the fields of every repository card into one list per column
repo_dict={column:[] for column in ('username','repo_name','stars','repo_url')}
for i,h3_tag in enumerate(repo_tags):
    # star_tags is indexed in step with repo_tags: the i-th counter belongs to the i-th card
    repo_info=get_repo_info(h3_tag,star_tags[i])
    # get_repo_info returns its fields in the same order as the dictionary keys
    for column,value in zip(repo_dict,repo_info):
        repo_dict[column].append(value)
    

In [80]:
repo_dict


{'username': ['mrdoob',
  'libgdx',
  'pmndrs',
  'BabylonJS',
  'aframevr',
  'ssloy',
  'lettier',
  'FreeCAD',
  'metafizzy',
  'CesiumGS',
  'timzhang642',
  'a1studmuffin',
  'isl-org',
  'blender',
  'domlysz',
  'spritejs',
  'openscad',
  'tensorspace-team',
  'jagenjo',
  'YadiraF',
  'AaronJackson',
  'google',
  'ssloy',
  'mosra',
  'FyroxEngine',
  'gfxfundamentals',
  'tengbao',
  'cleardusk',
  'jasonlong',
  'cnr-isti-vclab'],
 'repo_name': ['three.js',
  'libgdx',
  'react-three-fiber',
  'Babylon.js',
  'aframe',
  'tinyrenderer',
  '3d-game-shaders-for-beginners',
  'FreeCAD',
  'zdog',
  'cesium',
  '3D-Machine-Learning',
  'SpaceshipGenerator',
  'Open3D',
  'blender',
  'BlenderGIS',
  'spritejs',
  'openscad',
  'tensorspace',
  'webglstudio.js',
  'PRNet',
  'vrn',
  'model-viewer',
  'tinyraytracer',
  'magnum',
  'Fyrox',
  'webgl-fundamentals',
  'vanta',
  '3DDFA',
  'isometric-contributions',
  'meshlab'],
 'stars': [80800,
  19800,
  17500,
  16300,
  1400

In [81]:
repo_dict_df=pd.DataFrame(repo_dict)

In [82]:
repo_dict_df

Unnamed: 0,username,repo_name,stars,repo_url
0,mrdoob,three.js,80800,https://github.com/mrdoob/three.js
1,libgdx,libgdx,19800,https://github.com/libgdx/libgdx
2,pmndrs,react-three-fiber,17500,https://github.com/pmndrs/react-three-fiber
3,BabylonJS,Babylon.js,16300,https://github.com/BabylonJS/Babylon.js
4,aframevr,aframe,14000,https://github.com/aframevr/aframe
5,ssloy,tinyrenderer,13400,https://github.com/ssloy/tinyrenderer
6,lettier,3d-game-shaders-for-beginners,12600,https://github.com/lettier/3d-game-shaders-for...
7,FreeCAD,FreeCAD,11100,https://github.com/FreeCAD/FreeCAD
8,metafizzy,zdog,9100,https://github.com/metafizzy/zdog
9,CesiumGS,cesium,8500,https://github.com/CesiumGS/cesium


In [83]:
import os
def get_topic_page(topic_url):
    """Download a topic page and return it parsed with BeautifulSoup.

    Args:
        topic_url: URL of the page to download.

    Returns:
        A BeautifulSoup document built from the response text with 'html.parser'.

    Raises:
        Exception: if the response status code is not 200.
        requests.exceptions.Timeout: if connecting to the server, or waiting
            for it to send data, takes longer than the timeout below.
    """
    # Without a timeout requests waits indefinitely, so one stalled connection
    # would block the whole scraping run.
    response=requests.get(topic_url,timeout=30)
    # Anything other than 200 means the page content cannot be trusted
    if response.status_code!=200:
        raise Exception('Failed to load page {}'.format(topic_url))
    # Parse the downloaded HTML
    topic_doc=BeautifulSoup(response.text,'html.parser')
    return topic_doc

def get_repo_info(h3_tag,star_tag):
    """Extract the details of one repository from its heading and star-count tags.

    This cell defines get_repo_info again; once it runs, this definition
    replaces the one from the earlier cell.

    Args:
        h3_tag: tag whose first <a> holds the username and whose second <a>
            holds the repository name and its relative link.
        star_tag: tag whose text is the star count (for example '80.8k').

    Returns:
        A (username, repo_name, stars, repo_url) tuple.
    """
    # Keep the site root local (as get_topic_urls does) so the function does not
    # depend on a base_url variable left behind by an earlier scratch cell.
    base_url='https://github.com'
    a_tags=h3_tag.find_all('a')
    username=a_tags[0].text.strip()
    repo_name=a_tags[1].text.strip()
    repo_url=base_url+a_tags[1]['href']
    stars=parse_star_count(star_tag.text.strip())
    return username,repo_name,stars,repo_url
    
def get_topic_repos(topic_doc):
    """Build a DataFrame describing the repositories listed in a parsed topic page.

    Args:
        topic_doc: parsed topic page, as returned by get_topic_page.

    Returns:
        A pandas DataFrame with the columns 'username', 'repo_name', 'stars'
        and 'repo_url', one row per repository heading found in the page.
    """
    # <h3> headings that hold the username, repository name and repository link
    repo_headings=topic_doc.find_all(
        'h3',
        {'class':"f3 color-fg-muted text-normal lh-condensed"},
    )
    # <span> counters that hold the star counts
    star_counters=topic_doc.find_all(
        'span',
        {'class':"Counter js-social-count"},
    )

    columns=('username','repo_name','stars','repo_url')
    repo_dict={column:[] for column in columns}

    # The heading at a given position is paired with the counter at the same position
    for position,heading in enumerate(repo_headings):
        repo_info=get_repo_info(heading,star_counters[position])
        for column,value in zip(columns,repo_info):
            repo_dict[column].append(value)

    return pd.DataFrame(repo_dict)

# def scrape_topic(topic_url,topic_name):
#     topic_df=get_topic_repos(get_topic_page(topic_url))
#     topic_df.to_csv(topic_name+ '.csv',index=None)

# def scrape_topic(topic_url, topic_name):
#     fname=topic_name + '.csv'
#     if os.path.exists(fname):
#         print("The file {} already exists. Skipping....".format(fname))
#         return
#     topic_df=get_topic_repos(get_topic_page(topic_url))
#     topic_df.to_csv(fname,index=None)

def scrape_topic(topic_url, path):
    """Scrape one topic page into a CSV file unless that file is already present.

    Args:
        topic_url: URL of the topic page to scrape.
        path: destination of the CSV file; when it already exists nothing is
            downloaded and a message is printed instead.
    """
    if not os.path.exists(path):
        # Download and parse the page, then write the repositories without the index column
        get_topic_repos(get_topic_page(topic_url)).to_csv(path,index=None)
    else:
        print("The file {} already exists. Skipping....".format(path))


In [68]:
url4=link_list[4]

In [69]:
topic4_doc=get_topic_page(url4)

In [70]:
topic4_repos=get_topic_repos(topic4_doc)

In [71]:
url4

'https://github.com/topics/android'

In [72]:
topic4_repos

Unnamed: 0,username,repo_name,stars,repo_url
0,flutter,flutter,138000,https://github.com/flutter/flutter
1,justjavac,free-programming-books-zh_CN,89300,https://github.com/justjavac/free-programming-...
2,Genymobile,scrcpy,63100,https://github.com/Genymobile/scrcpy
3,Hack-with-Github,Awesome-Hacking,50100,https://github.com/Hack-with-Github/Awesome-Ha...
4,google,material-design-icons,45400,https://github.com/google/material-design-icons
5,wasabeef,awesome-android-ui,42300,https://github.com/wasabeef/awesome-android-ui
6,square,okhttp,41800,https://github.com/square/okhttp
7,android,architecture-samples,40400,https://github.com/android/architecture-samples
8,Solido,awesome-flutter,39900,https://github.com/Solido/awesome-flutter
9,square,retrofit,39700,https://github.com/square/retrofit


In [73]:
get_topic_repos(get_topic_page(link_list[4]))

Unnamed: 0,username,repo_name,stars,repo_url
0,flutter,flutter,138000,https://github.com/flutter/flutter
1,justjavac,free-programming-books-zh_CN,89300,https://github.com/justjavac/free-programming-...
2,Genymobile,scrcpy,63100,https://github.com/Genymobile/scrcpy
3,Hack-with-Github,Awesome-Hacking,50100,https://github.com/Hack-with-Github/Awesome-Ha...
4,google,material-design-icons,45400,https://github.com/google/material-design-icons
5,wasabeef,awesome-android-ui,42300,https://github.com/wasabeef/awesome-android-ui
6,square,okhttp,41800,https://github.com/square/okhttp
7,android,architecture-samples,40400,https://github.com/android/architecture-samples
8,Solido,awesome-flutter,39900,https://github.com/Solido/awesome-flutter
9,square,retrofit,39700,https://github.com/square/retrofit


In [74]:
# link_list[4] is the Android topic, so its repositories are saved under a matching file name
get_topic_repos(get_topic_page(link_list[4])).to_csv('android.csv',index=None)

# Write a single function


In [None]:
# 1. Get the list of topics from the topics page
# 2. Get the list of top repos from the individual topic pages
# 3. For each topic, create a CSV of the top repos for the topic

In [85]:
def get_topic_titles(doc):
    """Return the topic titles found in a parsed topics page.

    Args:
        doc: parsed topics page (a BeautifulSoup document).

    Returns:
        A list with the text of every title tag, in page order, with
        surrounding whitespace removed.
    """
    selection_class='f3 lh-condensed mb-0 mt-1 Link--primary'
    topic_tags=doc.find_all('p',{'class':selection_class})

    # Strip the text, as get_topic_descs does: the titles are later used to build
    # file names, and padded or multi-line text would produce unusable paths.
    return [tag.text.strip() for tag in topic_tags]

def get_topic_descs(doc):
    """Return the topic descriptions found in a parsed topics page.

    Args:
        doc: parsed topics page (a BeautifulSoup document).

    Returns:
        A list with the whitespace-stripped text of every description tag,
        in page order.
    """
    description_class="f5 color-fg-muted mb-0 mt-1"
    matches=doc.find_all('p',{'class':description_class})
    return [match.text.strip() for match in matches]

def get_topic_urls(doc):
    """Return the absolute URL of every topic found in a parsed topics page.

    Args:
        doc: parsed topics page (a BeautifulSoup document).

    Returns:
        A list of URLs, in page order, each made of the site root followed by
        the tag's relative href.
    """
    anchor_class="no-underline flex-1 d-flex flex-column"
    anchors=doc.find_all('a',{'class':anchor_class})

    site_root='https://github.com'
    return [site_root+anchor['href'] for anchor in anchors]

    
def scrape_topics():
    """Download the GitHub topics page and return its topics as a DataFrame.

    Returns:
        A pandas DataFrame with the columns 'title', 'description' and 'url',
        one row per topic found in the page.

    Raises:
        Exception: if the response status code is not 200.
        requests.exceptions.Timeout: if connecting to the server, or waiting
            for it to send data, takes longer than the timeout below.
    """
    topics_url='https://github.com/topics'
    # Without a timeout requests waits indefinitely on a stalled connection
    response=requests.get(topics_url,timeout=30)
    if response.status_code!=200:
        raise Exception('Failed to load page {}'.format(topics_url))

    # Parse the page that was just downloaded, so the result never depends on a
    # `doc` variable left over from an earlier notebook cell.
    doc=BeautifulSoup(response.text,'html.parser')

    topics_dict={
        "title":get_topic_titles(doc),
        "description":get_topic_descs(doc),
        "url":get_topic_urls(doc)
    }
    return pd.DataFrame(topics_dict)
    
    

    

In [84]:
topics_url

'https://github.com/topics'

In [86]:
scrape_topics()



Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [87]:

topics_df=scrape_topics()
    

In [88]:
topics_df

Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [89]:
for index,row in topics_df.iterrows():
    print(row['title'],row['url'])

3D https://github.com/topics/3d
Ajax https://github.com/topics/ajax
Algorithm https://github.com/topics/algorithm
Amp https://github.com/topics/amphp
Android https://github.com/topics/android
Angular https://github.com/topics/angular
Ansible https://github.com/topics/ansible
API https://github.com/topics/api
Arduino https://github.com/topics/arduino
ASP.NET https://github.com/topics/aspnet
Atom https://github.com/topics/atom
Awesome Lists https://github.com/topics/awesome
Amazon Web Services https://github.com/topics/aws
Azure https://github.com/topics/azure
Babel https://github.com/topics/babel
Bash https://github.com/topics/bash
Bitcoin https://github.com/topics/bitcoin
Bootstrap https://github.com/topics/bootstrap
Bot https://github.com/topics/bot
C https://github.com/topics/c
Chrome https://github.com/topics/chrome
Chrome extension https://github.com/topics/chrome-extension
Command line interface https://github.com/topics/cli
Clojure https://github.com/topics/clojure
Code quality h

In [90]:
def scrape_topics_repos():
    """Scrape the top repositories of every topic into one CSV file per topic.

    The topics come from scrape_topics(); each topic is written to
    'data/<title>.csv' by scrape_topic(), which skips files that already exist.
    """
    topics_df=scrape_topics()

    # Make sure the output folder is there before any file is written
    os.makedirs('data',exist_ok=True)

    for title,url in zip(topics_df['title'],topics_df['url']):
        print('Scraping top repositories for "{}"'.format(title))
        scrape_topic(url,'data/{}.csv'.format(title))

In [91]:
scrape_topics_repos()

Scraping top repositories for "3D"
The file data/3D.csv already exists. Skipping....
Scraping top repositories for "Ajax"
The file data/Ajax.csv already exists. Skipping....
Scraping top repositories for "Algorithm"
The file data/Algorithm.csv already exists. Skipping....
Scraping top repositories for "Amp"
The file data/Amp.csv already exists. Skipping....
Scraping top repositories for "Android"
The file data/Android.csv already exists. Skipping....
Scraping top repositories for "Angular"
The file data/Angular.csv already exists. Skipping....
Scraping top repositories for "Ansible"
The file data/Ansible.csv already exists. Skipping....
Scraping top repositories for "API"
The file data/API.csv already exists. Skipping....
Scraping top repositories for "Arduino"
The file data/Arduino.csv already exists. Skipping....
Scraping top repositories for "ASP.NET"
The file data/ASP.NET.csv already exists. Skipping....
Scraping top repositories for "Atom"
The file data/Atom.csv already exists. Sk