-
Notifications
You must be signed in to change notification settings - Fork 85
/
github.py
62 lines (54 loc) · 2.3 KB
/
github.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""@desc
Parser for GitHub search results
"""
from search_engine_parser.core.base import BaseSearch
class GitHubSearch(BaseSearch):
    """
    Searches GitHub for string
    """
    name = "GitHub"
    base_url = "https://github.com"
    # `{query}` and `{page}` are left as placeholders here; nothing in this
    # class fills them in (presumably done by BaseSearch -- confirm there).
    search_url = base_url + "/search?q={query}&p={page}"
    summary = "\tGitHub is an American company that provides hosting for software development "\
        "version control using Git. It is a subsidiary of Microsoft, which acquired the company "\
        "in 2018 for $7.5 billion.\n\tIt offers all of the distributed version control and source"\
        " code management (SCM) functionality of Git as well as adding its own features."\
        "\n\tAs of May 2019, GitHub reports having over 37 million users and more than 100 million"\
        " repositories (including at least 28 million public repositories), making it the largest "\
        "host of source code in the world."

    @staticmethod
    def _text_or_empty(tag, strip=False):
        """
        Return the text of an optional tag, or an empty string when it is absent.

        :param tag: result of a ``find`` call; ``None`` when nothing matched
        :param strip: strip surrounding whitespace from the text when True
        :return: the tag's text, or ``""`` if ``tag`` is ``None``
        :rtype: str
        """
        if tag is None:
            return ""
        return tag.text.strip() if strip else tag.text

    def parse_soup(self, soup):
        """
        Parses GitHub for a search query.

        :param soup: parsed results page; must provide ``find_all``
        :return: every ``<li class="repo-list-item">`` element found in ``soup``
        """
        # Each repository hit is rendered as one <li class="repo-list-item">
        return soup.find_all('li', class_='repo-list-item')

    def parse_single_result(self, single_result):
        """
        Parses one repository entry into its title, link, description,
        star count and language.

        The description, language and star count are optional in the markup;
        when one of them is missing its value is an empty string instead of
        the whole result failing.

        :param single_result: single result found in <li class="repo-list-item">
            (presumably one element of what ``parse_soup`` returns -- confirm
            against the caller in BaseSearch)
        :return: dict with keys ``titles``, ``links``, ``descriptions``,
            ``stars`` and ``languages``
        :rtype: dict
        :raises AttributeError: if the entry has no ``<h3>`` or no ``<a>``
            inside it (there is no usable result without them)
        :raises TypeError: if that ``<a>`` has no ``href`` attribute
        """
        heading = single_result.find('h3')
        link_tag = heading.find('a')

        # Title text and absolute link; hrefs on the page are site-relative
        title = link_tag.text
        link = self.base_url + link_tag.get('href')

        # Repositories without a description have no matching <p>
        desc = self._text_or_empty(single_result.find('p', class_="col-12"))

        # Language and stars share one container; any of the three lookups
        # may come back as None (e.g. no detected language, no star link).
        stars_and_lang_div = single_result.find('div', class_='flex-shrink-0')
        if stars_and_lang_div is None:
            lang_tag = None
            stars_tag = None
        else:
            lang_tag = stars_and_lang_div.find(
                'span', itemprop="programmingLanguage")
            stars_tag = stars_and_lang_div.find('a', class_='muted-link')
        lang = self._text_or_empty(lang_tag)
        stars = self._text_or_empty(stars_tag, strip=True)

        # The displayed title also carries the language and star count
        title = "{title}\t {lang}\t Stars-{stars}".format(
            title=title, lang=lang, stars=stars)
        rdict = {
            "titles": title,
            "links": link,
            "descriptions": desc,
            "stars": stars,
            "languages": lang,
        }
        return rdict