In [16]:
import pickle
import requests
from bs4 import BeautifulSoup
import re
import time
import json

In [17]:
# Location of the pickle file that is opened and loaded in the next cell.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless it is changed to a configurable/relative location.
file_path = '/Users/abdulrafay/Desktop/Research Project/Data/Repo_URL/repo_urls.pkl'

In [18]:
# Load the pickled object into `my_list`; the crawl loop at the end of the
# notebook iterates over it and treats each element as a repository URL.
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising,
# so only load files that come from a trusted source.
with open(file_path, 'rb') as f:
    my_list = pickle.load(f)

In [19]:
def get_reponame_author(url):
    """Split a GitHub repository URL into its owner and repository name.

    Args:
        url: Repository URL, expected to start with ``https://github.com/``.

    Returns:
        An ``(author, repo_name)`` tuple. If ``url`` is not a string or does
        not match the expected ``https://github.com/<author>/<repo>`` shape,
        the placeholder tuple ``('No author', 'no repository name')`` is
        returned, so callers can always unpack the result into two names.
    """
    fallback = ('No author', 'no repository name')

    # re.search raises TypeError on non-string input. Returning the fallback
    # (instead of printing and implicitly returning None) keeps
    # `author, repo = get_reponame_author(url)` from failing while unpacking.
    if not isinstance(url, str):
        print(f"Expected a URL string, got {type(url).__name__}")
        return fallback

    # The repository segment stops at '/', '?' or '#', so a query string or
    # fragment on the URL does not end up inside the repository name.
    match = re.search(r"^https://github\.com/([^/]+)/([^/?#]+)", url)
    if match:
        return match.group(1), match.group(2)
    return fallback

In [20]:
def get_about(soup):
    """Extract the text of the "About" paragraph from a parsed repository page.

    Args:
        soup: Parsed page object exposing ``find_all`` (a BeautifulSoup
            document in this notebook).

    Returns:
        The stripped paragraph text when an ``h2`` heading containing
        'About' is followed by a ``p`` sibling with class ``f4 my-3``;
        an empty string when that heading exists but has no such paragraph;
        ``None`` when no matching heading is found or when an error occurs
        (a message is printed in those cases).
    """
    try:
        headings = soup.find_all('h2', class_='mb-3 h4')

        if not headings:
            print("No sections with the class 'mb-3 h4' were found.")
            return None

        for heading in headings:
            if 'About' in heading.get_text():
                # Only the first 'About' heading is considered; both outcomes
                # return immediately, so no "found" flag is needed.
                description = heading.find_next_sibling('p', class_='f4 my-3')
                return description.get_text(strip=True) if description else ''

        print("About section not found.")
        return None
    except Exception as e:
        # Best-effort scraping: report the problem and signal "no value".
        print("Error occurred:", e)
        return None


In [21]:
def get_topics(soup):
    """Collect the topic tags listed on a parsed repository page.

    Args:
        soup: Parsed page object exposing ``find_all`` (a BeautifulSoup
            document in this notebook).

    Returns:
        A list with the stripped text of every ``a.topic-tag.topic-tag-link``
        found in the ``div.my-3`` that follows an ``h3.sr-only`` heading whose
        text is 'Topics'. Empty when no such heading or container exists.
    """
    all_topics = []

    for heading in soup.find_all('h3', {'class': 'sr-only'}):
        # Compare the stripped text so surrounding whitespace in the markup
        # does not hide the heading.
        if heading.text.strip() != 'Topics':
            continue

        container = heading.find_next_sibling('div', class_='my-3')
        if container is None:
            # Heading without the expected sibling: nothing to collect, and
            # calling .find_all on None would raise AttributeError.
            continue

        for tag in container.find_all('a', class_='topic-tag topic-tag-link'):
            all_topics.append(tag.get_text(strip=True))

    return all_topics


In [22]:
def get_stars(url):
    """Scrape the stargazers of a repository from its paginated web listing.

    Pages ``<url>/stargazers?page=N`` are fetched for N = 1, 2, ... until the
    "no stars" heading or the end-of-list message is seen, or a page contains
    no stargazer entries.

    Args:
        url: Repository URL without a trailing slash.

    Returns:
        A list of ``{'user_name': ..., 'user_about': ...}`` dicts. If a page
        answers HTTP 404 or 410, the entries collected so far are returned.

    Raises:
        requests.exceptions.RequestException: on connection errors or when a
            request exceeds the 30 second timeout.
    """
    base_url = url + "/" + "stargazers"

    all_stars = []
    page = 1

    while True:
        page_url = f"{base_url}?page={page}"

        # Retry until the page is served; other non-200 answers are treated as
        # temporary (e.g. rate limiting) and retried after a short pause.
        while True:
            response = requests.get(page_url, timeout=30)
            if response.status_code == 200:
                break
            if response.status_code in (404, 410):
                # Permanent failure: retrying would loop forever.
                print(f"Giving up on {page_url}: HTTP {response.status_code}")
                return all_stars
            print("Sleeping...")
            time.sleep(2)

        soup = BeautifulSoup(response.text, 'html.parser')

        no_star = soup.find('h2', string="Be the first to star this repository")
        container = soup.find('div', class_='clearfix container-xl px-3 px-md-4 px-lg-5 mt-4')
        # Either lookup may come back empty if the page layout differs.
        first_paragraph = container.find('p') if container is not None else None
        if first_paragraph is not None:
            end_star = first_paragraph.text.strip().replace("\n", " ")
        else:
            end_star = ''

        if no_star or 'That’s it. You’ve reached the end of' in end_star:
            break

        star_links = soup.find_all('li', class_='col-md-4 mb-3')
        if not star_links:
            # No entries and no end marker: stop instead of paging forever.
            break

        for star_link in star_links:
            name_tag = star_link.find('span', class_='Truncate-text')
            if name_tag is None:
                continue
            about_tag = star_link.find('p', class_='mb-3')
            # Keep only the first line of the profile blurb.
            about = about_tag.text.strip().split('\n', 1)[0] if about_tag is not None else ''
            all_stars.append({'user_name': name_tag.text.strip(), 'user_about': about})

        page += 1

    return all_stars

In [23]:
def get_watchers(url):
    """Scrape the watchers of a repository from its paginated web listing.

    Pages ``<url>/watchers?page=N`` are fetched for N = 1, 2, ... until a
    ``h2.blankslate-heading`` element is present or a page contains no
    watcher entries.

    Args:
        url: Repository URL without a trailing slash.

    Returns:
        A list of ``{'user_name': ..., 'user_about': ...}`` dicts. If a page
        answers HTTP 404 or 410, the entries collected so far are returned.

    Raises:
        requests.exceptions.RequestException: on connection errors or when a
            request exceeds the 30 second timeout.
    """
    base_url = url + "/" + "watchers"

    all_watchers = []
    page = 1

    while True:
        page_url = f"{base_url}?page={page}"

        # Retry until the page is served; other non-200 answers are treated as
        # temporary (e.g. rate limiting) and retried after a short pause.
        while True:
            response = requests.get(page_url, timeout=30)
            if response.status_code == 200:
                break
            if response.status_code in (404, 410):
                # Permanent failure: retrying would loop forever.
                print(f"Giving up on {page_url}: HTTP {response.status_code}")
                return all_watchers
            print("Sleeping...")
            time.sleep(2)

        soup = BeautifulSoup(response.text, 'html.parser')

        # The blank-slate heading marks a page with nothing (more) to list.
        if soup.find('h2', class_="blankslate-heading"):
            break

        watcher_links = soup.find_all('li', class_='col-md-4 mb-3')
        if not watcher_links:
            # No entries and no blank slate: stop instead of paging forever.
            break

        for watcher_link in watcher_links:
            name_tag = watcher_link.find('span', class_='Truncate-text')
            if name_tag is None:
                continue
            about_tag = watcher_link.find('p', class_='mb-3')
            # Keep only the first line of the profile blurb.
            about = about_tag.text.strip().split('\n', 1)[0] if about_tag is not None else ''
            all_watchers.append({'user_name': name_tag.text.strip(), 'user_about': about})

        page += 1

    return all_watchers

In [24]:
def get_forks(url):
    """Scrape the forks of a repository from its paginated web listing.

    Pages ``<url>/forks?page=N&period=&include=active%2Cinactive`` are fetched
    for N = 1, 2, ... until the ``div.Box.mt-3`` container is missing or holds
    no fork headings.

    Args:
        url: Repository URL without a trailing slash.

    Returns:
        A list of dicts with keys ``'user_name'``, ``'repo_forked_as'`` and
        ``'repo_status'``. The status is ``"Inactive"`` when the fork's status
        text reads "Never updated" and ``"Active"`` otherwise. If a page
        answers HTTP 404 or 410, the entries collected so far are returned.

    Raises:
        requests.exceptions.RequestException: on connection errors or when a
            request exceeds the 30 second timeout.
    """
    base_url = url + "/" + "forks"

    all_forks = []
    page = 1

    while True:
        page_url = f"{base_url}?page={page}&period=&include=active%2Cinactive"

        # Retry until the page is served; other non-200 answers are treated as
        # temporary (e.g. rate limiting) and retried after a short pause.
        while True:
            response = requests.get(page_url, timeout=30)
            if response.status_code == 200:
                break
            if response.status_code in (404, 410):
                # Permanent failure: retrying would loop forever.
                print(f"Giving up on {page_url}: HTTP {response.status_code}")
                return all_forks
            print("Sleeping...")
            time.sleep(2)

        soup = BeautifulSoup(response.text, 'html.parser')

        forks_box = soup.find('div', class_="Box mt-3")
        if forks_box is None:
            break

        fork_links = forks_box.find_all('h2', class_="f4")
        if not fork_links:
            # Container present but empty: stop instead of paging forever.
            break

        for fork_link in fork_links:
            author_tag = fork_link.find('span', class_='f4 d-inline-block')
            name_tag = fork_link.find('a', class_='Link f4')
            if author_tag is None or name_tag is None:
                # Heading that does not describe a fork entry; skip it.
                continue

            status_row = fork_link.find_next_sibling('div', class_='d-flex flex-column flex-sm-row mt-2')
            status_tag = status_row.find('span', class_='text-small') if status_row is not None else None

            if status_tag is not None and status_tag.get_text(strip=True) == "Never updated":
                repo_status = "Inactive"
            else:
                repo_status = "Active"

            all_forks.append({
                'user_name': author_tag.get_text(strip=True),
                'repo_forked_as': name_tag.get_text(strip=True),
                'repo_status': repo_status,
            })

        page += 1

    return all_forks

In [25]:
def get_langauges(soup):
    """Read the language breakdown from a parsed repository page.

    Args:
        soup: Parsed page object exposing ``find_all`` (a BeautifulSoup
            document in this notebook).

    Returns:
        A dict mapping each language name to the stripped text of the element
        that follows it (the share shown next to the name). Empty when the
        page has no 'Languages' section or the expected list is missing.
    """
    languages = {}

    for heading in soup.find_all('h2', class_='h4 mb-3'):
        if 'Languages' not in heading.text:
            continue

        language_list = heading.find_next_sibling('ul', class_='list-style-none')
        if language_list is None:
            # Heading without the expected list; .find_all on None would raise.
            continue

        for name_span in language_list.find_all('span', class_='color-fg-default text-bold mr-1'):
            share_tag = name_span.find_next_sibling()
            if share_tag is None:
                # Language name with no value element next to it; skip it.
                continue
            languages[name_span.text.strip()] = share_tag.text.strip()

    return languages

In [26]:
def get_contributors(owner, repo, token):
    """List a repository's contributors through the GitHub REST API.

    Requests ``/repos/{owner}/{repo}/contributors`` page by page (100 entries
    per page) until an empty page is returned.

    Args:
        owner: Account that owns the repository.
        repo: Repository name.
        token: API token; when falsy, no Authorization header is sent.

    Returns:
        A list of ``{'user_name': ..., 'no_of_commits': ...}`` dicts; an empty
        list when the API answers 204 (no content). ``None`` when any request
        returns another non-200 status (the status code is printed).

    Raises:
        requests.exceptions.RequestException: on connection errors or when a
            request exceeds the 30 second timeout.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contributors"
    headers = {'Authorization': f'token {token}'} if token else {}
    page = 1
    all_contributors = []

    while True:
        response = requests.get(
            url,
            headers=headers,
            params={'per_page': 100, 'page': page},
            timeout=30,
        )

        if response.status_code == 204:
            # The API answers 204 for an empty repository: that means
            # "no contributors", not a failure.
            return all_contributors

        if response.status_code != 200:
            print(f"Failed to fetch contributors: {response.status_code}")
            return None

        contributors = response.json()
        if not contributors:
            # An empty page marks the end of the listing.
            return all_contributors

        for contributor in contributors:
            all_contributors.append({
                'user_name': contributor['login'],
                'no_of_commits': contributor['contributions'],
            })

        page += 1

In [27]:
def get_issues(url):
    """Scrape the open and closed issues of a repository from its web listing.

    For each state ('open', then 'closed') the pages
    ``<url>/issues?page=N&q=is%3Aissue+is%3A<state>`` are fetched until one of
    the "no issues" headings appears or a page contains no issue rows.

    Args:
        url: Repository URL without a trailing slash.

    Returns:
        A list of dicts with keys ``'issue_id'``, ``'issue_title'``,
        ``'issue_status'`` and ``'issue_author'``. If a page answers HTTP 404
        or 410, the entries collected so far are returned.

    Raises:
        requests.exceptions.RequestException: on connection errors or when a
            request exceeds the 30 second timeout.
    """
    base_url = url + "/" + "issues"
    all_issues = []

    for state in ('open', 'closed'):
        page = 1
        while True:
            page_url = f"{base_url}?page={page}&q=is%3Aissue+is%3A{state}"

            # Retry until the page is served; other non-200 answers are
            # treated as temporary and retried after a short pause.
            while True:
                response = requests.get(page_url, timeout=30)
                if response.status_code == 200:
                    break
                if response.status_code in (404, 410):
                    # Permanent failure: retrying would loop forever.
                    print(f"Giving up on {page_url}: HTTP {response.status_code}")
                    return all_issues
                print("Sleeping...")
                time.sleep(2)

            soup = BeautifulSoup(response.text, 'html.parser')

            no_issues_tag_open = soup.find('h2', string="There aren’t any open issues.")
            no_issues_tag_closed = soup.find('h2', string="No results matched your search.")
            no_issues_anyway = soup.find('h2', string='Welcome to issues!')

            if no_issues_tag_open or no_issues_tag_closed or no_issues_anyway:
                break

            issue_links = soup.find_all('div', class_="flex-auto min-width-0 p-2 pr-3 pr-md-2")
            if not issue_links:
                # No rows and no "empty" heading: stop instead of paging forever.
                break

            for issue in issue_links:
                title_tag = issue.find('a', class_='Link--primary')
                if title_tag is None:
                    continue
                author_tag = issue.find('a', class_='Link--muted')
                all_issues.append({
                    # The id is the last path segment of the issue link.
                    'issue_id': title_tag.get('href', '').split('/')[-1],
                    'issue_title': title_tag.text.strip(),
                    'issue_status': state,
                    'issue_author': author_tag.text if author_tag is not None else '',
                })

            page += 1

    return all_issues

In [28]:
def get_pull_requests(url):
    """Scrape the open and closed pull requests of a repository.

    For each state ('open', then 'closed') the pages
    ``<url>/pulls?page=N&q=is%3Apr+is%3A<state>`` are fetched until one of the
    "no pull requests" headings appears or a page contains no rows.

    Args:
        url: Repository URL without a trailing slash.

    Returns:
        A list of dicts with keys ``'pr_id'``, ``'pr_title'``, ``'pr_status'``
        and ``'pr_author'``. If a page answers HTTP 404 or 410, the entries
        collected so far are returned.

    Raises:
        requests.exceptions.RequestException: on connection errors or when a
            request exceeds the 30 second timeout.
    """
    base_url = url + "/" + "pulls"
    all_pr = []

    for state in ('open', 'closed'):
        page = 1
        while True:
            page_url = f"{base_url}?page={page}&q=is%3Apr+is%3A{state}"

            # Retry until the page is served; other non-200 answers are
            # treated as temporary and retried after a short pause.
            while True:
                response = requests.get(page_url, timeout=30)
                if response.status_code == 200:
                    break
                if response.status_code in (404, 410):
                    # Permanent failure: retrying would loop forever.
                    print(f"Giving up on {page_url}: HTTP {response.status_code}")
                    return all_pr
                print("Sleeping...")
                time.sleep(2)

            soup = BeautifulSoup(response.text, 'html.parser')

            no_PR_open = soup.find('h2', string="There aren’t any open pull requests.")
            no_PR_closed = soup.find('h2', string="No results matched your search.")
            no_pr_anyway = soup.find('h2', string='Welcome to pull requests!')

            if no_PR_open or no_PR_closed or no_pr_anyway:
                break

            PR_links = soup.find_all('div', class_="flex-auto min-width-0 p-2 pr-3 pr-md-2")
            if not PR_links:
                # No rows and no "empty" heading: stop instead of paging forever.
                break

            for pr in PR_links:
                title_tag = pr.find('a', class_='Link--primary')
                if title_tag is None:
                    continue
                author_tag = pr.find('a', class_='Link--muted')
                all_pr.append({
                    # The id is the last path segment of the pull request link.
                    'pr_id': title_tag.get('href', '').split('/')[-1],
                    'pr_title': title_tag.text.strip(),
                    'pr_status': state,
                    'pr_author': author_tag.text if author_tag is not None else '',
                })

            page += 1

    return all_pr

In [29]:
def make_json(url, author, repo_name, about, topic_lst, star_lst, watcher_lst, forks_lst, languages_lst, contributors_lst, issues_lst, pr_lst):
    """Write everything collected for one repository to a JSON file.

    The file is ``./p_scaads_finetune/crawled_github_data/5000/<repo_name>.json``
    (relative to the current working directory); the directory is created if
    it does not exist yet.

    Args:
        url: Repository URL.
        author: Repository owner.
        repo_name: Repository name; also used as the output file name.
        about: Repository description.
        topic_lst: Topic tags.
        star_lst: Stargazer records.
        watcher_lst: Watcher records.
        forks_lst: Fork records.
        languages_lst: Language breakdown.
        contributors_lst: Contributor records.
        issues_lst: Issue records.
        pr_lst: Pull request records.

    Returns:
        None.
    """
    # Local import: `os` is not among the notebook's top-level imports.
    import os

    data = {
        'url': url,
        'author': author,
        'repositoryName': repo_name,
        'about': about,
        'topics': topic_lst,
        'languages': languages_lst,
        'stars': star_lst,
        'watchers': watcher_lst,
        'forks': forks_lst,
        'contributors': contributors_lst,
        'issues': issues_lst,
        'pull_requests': pr_lst
    }

    output_dir = './p_scaads_finetune/crawled_github_data/5000/'
    # Without this, open() fails with FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): the file name uses only repo_name, so two repositories
    # with the same name from different authors overwrite each other.
    filename = output_dir + repo_name + ".json"

    with open(filename, "w") as json_file:
        json.dump(data, json_file, indent=4)

In [None]:
# Crawl driver: for each selected repository URL, scrape every section and
# write the combined result to one JSON file.
# NOTE: the [:1] slice restricts the run to the first entry of my_list.
for url in my_list[:1]:
    # Fetch and parse the repository landing page once; it is reused by the
    # helpers that take `soup` (about, topics, languages).
    # NOTE(review): the HTTP status of this request is not checked.
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    author, repo_name = get_reponame_author(url)
    about = get_about(soup)
    topic_lst = get_topics(soup)
    # The helpers that take `url` issue their own paginated requests.
    star_lst = get_stars(url)
    watcher_lst = get_watchers(url)
    forks_lst = get_forks(url)
    languages_lst = get_langauges(soup)
    # An empty token is passed, so get_contributors sends no Authorization header.
    contributors_lst = get_contributors(author, repo_name, '')
    issues_lst = get_issues(url)
    pr_lst = get_pull_requests(url)

    make_json(url, author, repo_name, about, topic_lst, star_lst, watcher_lst, forks_lst, languages_lst, contributors_lst, issues_lst, pr_lst)
    # Visual separator printed after each processed repository.
    print("=====================================")