In [1]:
import getpass
import os
import re
import time
from threading import Thread
from urllib.parse import quote_plus

import pymongo
import requests
from dateutil import parser
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError, CollectionInvalid

In [2]:
# Patchwork instance names; the chosen one is substituted into
# https://patchwork.<name>.org when the API URL is built.
NAME = ['kernel', 'ozlabs', 'ffmpeg']

# API list endpoints that hold the entities of interest.
CATEGORY = ['projects', 'series', 'patches']

# Payload compared against a page response to detect a request past the last page.
INVALID_PAGE = dict(detail="Invalid page.")

In [3]:
def get_database(username, password, database_name):
    """Connect to the MongoDB Atlas cluster and return a database handle.

    Args:
        username: Database user name.
        password: Password belonging to ``username``.
        database_name: Name of the database to select on the client.

    Returns:
        The ``client[database_name]`` database object.
    """
    # Credentials are percent-escaped before being placed in the URI:
    # characters such as '@', ':' or '/' would otherwise be read as URI
    # delimiters and corrupt the connection string.
    connection_string = (
        f"mongodb+srv://{quote_plus(username)}:{quote_plus(password)}"
        "@cluster0.hls0ye8.mongodb.net/?retryWrites=true&w=majority"
    )

    client = MongoClient(connection_string)
    return client[database_name]

In [4]:
# Connection settings are taken from the environment so that the password is
# never stored in the notebook. When MONGODB_PASSWORD is not set the user is
# prompted for it instead.
username = os.environ.get('MONGODB_USERNAME', 'default')
database_name = os.environ.get('MONGODB_DATABASE', 'code_review')
password = os.environ.get('MONGODB_PASSWORD') or getpass.getpass('MongoDB password: ')
db = get_database(username, password, database_name)

In [5]:
# drop collections
# try:
#     db['project'].drop()
#     db['series'].drop()
#     db['patch'].drop()
#     db['comment'].drop()
#     db['account'].drop()
# except:
#     pass

In [6]:
# Bind a handle for every collection, creating the ones that do not exist yet.
# create_collection raises CollectionInvalid for a collection that already
# exists, so each name is handled on its own and falls back to the existing
# collection. This keeps all five handles defined when the notebook is re-run
# against a database that is already populated.
collection_handles = {}
for collection_name in ('project', 'series', 'patch', 'comment', 'account'):
    try:
        collection_handles[collection_name] = db.create_collection(collection_name)
    except CollectionInvalid:
        collection_handles[collection_name] = db[collection_name]

project = collection_handles['project']
series = collection_handles['series']
patch = collection_handles['patch']
comment = collection_handles['comment']
account = collection_handles['account']
db.list_collection_names()

['account', 'comment', 'project', 'patch', 'series']

In [7]:
# Put a unique ascending index on original_id in every collection.
# The cell displays the index name returned by the last create_index call.
index_names = [
    collection.create_index([("original_id", pymongo.ASCENDING)], unique=True)
    for collection in (project, series, patch, comment, account)
]
index_names[-1]

'original_id_1'

In [8]:
def collection_insert_one(collection, item):
    """Insert ``item`` into ``collection``, ignoring duplicate-key rejections.

    Args:
        collection: Object exposing ``insert_one``.
        item: Document to insert.

    Returns:
        None, whether the document was inserted or rejected as a duplicate.
    """
    try:
        collection.insert_one(item)
    except DuplicateKeyError:
        # A duplicate is not an error for the caller: nothing is inserted.
        pass

In [9]:
# t_url = 'https://patchwork.ffmpeg.org/api/patches/17384/'
# response = requests.get(t_url).json()
# response

In [10]:
# api_id, api_url, name, web_url = retrieve_basic_info(response['submitter'])
# print(api_id, api_url, name, web_url)

In [11]:
def retrieve_basic_info(json):
    """Extract the ``id``, ``url``, ``name`` and ``web_url`` entries of a payload.

    Args:
        json: Mapping to read from (an API payload or one of its sub-objects).

    Returns:
        A four-element list ``[id, url, name, web_url]``; an entry is ``None``
        when the corresponding key is missing.
    """
    values = []
    for key in ('id', 'url', 'name', 'web_url'):
        try:
            value = json[key]
        except KeyError:
            value = None
        values.append(value)
    return values

In [12]:
# string = 'https://patchwork.ffmpeg.org/api/projects/'

# try:
#     found = re.search('https://patchwork\.(.*?)\.org', string).group(1)
#     print(found)
# except AttributeError:
#     print('error')

# 'A'.lower()

In [13]:
def retrieve_project_data(endpoint_name, json_project, database):
    """Store one Patchwork project and the accounts of its maintainers.

    Args:
        endpoint_name: Patchwork instance name; used as the prefix of every
            generated ``original_id``.
        json_project: Project payload (a dict) as returned by the API.
        database: Database handle providing the ``'account'`` and
            ``'project'`` collections.

    Side effects:
        Inserts one account document per maintainer, then one project
        document that references the maintainers by ``original_id``.
    """
    api_id, api_url, name, web_url = retrieve_basic_info(json_project)

    repo_url = json_project['webscm_url']
    list_id = json_project['list_id']
    list_address = json_project['list_email']
    project_key = '-'.join((endpoint_name, 'project', str(api_id)))

    maintainer_keys = []
    for person in json_project['maintainers']:
        person_api_id, person_api_url, _, _ = retrieve_basic_info(person)
        person_username = person['username']
        person_email = person['email']
        person_key = '-'.join((endpoint_name, 'people', str(person_api_id)))
        maintainer_keys.append(person_key)

        collection_insert_one(database['account'], {
            'original_id': person_key,
            'email': person_email,
            'username': person_username,
            'api_url': person_api_url
        })

    collection_insert_one(database['project'], {
        'original_id': project_key,
        'name': name,
        'repo_url': repo_url,
        'list_id': list_id,
        'list_address': list_address,
        'web_url': web_url,
        'api_url': api_url,
        'maintainers': maintainer_keys
    })


In [14]:
def retrieve_series_data(endpoint_name, json_series, database):
    """Store one Patchwork series together with its submitter account.

    Args:
        endpoint_name: Patchwork instance name; used as the prefix of every
            generated ``original_id``.
        json_series: Series detail payload (a dict) as returned by the API.
        database: Database handle providing the ``'account'`` and
            ``'series'`` collections.

    Side effects:
        Downloads the cover letter when the series has one, then inserts an
        account document followed by a series document.
    """
    api_id, api_url, name, web_url = retrieve_basic_info(json_series)
    created_date = parser.parse(json_series['date'])
    version = json_series['version']
    total = json_series['total']
    received_total = json_series['received_total']

    # Only the API id of the owning project is needed to build the reference.
    project_api_id, _, _, _ = retrieve_basic_info(json_series['project'])
    project_original_id = '-'.join((endpoint_name, 'project', str(project_api_id)))
    series_original_id = '-'.join((endpoint_name, 'series', str(api_id)))

    # The series payload only links to its cover letter; the text itself is
    # read from the cover letter's own URL.
    cover_letter = json_series['cover_letter']
    if cover_letter:
        cover_letter_content = requests.get(cover_letter['url']).json()['content']
    else:
        cover_letter_content = ''

    submitter = json_series['submitter']
    submitter_api_id, submitter_api_url, _, _ = retrieve_basic_info(submitter)
    submitter_original_id = '-'.join((endpoint_name, 'people', str(submitter_api_id)))
    submitter_email = submitter['email']
    submitter_name = submitter['name']

    collection_insert_one(database['account'], {
        'original_id': submitter_original_id,
        'email': submitter_email,
        'username': submitter_name,
        'api_url': submitter_api_url
    })

    collection_insert_one(database['series'], {
        'original_id': series_original_id,
        'name': name,
        'created_date': created_date,
        'version': version,
        'total': total,
        'received_total': received_total,
        'cover_letter_content': cover_letter_content,
        'project_original_id': project_original_id,
        'submitter_account_original_id': submitter_original_id,
        'web_url': web_url,
        'api_url': api_url
    })

In [15]:
def retrieve_comment_data(endpoint_name, json_comment, project_original_id, patch_original_id, database):
    """Store one patch comment together with its submitter account.

    Args:
        endpoint_name: Patchwork instance name; used as the prefix of every
            generated ``original_id``.
        json_comment: Comment payload (a dict) as returned by the API.
        project_original_id: ``original_id`` of the project the patch belongs to.
        patch_original_id: ``original_id`` of the commented patch.
        database: Database handle providing the ``'account'`` and
            ``'comment'`` collections.

    Side effects:
        Inserts an account document for the submitter, then a comment document.
    """
    comment_api_id = json_comment['id']
    comment_original_id = '-'.join([endpoint_name, 'comment', str(comment_api_id)])
    comment_web_url = json_comment['web_url']
    comment_msg_id = json_comment['msgid']
    comment_msg_content = json_comment['content']
    comment_date = parser.parse(json_comment['date'])
    comment_subject = json_comment['subject']

    # The In-Reply-To value may arrive folded (with a leading '\n ') or on a
    # single line; surrounding whitespace is stripped in both cases so the id
    # is recorded either way. A missing or non-string header yields ''.
    comment_reply_to_msg_id = ''
    in_reply_to = json_comment['headers'].get('In-Reply-To')
    if isinstance(in_reply_to, str):
        comment_reply_to_msg_id = in_reply_to.strip()

    # submitter account; inserting is a no-op when the account is already stored
    comment_submitter_api_id, comment_submitter_api_url, _, _ = retrieve_basic_info(json_comment['submitter'])
    comment_submitter_original_id = '-'.join([endpoint_name, 'people', str(comment_submitter_api_id)])
    comment_submitter_username = json_comment['submitter']['name']
    comment_submitter_email = json_comment['submitter']['email']

    item_account = {
        'original_id': comment_submitter_original_id,
        'email': comment_submitter_email,
        'username': comment_submitter_username,
        'api_url': comment_submitter_api_url
    }

    collection_insert_one(database['account'], item_account)

    item_comment = {
        'original_id': comment_original_id,
        'msg_id': comment_msg_id,
        'msg_content': comment_msg_content,
        'date': comment_date,
        'subject': comment_subject,
        'in_reply_to': comment_reply_to_msg_id,
        'project_original_id': project_original_id,
        'patch_original_id': patch_original_id,
        'submitter_account_original_id': comment_submitter_original_id,
        'change_id': '',
        'mailing_list_id': '',
        'web_url': comment_web_url
    }

    collection_insert_one(database['comment'], item_comment)

    # TODO get change id
    # TODO get mailing list id


In [16]:
def retrieve_patch_data(endpoint_name, json_patch, database):
    """Store one Patchwork patch, its submitter account and all of its comments.

    Args:
        endpoint_name: Patchwork instance name; used as the prefix of every
            generated ``original_id``.
        json_patch: Patch detail payload (a dict) as returned by the API.
        database: Database handle providing the ``'account'``, ``'patch'``
            and ``'comment'`` collections.

    Side effects:
        Inserts an account document and a patch document, then requests the
        patch's comment list page by page and stores every comment through
        ``retrieve_comment_data``.
    """
    patch_api_id, patch_api_url, patch_name, patch_web_url = retrieve_basic_info(json_patch)
    patch_project_original_id = '-'.join([endpoint_name, 'project', str(json_patch['project']['id'])])
    patch_original_id = '-'.join([endpoint_name, 'patch', str(patch_api_id)])

    patch_state = json_patch['state']
    patch_date = parser.parse(json_patch['date'])
    patch_msg_id = json_patch['msgid']
    patch_msg_content = json_patch['content']
    patch_code_diff = json_patch['diff']

    # TODO get change id
    # TODO get mailing list id

    # a patch without a series is stored with an empty series reference
    if json_patch['series']:
        patch_series_api_id = json_patch['series']['id']
        patch_series_original_id = '-'.join([endpoint_name, 'series', str(patch_series_api_id)])
    else:
        patch_series_original_id = ''

    # submitter info
    patch_submitter_api_id, patch_submitter_api_url, _, _ = retrieve_basic_info(json_patch['submitter'])
    patch_submitter_original_id = '-'.join([endpoint_name, 'people', str(patch_submitter_api_id)])
    patch_submitter_username = json_patch['submitter']['name']
    patch_submitter_email = json_patch['submitter']['email']

    item_account = {
        'original_id': patch_submitter_original_id,
        'email': patch_submitter_email,
        'username': patch_submitter_username,
        'api_url': patch_submitter_api_url
    }

    collection_insert_one(database['account'], item_account)

    item_patch = {
        'original_id': patch_original_id,
        'name': patch_name,
        'state': patch_state,
        'date': patch_date,
        'msg_id': patch_msg_id,
        'msg_content': patch_msg_content,
        'code_diff': patch_code_diff,
        'project_original_id': patch_project_original_id,
        'series_original_id': patch_series_original_id,
        'submitter_account_original_id': patch_submitter_original_id,
        'change_id': '',
        'mailing_list_id': '',
        'api_url': patch_api_url,
        'web_url': patch_web_url
    }

    collection_insert_one(database['patch'], item_patch)

    # The comment list can span several pages. Each response advertises the
    # following page through the "next" relation of its Link header, which
    # requests exposes as response.links; keep going until there is none.
    comment_url = json_patch['comments']
    while comment_url:
        comment_response = requests.get(comment_url)
        comment_page = comment_response.json()
        if not isinstance(comment_page, list):
            # Anything but a list is an error payload, not a page of comments.
            break
        for c in comment_page:
            retrieve_comment_data(endpoint_name, c, patch_project_original_id, patch_original_id, database)
        comment_url = comment_response.links.get('next', {}).get('url')



In [17]:
# def retrieved_items(collection):
#     items = collection.find()
#     return [item['original_id'] for item in items]

In [18]:
MAX_THREAD = 6    # number of worker threads, and the page stride of each worker
PAGE_START = 0    # offset added to a worker's number to get its first page
BATCH = 250       # a worker stops once its page number exceeds PAGE_START + BATCH

def main_func(api_url_base, endpoint_name, database, entity_type, thread_no):
    """Crawl every MAX_THREAD-th list page of one entity type.

    The worker starts at page ``PAGE_START + thread_no`` and advances by
    ``MAX_THREAD`` pages at a time, so workers numbered 1..MAX_THREAD cover
    disjoint pages. For every entity on a page its detail URL is fetched and
    handed to the retrieval function matching ``entity_type``.

    Args:
        api_url_base: ``%``-format string taking (endpoint name, entity type,
            page number).
        endpoint_name: Patchwork instance name.
        database: Database handle passed through to the retrieval function.
        entity_type: One of ``'projects'``, ``'series'`` or ``'patches'``.
        thread_no: Number of this worker.

    Raises:
        KeyError: If ``entity_type`` has no retrieval function.
    """
    retrieval_func = {
        'projects': retrieve_project_data,
        'series': retrieve_series_data,
        'patches': retrieve_patch_data
    }
    retrieve_entity = retrieval_func[entity_type]

    page_num = PAGE_START + thread_no
    while page_num <= PAGE_START + BATCH:
        response = requests.get(api_url_base %(endpoint_name, entity_type, page_num)).json()

        # A page is a list of entities. Any other payload is an error object:
        # INVALID_PAGE marks the regular end of the listing, anything else is
        # reported. Either way it must not be iterated as if it were a page.
        if not isinstance(response, list):
            if response != INVALID_PAGE:
                print('%s:\tpage%d\tstopped on unexpected response: %r' %(entity_type, page_num, response))
            break

        p_start_time = time.time()

        for entity in response:
            entity_detail = requests.get(entity['url']).json()
            retrieve_entity(endpoint_name, entity_detail, database)

        total_time = time.time() - p_start_time
        print('%s:\tpage%d\tcompleted in %.2f s' %(entity_type, page_num, total_time))

        page_num += MAX_THREAD

def crawl_entity(api_url_base, endpoint_name, database, entity_type):
    """Crawl one entity type with MAX_THREAD worker threads.

    One thread running ``main_func`` is created for each worker number
    1..MAX_THREAD; all threads are started and the call returns once every
    one of them has finished.

    Args:
        api_url_base: Format string for list-page URLs, passed to ``main_func``.
        endpoint_name: Patchwork instance name.
        database: Database handle passed to ``main_func``.
        entity_type: Entity type to crawl.
    """
    workers = []
    for thread_no in range(1, MAX_THREAD + 1):
        worker_args = (api_url_base, endpoint_name, database, entity_type, thread_no)
        workers.append(Thread(target=main_func, args=worker_args))

    for worker in workers:
        worker.start()

    # Block until every worker is done.
    for worker in workers:
        worker.join()

In [19]:
def crawl_data(endpoint_name, database):
    """Crawl projects, series and patches of one Patchwork instance, in that order.

    Each entity type is crawled to completion before the next one starts;
    the total duration is printed in minutes at the end.

    Args:
        endpoint_name: Patchwork instance name, substituted into the API URL.
        database: Database handle the crawled documents are written to.
    """
    started_at = time.time()
    api_url_base = 'https://patchwork.%s.org/api/%s/?page=%d'

    for entity_type in ('projects', 'series', 'patches'):
        crawl_entity(api_url_base, endpoint_name, database, entity_type)

    elapsed_minutes = (time.time() - started_at) / 60
    print('Retrieval completed in %.2f min' %elapsed_minutes)

In [20]:
# Crawl the instance named by NAME[2] ('ffmpeg') into the connected database.
# crawl_data walks all entity types itself, so no loop over CATEGORY is needed.
crawl_data(NAME[2], db)

projects:	page1	completed in 1.98 s
series:	page2	completed in 46.66 s
series:	page3	completed in 46.24 s
series:	page1	completed in 50.59 s
series:	page5	completed in 50.44 s
series:	page6	completed in 50.91 s
series:	page4	completed in 53.37 s
series:	page8	completed in 44.04 s
series:	page9	completed in 46.59 s
series:	page11	completed in 44.15 s
series:	page12	completed in 45.35 s
series:	page7	completed in 46.89 s
series:	page10	completed in 46.70 s
series:	page14	completed in 47.07 s
series:	page15	completed in 44.56 s
series:	page18	completed in 44.04 s
series:	page13	completed in 48.50 s
series:	page17	completed in 50.92 s
series:	page16	completed in 50.07 s
series:	page20	completed in 49.24 s
series:	page21	completed in 48.91 s
series:	page24	completed in 47.49 s
series:	page19	completed in 47.51 s
series:	page23	completed in 48.16 s
series:	page22	completed in 49.58 s
series:	page26	completed in 49.36 s
series:	page30	completed in 47.97 s
series:	page27	completed in 49.17 s
s