In [1]:
import pymongo
import requests

from pymongo import MongoClient
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

from dateutil import parser

In [2]:
# Patchwork instances that can be crawled; crawl_data substitutes an entry
# into the host name https://patchwork.<name>.org
NAME = ['kernel', 'ozlabs', 'ffmpeg']
# REST API resource types; the driver loop crawls them in this order
CATEGORY = ['projects', 'series', 'patches']
# Response body that crawl_data treats as the end-of-pagination signal
INVALID_PAGE = {
    "detail": "Invalid page."
}

In [3]:
def get_database(connection_string=None):
    """Return a handle to the ``code_review`` MongoDB database.

    The connection string is resolved in this order:

    1. the explicit ``connection_string`` argument,
    2. the ``MONGODB_URI`` environment variable (ignored when empty),
    3. the legacy URI that used to be the only option.

    Args:
        connection_string: Optional MongoDB URI. ``None`` (the default)
            falls back to the environment variable, then to the legacy URI.

    Returns:
        The ``code_review`` database of a newly created ``MongoClient``.
    """
    # Function-scope import so this cell works without touching the import cell.
    import os

    # SECURITY NOTE(review): this fallback embeds a username and password in
    # the notebook. It is kept only so existing runs keep working; rotate the
    # password, set MONGODB_URI instead, and then delete this literal.
    legacy_connection_string = "mongodb+srv://default:comp90055codereview@cluster0.hls0ye8.mongodb.net/?retryWrites=true&w=majority"

    if connection_string is None:
        connection_string = os.environ.get('MONGODB_URI') or legacy_connection_string

    # Create a connection using MongoClient (imported at the top of the notebook).
    client = MongoClient(connection_string)

    # Every collection used by this notebook lives in this one database.
    return client['code_review']

In [4]:
def deactivate_quote(text):
    """Backslash-escape every single or double quote that is not already escaped.

    A quote counts as already escaped when the character immediately before
    it in the input is a backslash. The first character has no predecessor,
    so a leading quote is always escaped (the previous index-based version
    looked at ``text[-1]`` for position 0 and could skip it).

    Args:
        text: The string to process; any falsy value (``None``, ``''``) is
            accepted.

    Returns:
        The escaped string, or ``''`` when ``text`` is falsy.
    """
    if not text:
        return ''

    escaped_chars = []
    previous_char = ''  # nothing precedes the first character
    for char in text:
        if char in ("'", '"') and previous_char != '\\':
            escaped_chars.append('\\' + char)
        else:
            escaped_chars.append(char)
        previous_char = char
    return ''.join(escaped_chars)

In [5]:
# Open the shared handle to the `code_review` database returned by
# get_database(); the following cells read and write through `db`.
db = get_database()

In [6]:
# Explicit collection creation, kept for reference only; the handles below
# are obtained by plain subscripting of `db` instead.
# project = db.create_collection('project')
# series = db.create_collection('series')
# patch = db.create_collection('patch')
# comment = db.create_collection('comment')
# account = db.create_collection('account')

# Module-level collection handles used by the crawl functions below.
project, series, patch, comment, account = (
    db[collection_name]
    for collection_name in ('project', 'series', 'patch', 'comment', 'account')
)

# Cell output: the collections that currently exist in the database.
db.list_collection_names()

['project', 'comment', 'patch', 'series', 'account']

In [7]:
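# One-time setup: a unique index on `original_id` for every collection.
# `collection_insert_one` skips records that raise DuplicateKeyError, so the
# de-duplication of accounts, comments, etc. depends on these indexes existing.
# Left commented out (presumably because the indexes were already created);
# on a fresh database, run these lines once before crawling.
# project.create_index([("original_id", pymongo.ASCENDING)],unique=True)
# series.create_index([("original_id", pymongo.ASCENDING)],unique=True)
# patch.create_index([("original_id", pymongo.ASCENDING)],unique=True)
# comment.create_index([("original_id", pymongo.ASCENDING)],unique=True)
# account.create_index([("original_id", pymongo.ASCENDING)],unique=True)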

In [8]:
def collection_insert_one(collection, item):
    """Insert ``item`` into ``collection``, ignoring duplicate-key rejections.

    Args:
        collection: Object exposing ``insert_one(item)``.
        item: The document to insert.

    Returns:
        None, whether the document was inserted or rejected as a duplicate.
        Any exception other than ``DuplicateKeyError`` propagates.
    """
    try:
        collection.insert_one(item)
    except DuplicateKeyError:
        # The record is already stored; skipping it is the intended outcome.
        pass
    return None

In [9]:
def crawl_project_data(json_project, col_account, col_project):
    """Store one project and its maintainers.

    Each maintainer is inserted into ``col_account`` first; the project
    document, which lists the maintainers' ids, is inserted into
    ``col_project`` last. Ids are the API URLs without their final character
    (the trailing slash). Text fields pass through ``deactivate_quote``.

    Args:
        json_project: Project detail payload (a dict) from the API.
        col_account: Collection receiving the maintainer accounts.
        col_project: Collection receiving the project document.
    """
    print(f"retrieving project: {json_project['url']}")

    # Project fields; the maintainer ids are filled in by the loop below.
    project_doc = {
        'original_id': json_project['url'][:-1],
        'name': deactivate_quote(json_project['name']),
        'repo_url': deactivate_quote(json_project['webscm_url']),
        'list_id': deactivate_quote(json_project['list_id']),
        'list_address': deactivate_quote(json_project['list_email']),
        'web_url': deactivate_quote(json_project['web_url']),
        'maintainers': [],
    }

    for person in json_project['maintainers']:
        person_id = person['url'][:-1]
        person_username = deactivate_quote(person['username'])
        person_email = deactivate_quote(person['email'])

        project_doc['maintainers'].append(person_id)

        collection_insert_one(col_account, {
            'original_id': person_id,
            'email': person_email,
            'username': person_username,
        })

    collection_insert_one(col_project, project_doc)


In [10]:
def crawl_series_data(json_series, col_account, col_series):
    """Store one series and the account of its submitter.

    When the series has a cover letter, its content is fetched with an extra
    HTTP GET on the cover letter's URL; otherwise the content is stored as
    ``''``. The submitter account is inserted before the series document.
    Ids are the API URLs without their final character (the trailing slash).

    Args:
        json_series: Series detail payload (a dict) from the API.
        col_account: Collection receiving the submitter account.
        col_series: Collection receiving the series document.
    """
    print(f"retrieving series: {json_series['url']}")

    # Fields taken directly from the series payload.
    series_doc = {
        'original_id': json_series['url'][:-1],
        'name': deactivate_quote(json_series['name']),
        'created_date': parser.parse(json_series['date']),
        'version': json_series['version'],
        'total': json_series['total'],
        'received_total': json_series['received_total'],
    }

    # The payload only references the cover letter, so fetch its content.
    cover_text = ''
    if json_series['cover_letter']:
        cover_payload = requests.get(json_series['cover_letter']['url']).json()
        cover_text = deactivate_quote(cover_payload['content'])
    series_doc['cover_letter_content'] = cover_text

    # Link back to the owning project.
    series_doc['project_original_id'] = json_series['project']['url'][:-1]

    # Submitter account.
    submitter = json_series['submitter']
    submitter_id = submitter['url'][:-1]
    account_doc = {
        'original_id': submitter_id,
        'email': deactivate_quote(submitter['email']),
        'username': deactivate_quote(submitter['name']),
    }
    series_doc['submitter_account_original_id'] = submitter_id

    collection_insert_one(col_account, account_doc)
    collection_insert_one(col_series, series_doc)

In [11]:
def crawl_comment_data(json_comment, patch_original_id, col_comment, col_account):
    """Store one comment and the account of its submitter.

    The submitter account is inserted into ``col_account`` first, then the
    comment document into ``col_comment``. Text fields pass through
    ``deactivate_quote``. ``change_id`` and ``mailing_list_id`` are stored as
    empty strings (not implemented yet).

    Args:
        json_comment: Comment payload (a dict) from the API; must contain a
            ``headers`` dict.
        patch_original_id: Id of the patch this comment belongs to.
        col_comment: Collection receiving the comment document.
        col_account: Collection receiving the submitter account.
    """
    print(f"retrieving comment, id: {json_comment['web_url']}")
    # comment info
    # NOTE: unlike the other ids in this notebook, the comment id is the
    # web URL kept verbatim (no trailing character removed).
    comment_original_id = json_comment['web_url']
    comment_msg_id = json_comment['msgid']
    comment_msg_content = deactivate_quote(json_comment['content'])
    comment_date = parser.parse(json_comment['date'])
    comment_subject = deactivate_quote(json_comment['subject'])

    # The header value may or may not carry a leading '\n ' prefix. Strip
    # surrounding whitespace in either case so the reply link is never lost
    # (previously any value without that exact prefix was stored as '').
    # Non-string values keep the old '' result.
    comment_reply_to_msg_id = ''
    in_reply_to = json_comment['headers'].get('In-Reply-To')
    if isinstance(in_reply_to, str):
        comment_reply_to_msg_id = in_reply_to.strip()

    # get submitter account id
    # insert account if not exist
    comment_submitter_original_id = json_comment['submitter']['url'][:-1]
    comment_submitter_username = deactivate_quote(json_comment['submitter']['name'])
    comment_submitter_email = deactivate_quote(json_comment['submitter']['email'])

    item_account = {
        'original_id': comment_submitter_original_id,
        'email': comment_submitter_email,
        'username': comment_submitter_username
    }

    collection_insert_one(col_account, item_account)

    item_comment = {
        'original_id': comment_original_id,
        'msg_id': comment_msg_id,
        'msg_content': comment_msg_content,
        'date': comment_date,
        'subject': comment_subject,
        'in_reply_to': comment_reply_to_msg_id,
        'patch_original_id': patch_original_id,
        'submitter_account_original_id': comment_submitter_original_id,
        'change_id': '',
        'mailing_list_id': ''
    }

    collection_insert_one(col_comment, item_comment)

    # TODO get change id
    # TODO get mailing list id


In [12]:
def crawl_patch_date(json_patch, col_account, col_patch, col_comment):
    """Store one patch, its submitter account, and the patch's comments.

    The function name is a historical spelling ("date" for "data"); it is
    kept because ``crawl_data`` calls it under this name.

    Steps: insert the submitter account, insert the patch document, then
    fetch the patch's comment list with one HTTP GET and hand every comment
    to ``crawl_comment_data``. Only that single response is read; additional
    comment pages, if the API paginates, are not followed.

    Args:
        json_patch: Patch detail payload (a dict) from the API.
        col_account: Collection receiving submitter accounts.
        col_patch: Collection receiving the patch document.
        col_comment: Collection receiving the comment documents.
    """
    print(f"retrieving patch {json_patch['url']}")
    # patch info
    patch_original_id = json_patch['url'][:-1]
    patch_name = deactivate_quote(json_patch['name'])
    patch_state = deactivate_quote(json_patch['state'])
    patch_date = parser.parse(json_patch['date'])
    patch_msg_id = json_patch['msgid']
    patch_msg_content = deactivate_quote(json_patch['content'])
    patch_code_diff = deactivate_quote(json_patch['diff'])

    # TODO get change id
    # TODO get mailing list id

    # get series id
    # The Patchwork API delivers `series` on a patch as a list of series
    # objects; indexing that list with 'url' raised TypeError. Accept either
    # a list (first entry is used) or a single dict. -9999 remains the
    # "no series" sentinel so previously stored documents stay comparable.
    series_ref = json_patch['series']
    if isinstance(series_ref, (list, tuple)):
        series_ref = series_ref[0] if series_ref else None
    if series_ref:
        patch_series_original_id = series_ref['url'][:-1]
    else:
        patch_series_original_id = -9999

    # submitter info
    patch_submitter_original_id = json_patch['submitter']['url'][:-1]
    patch_submitter_username = deactivate_quote(json_patch['submitter']['name'])
    patch_submitter_email = deactivate_quote(json_patch['submitter']['email'])

    item_account = {
        'original_id': patch_submitter_original_id,
        'email': patch_submitter_email,
        'username': patch_submitter_username
    }

    collection_insert_one(col_account, item_account)

    item_patch = {
        'original_id': patch_original_id,
        'name': patch_name,
        'state': patch_state,
        'date': patch_date,
        'msg_id': patch_msg_id,
        'msg_content': patch_msg_content,
        'code_diff': patch_code_diff,
        'series_original_id': patch_series_original_id,
        'submitter_account_original_id': patch_submitter_original_id,
        'change_id': '',
        'mailing_list_id': ''
    }

    collection_insert_one(col_patch, item_patch)

    comment_url = json_patch['comments']
    comment_list = requests.get(comment_url).json()
    # An error response is a dict, not a list; iterating it would feed bare
    # key strings to crawl_comment_data, so only lists are processed.
    if isinstance(comment_list, list):
        for c in comment_list:
            crawl_comment_data(c, patch_original_id, col_comment, col_account)



In [13]:
def retrieved_items(collection):
    """Return the ``original_id`` of every document stored in ``collection``.

    Only the ``original_id`` field is requested from the server (via a
    projection), so large fields of the stored documents are not transferred
    just to be thrown away.

    Args:
        collection: Object exposing ``find(filter, projection)``.

    Returns:
        A list of ``original_id`` values, in the order the cursor yields them.

    Raises:
        KeyError: If a stored document has no ``original_id`` field.
    """
    id_only = {'original_id': 1, '_id': 0}
    return [item['original_id'] for item in collection.find({}, id_only)]

In [14]:
def crawl_data(name, category, max_pages=3):
    """Crawl one API category of one Patchwork instance into MongoDB.

    Pages ``1..max_pages`` of ``https://patchwork.<name>.org/api/<category>/``
    are requested in order. Each listed entity that is not already stored is
    fetched in detail and passed to the matching ``crawl_*`` function, which
    writes to the module-level collections.

    Paging stops early when the response equals ``INVALID_PAGE`` or is
    anything other than a list (for example a different error object).

    Args:
        name: Patchwork instance name, substituted into the host name.
        category: One of ``'projects'``, ``'series'``, ``'patches'``.
        max_pages: Highest page number to request; ``None`` removes the cap
            so paging continues until the API signals the end. Defaults to
            3, the limit that used to be hard-coded.

    Raises:
        ValueError: If ``category`` is not one of the supported values.
    """
    # category -> (collection holding stored entities, function that stores one)
    handlers = {
        'projects': (project, lambda detail: crawl_project_data(detail, account, project)),
        'series': (series, lambda detail: crawl_series_data(detail, account, series)),
        'patches': (patch, lambda detail: crawl_patch_date(detail, account, patch, comment)),
    }
    if category not in handlers:
        raise ValueError(f'unsupported category: {category!r}')
    collection, store_entity = handlers[category]

    platform_url = f'https://patchwork.{name}.org'
    api_url_base = f'{platform_url}/api/{category}/?page='

    # Only the ids of the category being crawled are needed; a set keeps the
    # membership checks below constant-time.
    known_ids = set(retrieved_items(collection))

    page_num = 1
    while max_pages is None or page_num <= max_pages:
        response = requests.get(f'{api_url_base}{page_num}').json()
        if response == INVALID_PAGE or not isinstance(response, list):
            break

        for entity in response:
            entity_api_url = entity['url']
            # Skip the detail request when the listed URL is already stored.
            if entity_api_url[:-1] in known_ids:
                continue

            entity_detail = requests.get(entity_api_url).json()
            # Ids are stored without the trailing slash of the API URL.
            entity_id = entity_detail['url'][:-1]
            if entity_id in known_ids:
                continue

            store_entity(entity_detail)
            # Remember it so a repeat listing in a later page is skipped.
            known_ids.add(entity_id)

        page_num += 1

In [15]:
# Crawl every category for the Patchwork instance NAME[2] ('ffmpeg'),
# in CATEGORY order: projects, then series, then patches.
for cate in CATEGORY:
    crawl_data(NAME[2], cate)

retrieving patch https://patchwork.ffmpeg.org/api/patches/6/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/10/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/12/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/13/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/15/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/16/
retrieving patch https://patchwork.ffmpeg.org/api/patches/7/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/14/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/19/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/24/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/43/
retrieving patch https://patchwork.ffmpeg.org/api/patches/8/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/124/
retrieving patch https://patchwork.ffmpeg.org/api/patches/9/
retrieving comment, id: https://patchwork.ffmpeg.org/comment/22/
retrieving comment, id: https://patchwor