In [1]:
import requests, json
import time

from threading import Thread
from dateutil import parser
from django.core.serializers.json import DjangoJSONEncoder

In [2]:
NAME = ['kernel', 'ozlabs', 'ffmpeg']
INVALID_PAGE = {
    "detail": "Invalid page."
}
MAX_THREAD = 6
PAGE_START = 0
PAGE_NUM = 250
TARGET_API_URL = 'http://127.0.0.1:8080/api/msr/%s'



TEST_ERROR = ['''{"original_id": ["people with this original id already exists."]}''',
             '''{"original_id": ["project with this original id already exists."]}''',
             '''{"original_id": ["series with this original id already exists."]}''',
             '''{"original_id": ["patch with this original id already exists."]}''',
             '''{"original_id": ["comment with this original id already exists."]}''']

In [3]:
# string = 'https://patchwork.ffmpeg.org/api/projects/'

# try:
#     found = re.search('https://patchwork\.(.*?)\.org', string).group(1)
#     print(found)
# except AttributeError:
#     print('error')

# 'A'.lower()

In [4]:
def retrieve_basic_info(json):
    
    info = ['id', 'url', 'name', 'web_url']
    result = list()
    
    for i in range(len(info)):
        try:
            result.append(json[info[i]])
        except KeyError:
            if info[i] == 'name':
                result.append(json['username'])
            elif info[i] == 'web_url':
                result.append(json['email'])
            else:
                result.append(None)
    return result

In [5]:
def post_data(entity_type, json_data):
    url = TARGET_API_URL %entity_type
    payload = json.dumps(json_data, cls=DjangoJSONEncoder)
    headers = {'Content-Type': 'application/json'}
    response = requests.request("POST", url, headers=headers, data=payload)
    
    if response.status_code != 201 and response.text not in TEST_ERROR:
        print(entity_type, response.text)
        # print(json_data)

In [6]:
def retrieve_people_data(endpoint_name, json_people):
    api_id, api_url, username, email = retrieve_basic_info(json_people)
    original_id = '-'.join([endpoint_name, 'people', str(api_id)])
    
    item_people = {
        'original_id': original_id,
        'email': email,
        'username': username,
        'api_url': api_url
    }
    
    post_data(entity_type='people', json_data=item_people)
    
    return original_id

In [7]:
def retrieve_project_data(endpoint_name, json_project):
    # print(f"retrieving project: {json_project['url']}")
    # project info
    project_api_id, project_api_url, project_name, project_web_url = retrieve_basic_info(json_project)
    
    project_repo_url = json_project['webscm_url']
    project_list_id = json_project['list_id']
    project_list_address = json_project['list_email']
    project_original_id = '-'.join([endpoint_name, 'project', str(project_api_id)])

    # maintainer_info
    maintainers = json_project['maintainers']
    maintainer_list = list()
    for maintainer in maintainers:
        maintainer_original_id = retrieve_people_data(endpoint_name, maintainer)
        maintainer_list.append(maintainer_original_id)

    #post project
    item_project = {
        'original_id': project_original_id,
        'name': project_name,
        'repo_url': project_repo_url,
        'api_url': project_api_url,
        'web_url': project_web_url,
        'list_id': project_list_id,
        'list_address': project_list_address,
        'maintainers': maintainer_list
    }
    
    post_data(entity_type='projects', json_data=item_project)

In [8]:
def retrieve_series_data(endpoint_name, json_series):
    # print(f"retrieving series: {json_series['url']}")
    # series info
    series_api_id, series_api_url, series_name, series_web_url = retrieve_basic_info(json_series)
    
    series_created_date = parser.parse(json_series['date'])
    series_version = json_series['version']
    series_total = json_series['total']
    series_received_total = json_series['received_total']
    
    series_original_id = '-'.join([endpoint_name, 'series', str(series_api_id)])
    
    series_project_api_id, _, series_project_name, _ = retrieve_basic_info(json_series['project'])
    project_original_id = '-'.join([endpoint_name, 'project', str(series_project_api_id)])

    #get cover letter content
    if json_series['cover_letter']:
        cover_letter_url = json_series['cover_letter']['url']
        cover_detail = requests.get(cover_letter_url).json()
        # series_cover_letter_content = deactivate_quote(cover_detail['content'])
        series_cover_letter_content = cover_detail['content']
    else:
        series_cover_letter_content = None
    
    # submitter info
    series_submitter_original_id = retrieve_people_data(endpoint_name, json_series['submitter'])
    
    #post series
    item_series = {
        'original_id': series_original_id,
        'name': series_name,
        'created_date': series_created_date,
        'version': series_version,
        'total': series_total,
        'received_total': series_received_total,
        'cover_letter_content': series_cover_letter_content,
        'web_url': series_web_url,
        'api_url': series_api_url,
        'project_original_id': project_original_id,
        'submitter_original_id': series_submitter_original_id,
    }
    
    post_data(entity_type='series', json_data=item_series)

In [9]:
def retrieve_comment_data(endpoint_name, json_comment, project_original_id, patch_original_id):
    # print(f"retrieving comment, id: {json_comment['web_url']}")
    #comment info
    comment_api_id = json_comment['id']
    comment_web_url = json_comment['web_url']
    comment_msg_id = json_comment['msgid']
    comment_msg_content = json_comment['content']
    comment_date = parser.parse(json_comment['date'])
    comment_subject = json_comment['subject']
    
    comment_original_id = '-'.join([endpoint_name, 'comment', str(comment_api_id)])
    
    comment_reply_to_msg_id = None
    if 'In-Reply-To' in json_comment['headers'].keys():
        in_reply_to = json_comment['headers']['In-Reply-To']
        if in_reply_to[:2] == '\n ':
            comment_reply_to_msg_id = in_reply_to[2:]

    # submitter info
    comment_submitter_original_id = retrieve_people_data(endpoint_name, json_comment['submitter'])
    
    # post comment
    item_comment = {
        'original_id': comment_original_id,
        'msg_id': comment_msg_id,
        'msg_content': comment_msg_content,
        'date': comment_date,
        'subject': comment_subject,
        'reply_to_msg_id': comment_reply_to_msg_id,
        'web_url': comment_web_url,
        'change_id': None,
        'mailing_list_id': None,
        'submitter_original_id': comment_submitter_original_id,
        'patch_original_id': patch_original_id,
        'project_original_id': project_original_id,
    }
    
    post_data(entity_type='comments', json_data=item_comment)

    # TODO get change id
    # TODO get mailing list id


In [10]:
def retrieve_patch_data(endpoint_name, json_patch):
    # print(f"retrieving patch {json_patch['url']}")
    #patch info
    patch_api_id, patch_api_url, patch_name, patch_web_url = retrieve_basic_info(json_patch)
    
    patch_state = json_patch['state']
    patch_date = parser.parse(json_patch['date'])
    patch_msg_id = json_patch['msgid']
    patch_msg_content = json_patch['content']
    patch_code_diff = json_patch['diff']
    
    patch_original_id = '-'.join([endpoint_name, 'patch', str(patch_api_id)])
    patch_project_original_id = '-'.join([endpoint_name, 'project', str(json_patch['project']['id'])])

    # TODO get change id
    # TODO get mailing list id

    # get series original id
    if json_patch['series']:
        patch_series_api_id = json_patch['series']['id']
        patch_series_original_id = '-'.join([endpoint_name, 'series', str(patch_series_api_id)])
    else:
        patch_series_original_id = None

    # submitter info
    patch_submitter_original_id = retrieve_people_data(endpoint_name, json_patch['submitter'])
    
    item_patch = {
        'original_id': patch_original_id,
        'name': patch_name,
        'state': patch_state,
        'date': patch_date,
        'msg_id': patch_msg_id,
        'msg_content': patch_msg_content,
        'code_diff': patch_code_diff,
        'api_url': patch_api_url,
        'web_url': patch_web_url,
        'change_id': None,
        'mailing_list_id': None,
        'series_original_id': patch_series_original_id,
        'submitter_original_id': patch_submitter_original_id,
        'project_original_id': patch_project_original_id,
    }
    
    post_data(entity_type='patches', json_data=item_patch)

    comment_url = json_patch['comments']
    comment_list = requests.get(comment_url).json()
    if comment_list:
        for c in comment_list:
            retrieve_comment_data(endpoint_name, c, patch_project_original_id, patch_original_id)



In [11]:
def main_func(api_url_base, endpoint_name, entity_type, thread_no):
    page_num = PAGE_START + thread_no
    response = requests.get(api_url_base %(endpoint_name, entity_type, page_num)).json()

    retrieval_func = {
        'projects': retrieve_project_data,
        'series': retrieve_series_data,
        'patches': retrieve_patch_data
    }

    while response != INVALID_PAGE and page_num <= PAGE_START + PAGE_NUM:
        # print('%s: page%d started' %entity_type)
        p_start_time = time.time()

        for entity in response:
            entity_api_url = entity['url']
            entity_detail = requests.get(entity_api_url).json()

            retrieval_func[entity_type](endpoint_name, entity_detail)
        
        total_time = time.time() - p_start_time
        print('%s:\tpage%d\tcompleted in %.2f s' %(entity_type, page_num, total_time))

        page_num += MAX_THREAD
        response = requests.get(api_url_base %(endpoint_name, entity_type, page_num)).json()

def crawl_entity(api_url_base, endpoint_name, entity_type):
    
    threads = [Thread(target=main_func, args=(api_url_base, endpoint_name, entity_type, thread_no)) for thread_no in range(1, MAX_THREAD + 1)]
    for thread in threads:
        thread.start()
    
    for thread in threads:
        thread.join()

In [12]:
def crawl_data(endpoint_name):
    start_time = time.time()
    api_url_base = 'https://patchwork.%s.org/api/%s/?page=%d'
    entity_types = ['projects', 'series', 'patches']
    # entity_types = ['series']
    
    [crawl_entity(api_url_base, endpoint_name, entity_type) for entity_type in entity_types]
    
    duration = (time.time() - start_time) / 60
    print('Retrieval completed in %.2f min' %duration)

In [13]:
crawl_data(NAME[2])

projects:	page1	completed in 14.17 s
series:	page2	completed in 118.51 s
series:	page3	completed in 121.63 s
series:	page4	completed in 123.31 s
series:	page6	completed in 124.28 s
series:	page1	completed in 127.54 s
series:	page5	completed in 127.71 s
series:	page8	completed in 140.72 s
series:	page10	completed in 139.96 s
series:	page11	completed in 135.55 s
series:	page7	completed in 140.01 s
series:	page9	completed in 145.46 s
series:	page12	completed in 147.24 s
series:	page14	completed in 141.91 s
series:	page17	completed in 139.18 s
series:	page15	completed in 137.95 s
series:	page13	completed in 145.58 s
series:	page16	completed in 149.84 s
series:	page18	completed in 146.50 s
series:	page20	completed in 121.72 s
series:	page23	completed in 123.63 s
series:	page21	completed in 126.60 s
series:	page19	completed in 121.32 s
series:	page22	completed in 134.78 s
series:	page24	completed in 133.16 s
series:	page26	completed in 120.78 s
series:	page25	completed in 118.76 s
series:	pa

    return self.read(nbytes, buffer)
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\ssl.py", line 1099, in read
    return self._sslobj.read(len, buffer)
ConnectionResetError: [WinError 10054] 远程主机强迫关闭了一个现有的连接。

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\requests\adapters.py", line 489, in send
    resp = conn.urlopen(
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\connectionpool.py", line 787, in urlopen
    retries = retries.increment(
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\util\retry.py", line 550, in increment
    raise six.reraise(type(error), error, _stacktrace)
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\packages\six.py", line 769, in reraise
    raise value.with_traceback(tb)
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\connectio

comments DatabaseError at /api/msr/comments
No exception message supplied

Request Method: POST
Request URL: http://127.0.0.1:8080/api/msr/comments
Django Version: 2.2.5
Python Executable: C:\Users\Timothy\anaconda3\envs\MSR_MDB\python.exe
Python Version: 3.8.13
Python Path: ['C:\\Users\\Timothy\\DjangoRestApiMongoDB', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\python38.zip', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\DLLs', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\lib', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\lib\\site-packages', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\lib\\site-packages\\win32', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\lib\\site-packages\\win32\\lib', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\lib\\site-packages\\Pythonwin']
Server time: Tue, 23 Aug 2022 12:43:15 +0000
Installed Applications:
['django.contrib.admin',
 'django.contrib.auth',
 'django.contrib.contenttypes',
 'django.c

Exception in thread Thread-18:
Traceback (most recent call last):
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen
Exception in thread Exception in thread Thread-17:
Traceback (most recent call last):
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen
Thread-19:
Traceback (most recent call last):
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\connectionpool.py", line 386, in _make_request
    httplib_response = self._make_request(
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\connectionpool.py", line 386, in _make_request
        self._validate_conn(conn)httplib_response = self._make_request(
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-

people DatabaseError at /api/msr/people
No exception message supplied

Request Method: POST
Request URL: http://127.0.0.1:8080/api/msr/people
Django Version: 2.2.5
Python Executable: C:\Users\Timothy\anaconda3\envs\MSR_MDB\python.exe
Python Version: 3.8.13
Python Path: ['C:\\Users\\Timothy\\DjangoRestApiMongoDB', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\python38.zip', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\DLLs', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\lib', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\lib\\site-packages', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\lib\\site-packages\\win32', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\lib\\site-packages\\win32\\lib', 'C:\\Users\\Timothy\\anaconda3\\envs\\MSR_MDB\\lib\\site-packages\\Pythonwin']
Server time: Tue, 23 Aug 2022 12:45:17 +0000
Installed Applications:
['django.contrib.admin',
 'django.contrib.auth',
 'django.contrib.contenttypes',
 'django.contrib

Exception in thread Thread-20:
Traceback (most recent call last):
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\connectionpool.py", line 449, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\site-packages\urllib3\connectionpool.py", line 444, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\http\client.py", line 1348, in getresponse
    response.begin()
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\http\client.py", line 316, in begin
    version, status, reason = self._read_status()
  File "C:\Users\Timothy\anaconda3\envs\MSR_MDB\lib\http\client.py", line 277, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Us

Retrieval completed in 425.36 min
