In [181]:
import pandas as pd
import numpy as np
import json
import requests
from pathlib import Path

### Fetching the data by performing an HTTP call to the OpenDev resources

In [182]:
# Total number of change records to collect, and how many to ask for per call.
full_data_length = 10000
PAGE_SIZE = 500

# Collect into a plain list and convert once at the end; concatenating numpy
# arrays inside the loop re-copies everything fetched so far on each pass.
records = []
while len(records) < full_data_length:
    params = {
        'O': 81,
        # Skip the records already collected so each call returns the *next*
        # page. With a constant offset of 0 every call restarts at the first
        # change and the result is filled with repeated records.
        'S': len(records),
        'n': min(PAGE_SIZE, full_data_length - len(records)),
        'q': '-status:merged'
    }

    response = requests.get("https://review.opendev.org/changes/",
                            params=params, timeout=60)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    # The body starts with a ")]}'" guard line; drop it and parse everything
    # after it (not just the second line) so multi-line JSON also works.
    body = response.text
    if body.startswith(")]}'"):
        body = body.partition("\n")[2]
    page = json.loads(body)

    if not page:
        break
    records.extend(page)

    # The server flags the last record of a page with `_more_changes` when
    # further results exist; without the flag there is nothing left to fetch.
    if not page[-1].get('_more_changes'):
        break

# Keep the 1-D object array that downstream cells index and iterate over.
full_data = np.array(records, dtype=object)


### Make sure that the data has the same length as indicated in the URL parameters

In [183]:
# Sanity check: number of records collected (expected: full_data_length).
len(full_data)

10000

### A simple function that collects the column names (keys) from every record of the dataset; duplicates are removed afterwards by wrapping the result in `set`

In [184]:
def retierve_cols(data=None):
    """Collect the key names of every record in the dataset.

    This helper is optional and is not used by the extraction process itself.

    Args:
        data: Iterable of dict records. When omitted, the module-level
            ``full_data`` is used.

    Returns:
        A 1-D numpy array holding the keys of all records, concatenated in
        record order. Duplicates are kept; wrap the result in ``set`` to get
        the distinct column names.
    """
    if data is None:
        data = full_data

    # Iterate over the records themselves rather than a fixed count, so the
    # function works for any dataset length. Build a list and convert once:
    # stacking onto an array per record re-copies all previous names each time.
    names = [key for record in data for key in record.keys()]

    return np.array(names)

In [185]:
# Collapse the concatenated key names into the set of distinct column names.
columns = set(retierve_cols())

In [186]:
# Number of distinct column names found across all records.
len(columns)

31

### The original columns of the dataset

In [187]:
# Show the distinct column names (a set, so the display order is arbitrary).
print(columns)

{'owner', 'created', '_more_changes', 'project', 'change_id', 'total_comment_count', 'labels', 'cherry_pick_of_change', 'insertions', 'requirements', 'has_review_started', 'reviewers', 'status', 'branch', 'id', 'subject', 'removable_reviewers', 'hashtags', 'submit_records', 'unresolved_comment_count', 'cherry_pick_of_patch_set', 'meta_rev_id', 'deletions', 'submit_type', 'updated', 'mergeable', 'topic', 'pending_reviewers', 'work_in_progress', '_number', 'attention_set'}


### This utility function handles any given data schema, whether the structure is simple or complex. This way, it can easily be adapted to future schema changes.

In [188]:
def build_object(obj):
    """Flatten one (possibly nested) record into a single-level dict.

    Rules applied to each key/value pair of ``obj``:

    * ``int``, ``float``, ``str`` and ``bool`` values are copied unchanged.
    * ``list`` / ``tuple`` values are replaced by their length, stored under
      ``<key>_count``.
    * ``dict`` values are flattened recursively; each nested key is stored
      under ``<key>_<nested key>``.
    * Any other value (e.g. ``None``) is dropped.

    Doubled underscores in generated names are collapsed to one, so a nested
    key such as ``_account_id`` under ``owner`` becomes ``owner_account_id``.

    Args:
        obj: Mapping describing one record. It is not modified.

    Returns:
        A new flat dict built according to the rules above.
    """
    new_item = {}

    for key, value in obj.items():

        if isinstance(value, dict):
            # Prefix the nested keys, then flatten that sub-record on its own
            # and merge it in. Continue with the remaining keys afterwards:
            # returning here would discard every key that follows the first
            # nested dict.
            prefixed = {
                (key + "_" + sub_key).replace("__", "_"): sub_value
                for sub_key, sub_value in value.items()
            }
            new_item.update(build_object(prefixed))

        elif isinstance(value, (list, tuple)):
            new_item[(key + "_count").replace("__", "_")] = len(value)

        elif isinstance(value, (bool, int, float, str)):
            new_item[key] = value

    return new_item

### This function is designed to transform the original dataset to a well-structured one.

In [189]:
def process_data(data):
    """Run ``build_object`` over every record and return the results.

    Args:
        data: Iterable of raw records.

    Returns:
        A list with one transformed record per input record, in input order.
    """
    return [build_object(record) for record in data]

In [190]:
# Apply build_object to every fetched record.
clean_data = process_data(full_data)

### Passing the newly transformed dataset to pandas for further processing

In [191]:
# One row per record; the columns are the union of the keys of all records.
df = pd.DataFrame(clean_data)

### Save the data in a CSV file

In [192]:
# Output location, relative to the current working directory.
filepath = Path('./openstack_data.csv')  
# Create the parent directory if needed (no error when it already exists).
filepath.parent.mkdir(parents=True, exist_ok=True)  
# Write without the DataFrame index so the file contains only data columns.
df.to_csv(filepath, index=False)