In [1]:
import pandas as pd
import requests
import json
import pickle
from tqdm import tqdm
import os
import math

# GitHub credentials are read from the environment so that no secret is stored
# in the notebook. The personal access token that used to be hard-coded here
# has been exposed in source and should be revoked on GitHub.
token = os.environ.get("GITHUB_TOKEN", "")
username = os.environ.get("GITHUB_USERNAME", "anishapant21")
if not token:
    print("GITHUB_TOKEN is not set; export it before calling the GitHub API.")

# Issue-listing endpoints of the GitHub REST API for the two repositories.
TENSOR_API = "https://api.github.com/repos/tensorflow/tensorflow/issues"
PYTORCH_API = "https://api.github.com/repos/pytorch/pytorch/issues"

# Page counts at 100 records per page: the number of full pages plus one.
# NOTE(review): 56989 and 83769 look like record totals noted at collection
# time - confirm before re-running the extraction.
PYTORCH_DATA_LEN = 83769 // 100 + 1
TENSORFLOW_DATA_LEN = 56989 // 100 + 1

# Cut-off date (ISO format) used further down to filter issues by close date.
CHECK_DATE = "2021-02-15"

# Directory that receives the files written by this notebook.
DATA_PATH = "data"
os.makedirs(DATA_PATH, exist_ok=True)

def get_data(API, page_no=1, timeout=30):
    """Fetch one page of closed issues from a GitHub issues endpoint.

    Parameters
    ----------
    API : str
        URL of the issues endpoint to query.
    page_no : int, default 1
        Page to request; each page holds up to 100 records, sorted by
        creation date in ascending order.
    timeout : float, default 30
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (bad credentials, rate
        limiting, ...). Without this check the error payload would be
        returned as if it were a page of issues.
    """
    parameters = {"filter": "repos", "state": "closed", "sort": "created",
                  "direction": "asc", "per_page": 100, "page": page_no}
    req = requests.get(API, auth=(username, token), params=parameters,
                       timeout=timeout)
    # Fail loudly instead of handing an error document to the caller.
    req.raise_for_status()
    return req.json()


def handle_pagination(api_link, total_pages=100):
    """Download up to ``total_pages`` pages from ``api_link`` and concatenate them.

    Parameters
    ----------
    api_link : str
        Endpoint URL passed on to ``get_data``.
    total_pages : int, default 100
        Highest page number to request (pages are numbered from 1).

    Returns
    -------
    list
        The records of all downloaded pages, in request order.

    Raises
    ------
    RuntimeError
        If a page is not a list. ``list.extend`` on a dict would silently add
        its keys to the result, so such a payload is rejected instead.
    """
    data = []
    for page_no in tqdm(range(1, total_pages + 1)):
        page = get_data(api_link, page_no)
        if not isinstance(page, list):
            raise RuntimeError(
                f"Unexpected response for page {page_no} of {api_link}: {page!r}"
            )
        if not page:
            # An empty page means the listing is exhausted; later pages
            # cannot contain records either.
            break
        data.extend(page)
    return data


def save_data(data,filename):
    """Write ``data`` to ``DATA_PATH`` as ``<filename>.pkl`` and ``<filename>.json``.

    The pickle is written first, then the JSON file (indented by 4 spaces).
    Saving is best-effort: any exception is caught and reported with
    ``print`` rather than propagated, so a failure in the JSON step still
    leaves the pickle on disk.

    Parameters
    ----------
    data : object
        Object to store.
    filename : str
        File name without extension.
    """
    try:
        # Context managers make sure both handles are closed (and the pickle
        # flushed) even when serialisation fails.
        with open(os.path.join(DATA_PATH, filename + ".pkl"), "wb") as pickle_file:
            pickle.dump(data, pickle_file)
        with open(os.path.join(DATA_PATH, filename + ".json"), "w") as file:
            file.write(json.dumps(data, indent=4))

    except Exception as error:
        print("Saving Data Failed due to:",error)

In [17]:
# Fetch one page of TensorFlow issues and store its first record as a sample
data = get_data(API=TENSOR_API, page_no=570)
save_data(data=data[0], filename="sample")

In [2]:
# extract tensorflow issues
# The page count comes from the configuration constant instead of a repeated
# literal, so the two cannot drift apart.
data_tensorflow = handle_pagination(TENSOR_API, TENSORFLOW_DATA_LEN)
save_data(data=data_tensorflow, filename="raw_tensorflow")

100%|██████████| 570/570 [19:46<00:00,  2.08s/it]


In [3]:
# extract pytorch issues
# The page count comes from the configuration constant instead of a repeated
# literal, so the two cannot drift apart.
data_pytorch = handle_pagination(PYTORCH_API, PYTORCH_DATA_LEN)
save_data(data=data_pytorch, filename="raw_pytorch")

100%|██████████| 838/838 [26:26<00:00,  1.89s/it]


In [5]:
# Reload the raw dumps from disk; pull requests are still included at this point.
pytorch_raw_path = os.path.join(DATA_PATH, "raw_pytorch.json")
tensorflow_raw_path = os.path.join(DATA_PATH, "raw_tensorflow.json")

pytorch_df = pd.read_json(pytorch_raw_path)
tensorflow_df = pd.read_json(tensorflow_raw_path)

print("No. of issues in Pytorch (With pull requests):", len(pytorch_df))
print("No. of issues in Tensorflow (With pull requests):", len(tensorflow_df))

No. of issues in Pytorch (With pull requests): 83800
No. of issues in Tensorflow (With pull requests): 57000


In [6]:
# Show the column index of both raw frames, PyTorch first.
for raw_frame in (pytorch_df, tensorflow_df):
    print(raw_frame.columns)

Index(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url',
       'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels',
       'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments',
       'created_at', 'updated_at', 'closed_at', 'author_association',
       'active_lock_reason', 'body', 'reactions', 'timeline_url',
       'performed_via_github_app', 'state_reason', 'draft', 'pull_request'],
      dtype='object')
Index(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url',
       'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels',
       'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments',
       'created_at', 'updated_at', 'closed_at', 'author_association',
       'active_lock_reason', 'body', 'reactions', 'timeline_url',
       'performed_via_github_app', 'state_reason', 'draft', 'pull_request'],
      dtype='object')


In [7]:
# Keep genuine issues only: rows whose "pull_request" entry is missing.
# The column carries no information afterwards and is dropped.
pytorch_df = pytorch_df[pytorch_df["pull_request"].isna()].drop(columns=["pull_request"])
tensorflow_df = tensorflow_df[tensorflow_df["pull_request"].isna()].drop(columns=["pull_request"])

print("No. of issues in Pytorch (Without pull requests):", len(pytorch_df))
print("No. of issues in Tensorflow (Without pull requests):", len(tensorflow_df))

No. of issues in Pytorch (Without pull requests): 21191
No. of issues in Tensorflow (Without pull requests): 34646


In [8]:
# Keep only the issues closed on or after CHECK_DATE [2021-02-15]
pytorch_closed_recently = pytorch_df["closed_at"] >= CHECK_DATE
tensorflow_closed_recently = tensorflow_df["closed_at"] >= CHECK_DATE

pytorch_df = pytorch_df[pytorch_closed_recently]
tensorflow_df = tensorflow_df[tensorflow_closed_recently]

print("No. of issues in Pytorch closed with the last two years:", len(pytorch_df))
print("No. of issues in Tensorflow closed with the last two years:", len(tensorflow_df))

No. of issues in Pytorch closed with the last two years: 8288
No. of issues in Tensorflow closed with the last two years: 8240


In [16]:
# save dataset (within 2 years and no pull requests)
# NOTE(review): save_data() is not used here, presumably because its JSON step
# cannot serialise a DataFrame - confirm. Only the pickles are written.
# The files are opened with context managers so the handles are closed and
# the data is flushed to disk as soon as each dump finishes.
with open(os.path.join(DATA_PATH, "pytorch" + ".pkl"), "wb") as pytorch_pickle:
    pickle.dump(pytorch_df, pytorch_pickle)
with open(os.path.join(DATA_PATH, "tensorflow" + ".pkl"), "wb") as tensorflow_pickle:
    pickle.dump(tensorflow_df, tensorflow_pickle)


### Select 5 metrics for analysis

0. Issue ID as a unique identifier [ID] 
1. Time (in days) to fix an issue [MTTF] (Closed date - Start date)
2. Labels of an issue [Labels] (the `name` key of the original label object)
3. Number of Comments [Comments]
4. Author Association [AuthorA]
5. Reactions [Reactions]

6. Active Date [ADate] (Last Updated - Start Date)

In [9]:
# Start the metrics tables from the three columns that can be copied as-is,
# renamed to the short metric names; the derived metrics are added later.
metric_column_names = {"id": "ID", "comments": "Comments", "author_association": "AuthorA"}

pytorch_metrics_df = pytorch_df[list(metric_column_names)].rename(columns=metric_column_names)
tensorflow_metrics_df = tensorflow_df[list(metric_column_names)].rename(columns=metric_column_names)


In [10]:
# Preview the first five TensorFlow rows
tensorflow_metrics_df.head(n=5)

Unnamed: 0,ID,Comments,AuthorA
21,115928097,542,NONE
350,119009957,14,CONTRIBUTOR
1525,141753240,27,CONTRIBUTOR
2104,151235937,28,CONTRIBUTOR
2982,161807205,12,CONTRIBUTOR


In [11]:
# Preview the first five PyTorch rows
pytorch_metrics_df.head(n=5)

Unnamed: 0,ID,Comments,AuthorA
38,177669684,26,CONTRIBUTOR
142,184100988,10,CONTRIBUTOR
692,206271010,2,NONE
743,207957684,130,NONE
875,211165936,9,CONTRIBUTOR


In [12]:
# MTTF: whole days between creation and closing of an issue.

# Pytorch
pytorch_metrics_df["MTTF"] = pytorch_df["closed_at"].sub(pytorch_df["created_at"]).dt.days

# Tensorflow
tensorflow_metrics_df["MTTF"] = tensorflow_df["closed_at"].sub(tensorflow_df["created_at"]).dt.days

In [13]:
def filter_labels(label_series):
    """Reduce each issue's label list to the name of its first label.

    Parameters
    ----------
    label_series : pandas.Series
        One entry per issue; each entry is a sequence of label dicts that
        carry a "name" key.

    Returns
    -------
    list
        For every entry, in order, the "name" of its first label, or ``None``
        when the entry is empty.
    """
    return [
        entry[0]["name"] if len(entry) != 0 else None
        for entry in label_series
    ]

# The label lists are wrapped in a Series that carries the index of the source
# frame. A bare pd.Series(list) would get a fresh 0..n-1 index, and because
# column assignment aligns on index labels, the values would land on the wrong
# rows (or become NaN) in these filtered, non-contiguously indexed frames.

# Label of an Issue from Pytorch
pytorch_metrics_df["Labels"] = pd.Series(
    filter_labels(pytorch_df["labels"]), index=pytorch_df.index
)

# Label of an Issue from Tensorflow
tensorflow_metrics_df["Labels"] = pd.Series(
    filter_labels(tensorflow_df["labels"]), index=tensorflow_df.index
)


In [14]:

# Reactions metric: the "+1" count taken from each issue's reactions record.

# Pytorch
pytorch_metrics_df["Reactions"] = pytorch_df["reactions"].map(
    lambda reaction_counts: reaction_counts["+1"]
)

# Tensorflow
tensorflow_metrics_df["Reactions"] = tensorflow_df["reactions"].map(
    lambda reaction_counts: reaction_counts["+1"]
)

In [15]:
# Export both metrics tables as CSV files (row index omitted) into DATA_PATH
for metrics_frame, csv_name in (
    (pytorch_metrics_df, "pytorch.csv"),
    (tensorflow_metrics_df, "tensorflow.csv"),
):
    metrics_frame.to_csv(os.path.join(DATA_PATH, csv_name), index=False)