In [1]:
#Import Modules
import pydriller as pdl
from git import Repo
import os
import pandas as pd
import numpy as np
import re
from datetime import datetime
import time
import requests
import json

In [5]:
#Initialize Variables
owner = 'apache'
dstPath = 'projects'
projectsName = ['camel', 'hadoop','hbase','impala','thrift']
token = open('gh-token.txt','r').readlines()[0]
ghLink = 'https://api.github.com/repos'

<YOUR GITHUB ACCESS TOKEN>


In [None]:
#Methods
def download_repo(owner, project,dstPath):
    
    projectUrl = f'https://github.com/{owner}/{project}'
    
    print(f'Cloning {owner}/{project}')
    
    if f'{owner}-{project}' in os.listdir(f'{dstPath}'):
        os.rmdir(f'{dstPath}/{owner}-{project}')
        
    os.mkdir(f'{dstPath}/{owner}-{project}')
    
    try:
        Repo.clone_from(projectUrl, f'{dstPath}/{owner}-{project}')
    except:
        print(f'Error while cloning {owner}/{project}')
        
def extractCommits(owner, project, dstPath):
    if f'issues-dataset.csv' in os.listdir():
        issues_df = pd.read_csv('issues-dataset.csv')
    else:
        issues_df = pd.read_csv('../technical-debt-issues-dataset.csv')
        issues_df['fixed_commit'] = np.nan
    
    print(f'Analyzing {owner}/{project}')
    
    #Initialize a PyDriller repo to analyze the commits
    repo = pdl.Repository(f'{dstPath}/{owner}-{project}')
    
    #Filter all issues related to a project
    issues_project = issues_df.loc[issues_df['project'] == project]
    
    #Filter all issues that has debt
    issues_project_debt = issues_project.loc[issues_project['classification'] != 'non_debt']
    
    issue_numbers = issues_project_debt['issue_number'].unique()
  
    issues_commits = {}
    
    print(issue_numbers)
    
    #For each isssue, go throught all commits to find those thats refers the issue.
    print(len(issue_numbers))
    for issue in issue_numbers:
        issues_commits[issue] = []
        
        print(f'Finding Commits for Issue {issue}')
        
        for i,commit in enumerate(repo.traverse_commits()):
            
            #Since commiters use the pattern PROJECT-<issue_number>, we used a regex to filter the commits
            close_issue = re.search(f'{project.lower()}-{issue}[^0-9]', commit.msg.lower())
            
            #close_issue has true if the commit message refers to the issue
            if close_issue:
                 issues_commits[issue].append(commit.hash)
      
    for i in issues_project_debt.index:
        try:
            issues_df['fixed_commit'][i] = str(issues_commits[issues_project_debt['issue_number'][i]]).replace('[','').replace(']','').replace("'","").replace(' ','')
        except:
            print(f'None commit were found for Issue {i}')
    #Storing the dataset with the commits ha
    issues_df.to_csv('issues-dataset.csv', index = False)


    
# Using GH Api to recover all Pull Requests associated with a commit
def getPullRequestPerCommit(owner, project, token, commitSHA):
    time.sleep(3.5)
    headers = {"accept": "application/vnd.github.v3+json",
               "authorization": f"token {token}"}
    url = f'{ghLink}/{owner}/{project}/commits/{commitSHA}/pulls'
    return requests.get(url,headers=headers).json()

def getCommitsFromPullRequests(url):
    print(f'{url}/commits (from getCommits)')
    time.sleep(3.5)
    headers = {"accept": "application/vnd.github.v3+json",
               "authorization": f"token {token}"}
    return requests.get(f'{url}/commits',headers=headers).json()

def extractPullRequests(owner, project, dstPath):
    issues_df = pd.read_csv('issues-dataset.csv')
    
    #If there is no column for PRs URL, we create a new column
    if 'pull_requests_urls' not in list(issues_df):
        issues_df['pull_requests_urls'] = '-'
    
    #Filter all issues related to a project
    issues_project = issues_df.loc[issues_df['project'] == project]
    
    #Filter all issues that has debt
    issues_project_debt = issues_project.loc[issues_project['classification'] != 'non_debt']
    
    #Select the unique issues numbers, since there are multiple rows per issue
    issue_numbers = issues_project_debt['issue_number'].unique()
  
    #Creating a dictionary to store the PR urls per issue number
    issues_pr = {}

    for issue in issue_numbers:
        #Creating an array to store the urls for a specific issue
        issues_pr[issue] = []
        
        #Since there are several rows per issue, we can take a single index to recover the commits for that issue
        first_index = issues_project_debt.loc[issues_project_debt['issue_number'] == issue].index[0]
        
        #Recovering the commit list
        commitIssueList = str(issues_project_debt['fixed_commit'][first_index]).replace('"', '').split(',')
        
        #If there is a commit related to a issue, we will look for PRs related to those commits
        if len(commitIssueList) > 0:
            for commit in commitIssueList:
                
                #If there is a valid commit SHA, we access the GH API to recover the PRs
                if commit != 'nan':
                    try:
                        print(f'Search for Pull Requests related to commit {commit}...')
                        response = getPullRequestPerCommit(owner,project,token, commit)
                        if len(response) > 0:
                            issues_pr[issue].extend([pull['url'] for pull in response])
                    except:
                        print(f'ERROR: {response}')
        
    #After analyze every commit, we have a dictionary with issue numbers as keys and a list of urls as values
    #Now, we go through the dataframe and update all rows related to a specific issue, in order to add the PRs URLs
    for i in issues_project_debt.index:
        try:
            issues_df['pull_requests_urls'][i] = str(issues_pr[issues_project_debt['issue_number'][i]]).replace('[','').replace(']','').replace("'","").replace(' ','')
        except:
            print(f'None commit were found for Issue {i}')
    
    #After update the dataframe, we store it in a .csv file
    issues_df.to_csv('issues-dataset.csv', index = False)

def extractCommitsFromPullRequests(owner, project, dstPath):
    issues_df = pd.read_csv('issues-dataset.csv')
    
    #If there is no column for PRs URL, we create a new column
    if 'commits_in_pull_requests' not in list(issues_df):
        issues_df['commits_in_pull_requests'] = '-'
        
    #Filter all issues related to a project
    issues_project = issues_df.loc[issues_df['project'] == project]
    
    #Filter all issues that has debt
    issues_project_debt = issues_project.loc[issues_project['classification'] != 'non_debt']
    
    #Select the unique issues numbers, since there are multiple rows per issue
    issue_numbers = issues_project_debt['issue_number'].unique()
  
    #Creating a dictionary to store the PR urls per issue number
    commits_from_pull_requests = {}

    for issue in issue_numbers:
        #Creating an array to store the urls for a specific issue
        commits_from_pull_requests[issue] = []
        
        #Since there are several rows per issue, we can take a single index to recover the commits for that issue
        first_index = issues_project_debt.loc[issues_project_debt['issue_number'] == issue].index[0]
        
        #Recovering the commit list
        pull_request_list = str(issues_project_debt['pull_requests_urls'][first_index]).replace('"', '').split(',')
        
        if len(pull_request_list) > 0:
            for pr in pull_request_list:
                #If there is a valid commit SHA, we access the GH API to recover the PRs
                if pr != 'nan':
                    try:
                        response = getCommitsFromPullRequests(pr)
                        if len(response) > 0:
                            commits_from_pull_requests[issue].extend([commit['sha'] for commit in response])
                    except:
                        print(f'ERROR: {response}')
                        
    for i in issues_project_debt.index:
            try:
                current_commits = issues_project_debt['fixed_commit'][i].replace('"', '').split(',')
                extracted_commits = list(set(commits_from_pull_requests[issues_project_debt['issue_number'][i]]))
                final_list = [pr for pr in extracted_commits if pr not in current_commits]
                
                print(f'current_commits => {current_commits}')
                print(f'extracted_commits => {extracted_commits}')
                print(f'final_list => {final_list}')
                print('----------')
                
                issues_df['commits_in_pull_requests'][i] = str(final_list).replace('[','').replace(']','').replace("'","").replace(' ','')
            except:
                print(f'None commit were found for Issue {i}')

        #After update the dataframe, we store it in a .csv file
    issues_df.to_csv('issues-dataset.csv', index = False)
        
        
        
def extractModifiedFiles(owner, project, dstPath):
    if 'commits' not in os.listdir(dstPath):
        os.mkdir(f'{dstPath}/commits')
    if f'commits-{owner}-{project}' not in os.listdir(f'{dstPath}/commits'):
        os.mkdir(f'{dstPath}/commits/commits-{owner}-{project}')
    
    path = f'{dstPath}/commits/commits-{owner}-{project}'
    
    #Open the respository for a project
    repo = pdl.Repository(f'{dstPath}/{owner}-{project}')
    
    #Filter all issues related to a project
    issues_project = issues_df.loc[issues_df['project'] == project]
    
    #Filter all issues that has debt
    issues_project_debt = issues_project.loc[issues_project['classification'] != 'non_debt']
    
    #Select the unique issues numbers, since there are multiple rows per issue
    issue_numbers = issues_project_debt['issue_number'].unique()    
    
    for issue in issue_numbers:   
        print(issue)
        #Since there are several rows per issue, we can take a single index to recover the commits for that issue
        first_index = issues_project_debt.loc[issues_project_debt['issue_number'] == issue].index[0]
        
        #Recovering the commit list
        commitIssueList = str(issues_project_debt['fixed_commit'][first_index]).replace('"', '').split(',')
        
        for commit in commitIssueList:
            if commit != 'nan':
                new_commit = {}
                com = [c for c in repo.traverse_commits() if c.hash == commit][0]
                new_commit['hash'] = com.hash
                new_commit['msg'] = com.msg
                new_commit['author'] = com.author.email
                new_commit['author_date'] = str(com.author_date)
                new_commit['committer'] = com.committer.email
                new_commit['committer_date '] = str(com.committer_date)
                new_commit['modified_files'] = []
                
                for modifiedFile in com.modified_files:
                    mf = {}
                    mf['filename'] = modifiedFile.filename
                    mf['old_path'] = modifiedFile.old_path
                    mf['new_path'] = modifiedFile.new_path
                    mf['diff'] = modifiedFile.diff
                    mf['diff_parsed'] = modifiedFile.diff_parsed
                    mf['added_lines'] = modifiedFile.added_lines
                    mf['deleted_lines'] = modifiedFile.deleted_lines
                    new_commit['modified_files'].append(mf)
                
                with open(f'{dstPath}/commits/commits-{owner}-{project}/{issue}-{commit}.json', 'w') as file:
                    json.dump(new_commit, file)

def extractModifiedFilesCommitsPR(owner, project, dstPath):
    if 'commits_pr' not in os.listdir(dstPath):
        os.mkdir(f'{dstPath}/commits_pr')
    if f'commits-{owner}-{project}' not in os.listdir(f'{dstPath}/commits_pr'):
        os.mkdir(f'{dstPath}/commits_pr/commits_pr-{owner}-{project}')
    
    path = f'{dstPath}/commits_pr/commits-{owner}-{project}'
    
    #Open the respository for a project
    repo = pdl.Repository(f'{dstPath}/{owner}-{project}')
    
    #Filter all issues related to a project
    issues_project = issues_df.loc[issues_df['project'] == project]
    
    #Filter all issues that has debt
    issues_project_debt = issues_project.loc[issues_project['classification'] != 'non_debt']
    
    #Select the unique issues numbers, since there are multiple rows per issue
    issue_numbers = issues_project_debt['issue_number'].unique()    
    
    for issue in issue_numbers:   
        print(issue)
        #Since there are several rows per issue, we can take a single index to recover the commits for that issue
        first_index = issues_project_debt.loc[issues_project_debt['issue_number'] == issue].index[0]
        
        #Recovering the commit list
        commitIssueList = str(issues_project_debt['commits_in_pull_requests'][first_index]).replace('"', '').split(',')
        print(commitIssueList)
        for commit in commitIssueList:
            if commit not in ['nan','NaN','-'] :
                print(commit)
                new_commit = {}
                comm = [c for c in repo.traverse_commits() if c.hash == commit]
                
                if len(comm) > 0:
                    com = comm[0]
                    new_commit['hash'] = com.hash
                    new_commit['msg'] = com.msg
                    new_commit['author'] = com.author.email
                    new_commit['author_date'] = str(com.author_date)
                    new_commit['committer'] = com.committer.email
                    new_commit['committer_date '] = str(com.committer_date)
                    new_commit['modified_files'] = []

                    for modifiedFile in com.modified_files:
                        mf = {}
                        mf['filename'] = modifiedFile.filename
                        mf['old_path'] = modifiedFile.old_path
                        mf['new_path'] = modifiedFile.new_path
                        mf['diff'] = modifiedFile.diff
                        mf['diff_parsed'] = modifiedFile.diff_parsed
                        mf['added_lines'] = modifiedFile.added_lines
                        mf['deleted_lines'] = modifiedFile.deleted_lines
                        new_commit['modified_files'].append(mf)

                    with open(f'{dstPath}/commits_pr/commits_pr-{owner}-{project}/{issue}-{commit}.json', 'w') as file:
                        json.dump(new_commit, file)

In [133]:
#Extracting Commits
for project in projectsName:
    extractCommits(owner, project, dstPath)

Analyzing apache/camel
[10048 10153 10476 10507 10517 10563 10678 10950  1107 11104  1112 11171
 11196 11282  1138 11408 11504 11524 11655  1165 11734  1173 11868  1196
 12042 12104 12166  1228 12414  1256 12624 12646  1270  1304 13111  1317
  1320 13214 13533 13681  1447   149  1507  1608  1641   168  1734  1842
  1846  1863  1881  1944   201  2094  2163   231  2446  2481   251  2535
  2559  2617  2650  2670  2682  2756  2879  2892  2901  2979   302  3048
  3050  3077  3100  3112  3125  3139  3157  3168  3287  3315  3349  3351
  3440  3469  3524  3576  3637  3677  3709  3764  3769  3777   383  3864
  3888  4007  4132  4139  4202  4226  4230  4273  4317  4331  4357  4398
  4417  4430  4543  4593  4602  4657  4676  4736  4741   478  4793  4905
  4928   493  4958  4959  4993  5008  5012  5045  5184    52  5342  5527
  5586  5950  5983  6043   612  6178  6188  6291  6296  6320  6403  6446
  6563  6578  6620  6635  6735  6826  6896  6973  7015   706  7071  7127
  7133  7139  7201   721  73

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_df['fixed_commit'][i] = str(issues_commits[issues_project_debt['issue_number'][i]]).replace('[','').replace(']','').replace("'","").replace(' ','')


Analyzing apache/hadoop
[10067 10106 10139 10169 10175 10214 10225 10291 10295 10343  1034 10353
 10374 10423 10427 10432 10485 10496 10498 10499 10501 10508 10526 10528
 10602 10673 10681 10729  1072 10748 10752 10770 10819 10822 10904 10915
 10930 10979 11013 11014 11063 11103 11117 11201 11219 11289 11309  1130
 11313 11355 11379 11384 11409 11421 11437  1147 11523 11544 11585 11607
 11658 11677 11690 11720 11730 11740 11786 11837 11844 11862 11880  1192
 11966 12002 12021 12135 12155 12268 12368 12371 12452 12458  1245 12460
 12484 12485 12496 12505  1251 12520 12534  1254 12701 12721 12733 12806
 12811 12829 12837 12864 12888 12923 12946 12952 13011 13030 13039 13051
 13063 13138 13158 13233 13353 13365 13386 13529 13638  1367 13730 13732
 13768 13770 13975 13991 14092 14314 14351 14359 14479  1453  1459 14634
 14692 14870  1488 14942 15066  1536 15476 15486 15569 15577 15645 15742
 15859  1586 16013 16044 16160 16207 16226 16265 16291 16318 16332 16359
 16409 16435 16461 16504 16

Finding Commits for Issue 3159
Finding Commits for Issue 3198
Finding Commits for Issue 3286
Finding Commits for Issue 3337
Finding Commits for Issue 3375
Finding Commits for Issue 3377
Finding Commits for Issue 3394
Finding Commits for Issue 3477
Finding Commits for Issue 3491
Finding Commits for Issue 3501
Finding Commits for Issue 3505
Finding Commits for Issue 3560
Finding Commits for Issue 3649
Finding Commits for Issue 3654
Finding Commits for Issue 3836
Finding Commits for Issue 3849
Finding Commits for Issue 3905
Finding Commits for Issue 3925
Finding Commits for Issue 3957
Finding Commits for Issue 3999
Finding Commits for Issue 400
Finding Commits for Issue 4066
Finding Commits for Issue 4182
Finding Commits for Issue 4300
Finding Commits for Issue 4436
Finding Commits for Issue 4576
Finding Commits for Issue 4603
Finding Commits for Issue 4611
Finding Commits for Issue 4634
Finding Commits for Issue 4719
Finding Commits for Issue 481
Finding Commits for Issue 4884
Finding Co

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_df['fixed_commit'][i] = str(issues_commits[issues_project_debt['issue_number'][i]]).replace('[','').replace(']','').replace("'","").replace(' ','')


Analyzing apache/hbase
[10001 10008 10074 10081 10083 10115  1012 10213 10334  1054 10562 10631
 10702 10864 10892  1089 10925 10968 11011 11053 11129 11229 11293 11352
 11421 11511 11575 11612 11621 11693 11730 11935 12017 12030 12059 12106
 12115 12211 12238 12266 12271 12293 12400 12428 12464 12562  1263 12673
  1271 12729 12749 12833 12846 12888 12905  1298  1309 13184 13341  1337
 13395 13528 13582  1359 13629 13710 13776 13799 13871 13905 13924 13928
 13942  1396 13973 14161 14162 14494 14517 14604 14622 14677 14753  1492
 14941 14956 14984 15192 15207 15287 15293 15397 15490  1558 15617 15640
 15704 15707 15732 15835 15892 16157 16273  1655 16789 16817 16856 16872
 16998 17025 17101 17184 17192  1723 17259 17325 17338 17383 17394 17480
 17500 17709  1770 17726 17808 17883 17918 18085 18092 18180 18304  1849
 18501 18549  1863 18646 18909  1897 19031 19073 19183 19187 19241 19300
 19373 19384 19478 19531 19570 19633  1964  1968 19775 19815 19862  1990
 19939 19969 19977 19991 199

Finding Commits for Issue 22400
Finding Commits for Issue 2241
Finding Commits for Issue 22424
Finding Commits for Issue 2247
Finding Commits for Issue 22656
Finding Commits for Issue 22707
Finding Commits for Issue 22832
Finding Commits for Issue 22837
Finding Commits for Issue 22869
Finding Commits for Issue 22933
Finding Commits for Issue 22936
Finding Commits for Issue 2295
Finding Commits for Issue 22981
Finding Commits for Issue 23061
Finding Commits for Issue 23087
Finding Commits for Issue 23200
Finding Commits for Issue 2341
Finding Commits for Issue 23646
Finding Commits for Issue 23651
Finding Commits for Issue 23752
Finding Commits for Issue 23789
Finding Commits for Issue 23792
Finding Commits for Issue 23825
Finding Commits for Issue 23863
Finding Commits for Issue 23867
Finding Commits for Issue 239
Finding Commits for Issue 23
Finding Commits for Issue 2555
Finding Commits for Issue 2585
Finding Commits for Issue 2621
Finding Commits for Issue 2694
Finding Commits for I

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_df['fixed_commit'][i] = str(issues_commits[issues_project_debt['issue_number'][i]]).replace('[','').replace(']','').replace("'","").replace(' ','')


Analyzing apache/impala
[1013  101 1022 1082 1118 1120 1147 1290  137 1382 1414 1430 1466 1487
  148 1493  153 1552 1577 1584 1587 1596 1598 1618 1651 1691 1697 1774
  187 1907 1927 1929 1934 1963 1984 2068 2076 2099  211 2128 2174 2178
 2208 2212 2244 2290 2295  231 2341 2355 2386  242 2435 2457 2632 2642
 2657 2659 2707 2724  290 2911 2962 3008 3017 3077 3099 3103 3144  321
  322  324 3252  326 3276  330 3329 3338 3344 3352 3403 3548 3652 3671
 3718 3742 3780 3828 3859 4009 4027 4145 4162 4173 4182 4210 4231 4245
 4267 4291 4320 4328 4387 4423 4485 4548 4612 4617 4639 4652 4671 4713
 4728 4778 4801 4831 4833 4862 4871 4933 4956 4967 4970 4996 5002 5042
 5070 5084 5130 5150 5273  530 5341 5481 5489 5499 5525 5535 5600 5612
 5618 5636  563 5640 5688 5732 5763 5779 5780 5849 5923 5963  596 5997
 6026 6030 6048  605 6077 6080 6106 6131 6132 6223 6285  636 6408 6442
 6601 6613  661 6623 6666 6694 6709 6806 6817 6847 6850 6858 6937 6993
  710 7161 7171 7205 7234   72 7349 7350 7388 7400 74

Finding Commits for Issue 8656
Finding Commits for Issue 8771
Finding Commits for Issue 8855
Finding Commits for Issue 8857
Finding Commits for Issue 8862
Finding Commits for Issue 8884
Finding Commits for Issue 8892
Finding Commits for Issue 8912
Finding Commits for Issue 8935
Finding Commits for Issue 8945
Finding Commits for Issue 9013
Finding Commits for Issue 902
Finding Commits for Issue 9146
Finding Commits for Issue 9209
Finding Commits for Issue 9265
Finding Commits for Issue 92
Finding Commits for Issue 9363
Finding Commits for Issue 9373
Finding Commits for Issue 9431
Finding Commits for Issue 9443
Finding Commits for Issue 9467
Finding Commits for Issue 946
Finding Commits for Issue 9530
Finding Commits for Issue 9543
Finding Commits for Issue 9560
Finding Commits for Issue 979
Finding Commits for Issue 991


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_df['fixed_commit'][i] = str(issues_commits[issues_project_debt['issue_number'][i]]).replace('[','').replace(']','').replace("'","").replace(' ','')


Analyzing apache/thrift
[1003 1048 1055 1062 1063 1065 1072 1100 1103 1121 1130 1135 1141 1174
 1176 1199 1202 1217 1231 1241 1243 1248 1269 1275 1290 1294 1311 1314
 1349  137 1393 1426 1431 1440 1452 1466  147 1480 1503 1504 1533 1583
 1595 1624  163 1661 1672 1688 1702 1734 1745 1799 1800 1810 1813 1819
 1829 1842 1853 1873 1879 1883 1886   18 1919  191 1924 1932 1953  195
 1982 1990 1999 2006 2017 2020 2021 2028 2031 2032 2088 2097 2116  211
 2124 2141 2143 2150 2171   21 2210  221 2225 2227 2246 2255 2263 2279
 2285  228 2290 2292 2293 2328 2329 2333 2344 2351 2375 2404 2405  240
 2415 2416  241 2431 2435 2449 2487 2511 2540 2545 2555  255 2561 2568
 2589 2590 2599 2605 2607 2609 2622 2636 2659 2666 2677 2759  275 2768
  277 2781  278 2791 2833 2851 2853 2868 2874 2907 2932 2937 2969 2972
  298 3011 3027 3040 3045 3047 3088 3114 3140 3157 3168 3191 3197 3226
 3241 3274 3276 3280 3283 3297 3311 3320 3364 3391 3409 3413 3415 3416
 3419 3440 3447 3495 3535  353 3559 3572 3592 3596 36

Finding Commits for Issue 397
Finding Commits for Issue 3983
Finding Commits for Issue 39
Finding Commits for Issue 4014
Finding Commits for Issue 4015
Finding Commits for Issue 4030
Finding Commits for Issue 4043
Finding Commits for Issue 4048
Finding Commits for Issue 4069
Finding Commits for Issue 4078
Finding Commits for Issue 4129
Finding Commits for Issue 4130
Finding Commits for Issue 4136
Finding Commits for Issue 4164
Finding Commits for Issue 418
Finding Commits for Issue 4230
Finding Commits for Issue 4231
Finding Commits for Issue 4245
Finding Commits for Issue 4256
Finding Commits for Issue 427
Finding Commits for Issue 4308
Finding Commits for Issue 4316
Finding Commits for Issue 4321
Finding Commits for Issue 4362
Finding Commits for Issue 4369
Finding Commits for Issue 4416
Finding Commits for Issue 4419
Finding Commits for Issue 4437
Finding Commits for Issue 4442
Finding Commits for Issue 4443
Finding Commits for Issue 4446
Finding Commits for Issue 4468
Finding Commi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_df['fixed_commit'][i] = str(issues_commits[issues_project_debt['issue_number'][i]]).replace('[','').replace(']','').replace("'","").replace(' ','')


In [None]:
#Extracting Pull Requests
for project in projectsName:
    extractPullRequests(owner, project, dstPath)

In [66]:
#Extracting Modifies Files
for project in projectsName:
    extractModifiedFiles(owner, project, dstPath)

10048
10153
10476
10507
10517
10563
10678
10950
1107
11104
1112
11171
11196
11282
1138
11408
11504
11524
11655
1165
11734
1173
11868
1196
12042
12104
12166
1228
12414
1256
12624
12646
1270
1304
13111
1317
1320
13214
13533
13681
1447
149
1507
1608
1641
168
1734
1842
1846
1863
1881
1944
201
2094
2163
231
2446
2481
251
2535
2559
2617
2650
2670
2682
2756
2879
2892
2901
2979
302
3048
3050
3077
3100
3112
3125
3139
3157
3168
3287
3315
3349
3351
3440
3469
3524
3576
3637
3677
3709
3764
3769
3777
383
3864
3888
4007
4132
4139
4202
4226
4230
4273
4317
4331
4357
4398
4417
4430
4543
4593
4602
4657
4676
4736
4741
478
4793
4905
4928
493
4958
4959
4993
5008
5012
5045
5184
52
5342
5527
5586
5950
5983
6043
612
6178
6188
6291
6296
6320
6403
6446
6563
6578
6620
6635
6735
6826
6896
6973
7015
706
7071
7127
7133
7139
7201
721
7300
7319
7342
7412
7413
7461
7587
7644
7681
7690
7715
7813
789
7923
7954
7956
8029
8068
808
8091
8101
8174
8312
8321
8328
8370
856
872
8734
8844
8879
903
910
9181
9226
9338
9403
9412
94

In [100]:
#Extracting new commits related to PRs
for project in projectsName:
    extractCommitsFromPullRequests(owner=owner,project=project,dstPath=dstPath)

https://api.github.com/repos/apache/camel/pulls/2271/commits (from getCommits)
https://api.github.com/repos/apache/camel/pulls/2423/commits (from getCommits)
https://api.github.com/repos/apache/camel/pulls/2423/commits (from getCommits)
https://api.github.com/repos/apache/camel/pulls/2752/commits (from getCommits)
https://api.github.com/repos/apache/camel/pulls/2806/commits (from getCommits)
current_commits => ['e3890695b8cb92dff1d14b38f2876ee925d9acff', 'ddb852cdf7da29827fcab0b25a2b2ed6ee443cf9']
extracted_commits => []
final_list => []
----------
current_commits => ['e3890695b8cb92dff1d14b38f2876ee925d9acff', 'ddb852cdf7da29827fcab0b25a2b2ed6ee443cf9']
extracted_commits => []
final_list => []
----------
current_commits => ['e3890695b8cb92dff1d14b38f2876ee925d9acff', 'ddb852cdf7da29827fcab0b25a2b2ed6ee443cf9']
extracted_commits => []
final_list => []
----------
current_commits => ['e3890695b8cb92dff1d14b38f2876ee925d9acff', 'ddb852cdf7da29827fcab0b25a2b2ed6ee443cf9']
extracted_commits

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_df['commits_in_pull_requests'][i] = str(final_list).replace('[','').replace(']','').replace("'","").replace(' ','')


https://api.github.com/repos/apache/hadoop/pulls/22/commits (from getCommits)
https://api.github.com/repos/apache/hadoop/pulls/22/commits (from getCommits)
https://api.github.com/repos/apache/hadoop/pulls/3990/commits (from getCommits)
https://api.github.com/repos/apache/hadoop/pulls/1054/commits (from getCommits)
https://api.github.com/repos/apache/hadoop/pulls/388/commits (from getCommits)
current_commits => ['bd5b23f4ce1f0780d28b592688f78cd9a37a4ead']
extracted_commits => []
final_list => []
----------
current_commits => ['bd5b23f4ce1f0780d28b592688f78cd9a37a4ead']
extracted_commits => []
final_list => []
----------
current_commits => ['bd5b23f4ce1f0780d28b592688f78cd9a37a4ead']
extracted_commits => []
final_list => []
----------
current_commits => ['bd5b23f4ce1f0780d28b592688f78cd9a37a4ead']
extracted_commits => []
final_list => []
----------
current_commits => ['763f073f41e3eaa9ecd11c6ec0b76234739272aa']
extracted_commits => []
final_list => []
----------
current_commits => ['763f

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_df['commits_in_pull_requests'][i] = str(final_list).replace('[','').replace(']','').replace("'","").replace(' ','')


https://api.github.com/repos/apache/hbase/pulls/113/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/113/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/133/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/133/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/137/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/142/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/240/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/475/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/550/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/591/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/670/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/990/commits (from getCommits)
https://api.github.com/repos/apache/hbase/pulls/991/commits (fro

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_df['commits_in_pull_requests'][i] = str(final_list).replace('[','').replace(']','').replace("'","").replace(' ','')


current_commits => ['c096d7688102e0570d2461ccc9745abded1b5ae6']
extracted_commits => []
final_list => []
----------
current_commits => ['c096d7688102e0570d2461ccc9745abded1b5ae6']
extracted_commits => []
final_list => []
----------
None commit were found for Issue 12257
current_commits => ['fe0646f76bf35b525f8f63948a7d3baa481e5bc0', '2e2d8ca4a5a5b1aa13b5732eaa31ee54a68a7f13']
extracted_commits => []
final_list => []
----------
None commit were found for Issue 12276
current_commits => ['5b4e8b79bf277d093a996d9b1465ab8486cf6dff']
extracted_commits => []
final_list => []
----------
current_commits => ['7542d719c941576d967876a89421dfe0d1e51524']
extracted_commits => []
final_list => []
----------
current_commits => ['2a59029c2cbac2176df424dfcd01f6653b65b955']
extracted_commits => []
final_list => []
----------
current_commits => ['2a59029c2cbac2176df424dfcd01f6653b65b955']
extracted_commits => []
final_list => []
----------
current_commits => ['f9b60bce43164d3a3598127ae078bc4f33640720']
ex

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_df['commits_in_pull_requests'][i] = str(final_list).replace('[','').replace(']','').replace("'","").replace(' ','')


https://api.github.com/repos/apache/thrift/pulls/1651/commits (from getCommits)
https://api.github.com/repos/apache/thrift/pulls/1549/commits (from getCommits)
https://api.github.com/repos/apache/thrift/pulls/1573/commits (from getCommits)
https://api.github.com/repos/apache/thrift/pulls/1704/commits (from getCommits)
https://api.github.com/repos/apache/thrift/pulls/1721/commits (from getCommits)
https://api.github.com/repos/apache/thrift/pulls/1765/commits (from getCommits)
https://api.github.com/repos/apache/thrift/pulls/1792/commits (from getCommits)
https://api.github.com/repos/apache/thrift/pulls/1792/commits (from getCommits)
https://api.github.com/repos/apache/thrift/pulls/1866/commits (from getCommits)
https://api.github.com/repos/apache/thrift/pulls/2189/commits (from getCommits)
current_commits => ['c101092ea742e1252207b6e8f680bf392292c916']
extracted_commits => []
final_list => []
----------
current_commits => ['c101092ea742e1252207b6e8f680bf392292c916']
extracted_commits =>

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_df['commits_in_pull_requests'][i] = str(final_list).replace('[','').replace(']','').replace("'","").replace(' ','')


In [141]:
#Extracting Modifies Files from commits in pull requests
for project in projectsName:
    extractModifiedFilesCommitsPR(owner, project, dstPath)

10048
['nan']
10153
['-']
10476
['nan']
10507
['nan']
10517
['nan']
10563
['nan']
10678
['nan']
10950
['nan']
1107
['nan']
11104
['-']
1112
['nan']
11171
['nan']
11196
['nan']
11282
['nan']
1138
['-']
11408
['-']
11504
['-']
11524
['nan']
11655
['nan']
1165
['nan']
11734
['-']
1173
['nan']
11868
['nan']
1196
['nan']
12042
['-']
12104
['0e53232d67252b15a2575c5c3bd08146f54edc76']
0e53232d67252b15a2575c5c3bd08146f54edc76
12166
['-']
1228
['nan']
12414
['-']
1256
['nan']
12624
['nan']
12646
['4dfc9f30d9002c06ae496cca997a5265e2cab262', '2b4e3cf5d625f88c0d01a5cc1730954aaec9f7a4', 'bd51298fe84ebcdd020ac93a0c8bbf1e581eded6']
4dfc9f30d9002c06ae496cca997a5265e2cab262
2b4e3cf5d625f88c0d01a5cc1730954aaec9f7a4
bd51298fe84ebcdd020ac93a0c8bbf1e581eded6
1270
['nan']
1304
['-']
13111
['ef0c0e0d6c688b11586c30202fb0e2d769ab5478', '16373c4622b26cb242d5c4831baf630f0e7899e9', '07e84f610f88137e3ed1e5ffa582afc0e232289e', '2c15bd7b68c37dd492348855092378dfadee3982', 'f0f9fd8e46157c7c64698643a79855810b5e55ce', '

16435
['nan']
16461
['nan']
16504
['nan']
16523
['15f35083896441fea34674acb678120cc2584beb']
15f35083896441fea34674acb678120cc2584beb
16601
['-']
16607
['-']
1664
['-']
1773
['-']
1926
['-']
1961
['-']
2077
['-']
2148
['-']
2181
['-']
2205
['-']
2208
['-']
2402
['-']
2424
['-']
2776
['nan']
2796
['-']
2850
['-']
2851
['-']
2897
['-']
289
['-']
2959
['-']
2965
['-']
3077
['-']
3081
['-']
3087
['-']
3108
['-']
3159
['-']
3198
['-']
3286
['-']
3337
['-']
3375
['-']
3377
['-']
3394
['-']
3477
['-']
3491
['-']
3501
['-']
3505
['-']
3560
['-']
3649
['-']
3654
['-']
3836
['-']
3849
['-']
3905
['-']
3925
['-']
3957
['nan']
3999
['-']
400
['-']
4066
['-']
4182
['-']
4300
['-']
4436
['-']
4576
['-']
4603
['-']
4611
['-']
4634
['-']
4719
['-']
481
['-']
4884
['-']
4941
['-']
4985
['-']
4997
['-']
5076
['-']
508
['-']
5097
['-']
511
['-']
5141
['-']
524
['-']
5298
['-']
537
['-']
5402
['-']
5465
['-']
551
['-']
5561
['-']
561
['-']
564
['-']
5657
['-']
5701
['-']
5771
['-']
5775
['-']
5824
['-']
5

4559
['cd48acd11fdad7d450aad3fdf1cfde7fd9b86e9b']
cd48acd11fdad7d450aad3fdf1cfde7fd9b86e9b
4604
['6b03da72617f450bba33158616a598fb25898700']
6b03da72617f450bba33158616a598fb25898700
4616
['-']
4715
['d5dc338f0c42e40e1b5151696548b07d6be6c353']
d5dc338f0c42e40e1b5151696548b07d6be6c353
471
['nan']
4745
['2fa751db3db3c9c6c11f7ab55bb99147ee0764c3', '8afa9473c5916a142a2e51e1a7aedffef608b3b6', 'd9f70cc5e719e793e5f233130c2332b6a3868297', 'e2105426e3779202e450b4c89224b72e3752044e', '86ef177db97083bc8888ccf5059a46440d10651a', '0740421d4ebcc4e1e7fe8913d3ad748580a81ede', '593e1d6a3c0501cb550228411efb6ad1e0b942e5', 'b072f3d7c1490d6b036c32827ffee7f882334e89']
2fa751db3db3c9c6c11f7ab55bb99147ee0764c3
8afa9473c5916a142a2e51e1a7aedffef608b3b6
d9f70cc5e719e793e5f233130c2332b6a3868297
e2105426e3779202e450b4c89224b72e3752044e
86ef177db97083bc8888ccf5059a46440d10651a
0740421d4ebcc4e1e7fe8913d3ad748580a81ede
593e1d6a3c0501cb550228411efb6ad1e0b942e5
b072f3d7c1490d6b036c32827ffee7f882334e89
4822
['nan']
4829
