# Pull Requests
---

## Drive Mounting

In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import os
# Make the workshop folder on the mounted drive the working directory, so that
# relative paths used by later cells resolve inside it.
current_folder = "/content/gdrive/My Drive/Workshop/Round2/"
os.chdir(current_folder)

## Imports

In [3]:
!pip install PyGithub
import datetime
import getpass
import pickle
import pprint
import random
import time
from urllib.parse import urlparse

import numpy as np
import pandas as pd
from github import Github



## 1. The File
---

In [4]:
# Load the spreadsheet of pull-request links; later cells read its "label" and
# "link" columns.
df = pd.read_excel("datax.xlsx")
df.head(2)

Unnamed: 0.1,Unnamed: 0,label,link
0,0.0,refactoring,https://github.com/Rafess/final-challenge-nava...
1,1.0,refactoring,https://github.com/f-lab-edu/hcs/pull/116


## Repository Sample

In [5]:
# Pick one row at random to serve as the worked example for the cells below.
# random.randrange excludes its upper bound; random.randint(0, n) can return n
# itself, which is one past the last valid row position.
random_index = random.randrange(df.shape[0])
print("random index: {}".format(random_index))
# Select the URL by column name rather than by position, so extra or reordered
# columns in the spreadsheet cannot silently change what is read.
random_link = df["link"].iloc[random_index]
print("random pull request: {}".format(random_link))

def getOut(repo_link):
    """
    Split a GitHub pull-request URL into its repository slug and PR number.

    >>> getOut("https://github.com/dmccoystephenson/ChatHub/pull/11")
    ('dmccoystephenson/ChatHub', 11)

    The URL is parsed instead of sliced at a fixed offset, so other schemes or
    host spellings (``http://``, ``www.``), surrounding whitespace, a trailing
    slash and sub-pages such as ``/pull/11/files`` are all accepted.

    Parameters
    ----------
    repo_link : str
        URL whose path is ``/<owner>/<repo>/<section>/<number>[/...]``.

    Returns
    -------
    tuple of (str, int)
        ``("<owner>/<repo>", <number>)``.

    Raises
    ------
    TypeError
        If ``repo_link`` is not a string (e.g. a missing spreadsheet cell).
    ValueError
        If the path has fewer than four segments or the fourth segment is not
        an integer.
    """
    if not isinstance(repo_link, str):
        raise TypeError("repo_link must be a string, got {!r}".format(repo_link))

    # Empty segments come from the leading slash, a trailing slash or "//".
    segments = [part for part in urlparse(repo_link.strip()).path.split("/") if part]
    if len(segments) < 4:
        raise ValueError("not a pull request link: {!r}".format(repo_link))

    owner, name, _section, number = segments[:4]
    return "{}/{}".format(owner, name), int(number)

# Split the sampled link into the repository slug and the pull-request number;
# both names are reused when the GitHub client is queried further down.
repository, pull = getOut(random_link)
print("repo: {}, pull: {}".format(repository, pull))

random index: 14
random pull request: https://github.com/clear-solutions/subtitles-parser/pull/4
repo: clear-solutions/subtitles-parser, pull: 4


In [6]:
# Parse every link into a (repository, pull number) tuple stored in "pair".
df["pair"] = df["link"].apply(getOut)

In [7]:
# Preview the first rows, including the "pair" column.
df.head(3)

Unnamed: 0.1,Unnamed: 0,label,link,pair
0,0.0,refactoring,https://github.com/Rafess/final-challenge-nava...,"(Rafess/final-challenge-naval-battle, 3)"
1,1.0,refactoring,https://github.com/f-lab-edu/hcs/pull/116,"(f-lab-edu/hcs, 116)"
2,2.0,refactoring,https://github.com/Intrusion-Detection-System/...,"(Intrusion-Detection-System/IDS-Server, 27)"


## Proportion

Ratio of lines deleted to lines added in the pull request

In [8]:
# Authenticate against the GitHub API. The token is taken from the GITHUB_TOKEN
# environment variable, falling back to an interactive prompt, so the secret is
# never stored in the notebook.
# NOTE: a personal access token was previously hard-coded in this cell; it must
# be treated as leaked and revoked in the GitHub settings.
g = Github(os.environ.get("GITHUB_TOKEN") or getpass.getpass("GitHub token: "))
repo = g.get_repo(repository)
pull_request = repo.get_pull(pull)

In [9]:
# Fetch the parsed pair and the label of the sampled row by index label.
pair = df.at[random_index, "pair"]
label = df.at[random_index, "label"]

In [10]:
def metric1(pair, label):
    """
    Ratio of deleted lines to added lines of one pull request.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, ratio)`` with
        ``ratio = deletions / additions``. ``ratio`` is ``None`` when the pull
        request has no additions, when a count is missing, or when the GitHub
        lookup fails.
    """
    repository, pull = pair
    try:
        pull_request = g.get_repo(repository).get_pull(pull)
        # numerator: deletions, denominator: additions
        result = int(pull_request.deletions) / int(pull_request.additions)
    except Exception:
        # Best effort: any lookup or arithmetic failure yields a missing value.
        # KeyboardInterrupt/SystemExit are deliberately not caught, so a long
        # batch run can still be stopped.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric1(pair, label)

('clear-solutions/subtitles-parser', 4, 'refactoring', 0.8674698795180723)

## Number of commits

How many commits were made in the pull request

In [11]:
def metric2(pair, label):
    """
    Number of commits of one pull request.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, commits)`` where ``commits`` is the
        ``commits`` attribute of the pull request, or ``None`` when the GitHub
        lookup fails.
    """
    repository, pull = pair
    try:
        result = g.get_repo(repository).get_pull(pull).commits
    except Exception:
        # Best effort: a failed lookup yields a missing value. Interrupts
        # (KeyboardInterrupt/SystemExit) are deliberately not caught.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric2(pair, label)

('clear-solutions/subtitles-parser', 4, 'refactoring', 1)

## Files Changed

Number of files changed in the pull request

In [12]:
def metric3(pair, label):
    """
    Number of files changed by one pull request.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, changed_files)`` where ``changed_files``
        is the attribute of the same name on the pull request, or ``None``
        when the GitHub lookup fails.
    """
    repository, pull = pair
    try:
        result = g.get_repo(repository).get_pull(pull).changed_files
    except Exception:
        # Best effort: a failed lookup yields a missing value. Interrupts
        # (KeyboardInterrupt/SystemExit) are deliberately not caught.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric3(pair, label)

('clear-solutions/subtitles-parser', 4, 'refactoring', 4)

## Proportion

For every changed file in every commit, the number of changes in that file divided by the total number of files in the pull request


In [13]:
def metric9(pair, label):
    """
    Per-file changes relative to the number of files of one pull request.

    For every file of every commit, the file's ``changes`` count is divided by
    ``get_files().totalCount`` of the pull request.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, ratios)`` where ``ratios`` is a list with
        one float per (commit, file) entry. ``ratios`` is ``None`` when the
        pull request has no files or when the GitHub lookup fails.
    """
    repository, pull = pair
    try:
        pull_request = g.get_repo(repository).get_pull(pull)
        total_number_of_files = int(pull_request.get_files().totalCount)
        if total_number_of_files == 0:
            result = None
        else:
            result = [
                every_file.changes / total_number_of_files
                for every_commit in pull_request.get_commits()
                for every_file in every_commit.files
            ]
    except Exception:
        # Best effort: a failed lookup yields a missing value. Interrupts
        # (KeyboardInterrupt/SystemExit) are deliberately not caught.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric9(pair, label)

('clear-solutions/subtitles-parser',
 4,
 'refactoring',
 [13.75, 15.75, 3.25, 6.0])

## Commit Time Difference

Time difference between consecutive commits of the pull request, in seconds

In [14]:
def metric10(pair, label):
    """
    Time gaps, in seconds, between consecutive commits of one pull request.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, gaps)``. ``gaps`` is ``[0]`` for a single
        commit, otherwise a list with one integer per pair of consecutive
        commits (later minus earlier, in the order GitHub returns them).
        ``gaps`` is ``None`` when there are no commits or when the GitHub
        lookup fails.
    """
    repository, pull = pair
    try:
        pull_request = g.get_repo(repository).get_pull(pull)
        # Use the git author timestamp of each commit. ``commit.last_modified``
        # is the HTTP Last-Modified header of the API response that delivered
        # the object, not a per-commit time.
        commit_times = [commit.commit.author.date for commit in pull_request.get_commits()]

        if not commit_times:
            result = None
        elif len(commit_times) == 1:
            result = [0]
        else:
            # total_seconds() covers the whole interval; timedelta.seconds is
            # only the sub-day remainder and silently drops full days.
            result = [
                int((later - earlier).total_seconds())
                for earlier, later in zip(commit_times, commit_times[1:])
            ]
    except Exception:
        # Best effort: a failed lookup yields a missing value. Interrupts
        # (KeyboardInterrupt/SystemExit) are deliberately not caught.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric10(pair, label)

('clear-solutions/subtitles-parser', 4, 'refactoring', [0])

## Pull Request Participation

The number of people participating in the pull request

In [15]:
def metric11(pair, label):
    """
    Number of distinct logins involved in one pull request.

    The logins counted are the GitHub author of every commit plus the base
    repository owner, the base user, the head user and the pull-request user.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, participants)`` where ``participants`` is
        the size of the set of logins. ``participants`` is ``None`` when any
        of those logins cannot be read (for example a commit whose ``author``
        is ``None``) or when the GitHub lookup fails.
    """
    repository, pull = pair
    try:
        pull_request = g.get_repo(repository).get_pull(pull)
        commit_authors = {commit.author.login for commit in pull_request.get_commits()}
        pull_request_users = {
            pull_request.base.repo.owner.login,
            pull_request.base.user.login,
            pull_request.head.user.login,
            pull_request.user.login,
        }
        result = len(commit_authors | pull_request_users)
    except Exception:
        # Best effort: a failed lookup yields a missing value. Interrupts
        # (KeyboardInterrupt/SystemExit) are deliberately not caught.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric11(pair, label)

('clear-solutions/subtitles-parser', 4, 'refactoring', 2)

## Commit Message Size

Length, in words, of every commit message

In [16]:
def metric12(pair, label):
    """
    Word count of every commit message of one pull request.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, lengths)`` where ``lengths`` holds, per
        commit, the number of whitespace-separated words in its message.
        ``lengths`` is ``None`` when there are no commits or when the GitHub
        lookup fails.
    """
    repository, pull = pair
    try:
        pull_request = g.get_repo(repository).get_pull(pull)
        word_counts = [
            len(element.commit.message.split())
            for element in pull_request.get_commits()
        ]
        # A pull request without commits is reported as missing, not as [].
        result = word_counts or None
    except Exception:
        # Best effort: a failed lookup yields a missing value. Interrupts
        # (KeyboardInterrupt/SystemExit) are deliberately not caught.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric12(pair, label)

('clear-solutions/subtitles-parser', 4, 'refactoring', [1])

## Pull Request Number

The number ID given by GitHub to every pull request. 

In [17]:
def metric15(pair, label):
    """
    Pull-request number as reported by GitHub.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, number)`` where ``number`` is the
        ``number`` attribute of the fetched pull request converted to ``int``,
        or ``None`` when the GitHub lookup fails.
    """
    repository, pull = pair
    try:
        result = int(g.get_repo(repository).get_pull(pull).number)
    except Exception:
        # Best effort: a failed lookup yields a missing value. Interrupts
        # (KeyboardInterrupt/SystemExit) are deliberately not caught.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric15(pair, label)

('clear-solutions/subtitles-parser', 4, 'refactoring', 4)

## Proportion

Proportion between commits and file changes

In [18]:
def metric17(pair, label):
    """
    Total file changes of every commit of one pull request.

    NOTE(review): the notebook text describes this as a proportion, but the
    value computed is the per-commit sum of ``changes`` over the commit's
    files; confirm which one is intended.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, totals)`` where ``totals`` has one entry
        per commit. ``totals`` is ``None`` when there are no commits or when
        the GitHub lookup fails.
    """
    repository, pull = pair
    try:
        pull_request = g.get_repo(repository).get_pull(pull)
        per_commit_changes = [
            sum(every_file.changes for every_file in every_commit.files)
            for every_commit in pull_request.get_commits()
        ]
        # A pull request without commits is reported as missing, not as [].
        result = per_commit_changes or None
    except Exception:
        # Best effort: a failed lookup yields a missing value. Interrupts
        # (KeyboardInterrupt/SystemExit) are deliberately not caught.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric17(pair, label)

('clear-solutions/subtitles-parser', 4, 'refactoring', [155])

## Commit Changes

Total number of changes across all files of all commits in the pull request

In [19]:
def metric18(pair, label):
    """
    Total file changes over all commits of one pull request.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, total)`` where ``total`` is the sum of
        ``changes`` over every file of every commit. ``total`` is ``None``
        when there are no commits or when the GitHub lookup fails.
    """
    repository, pull = pair
    try:
        pull_request = g.get_repo(repository).get_pull(pull)
        commits = list(pull_request.get_commits())
        if commits:
            result = sum(
                every_file.changes
                for every_commit in commits
                for every_file in every_commit.files
            )
        else:
            # No commits is reported as missing rather than as a total of 0.
            result = None
    except Exception:
        # Best effort: a failed lookup yields a missing value. Interrupts
        # (KeyboardInterrupt/SystemExit) are deliberately not caught.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric18(pair, label)

('clear-solutions/subtitles-parser', 4, 'refactoring', 155)

## Labels

The total number of labels on the pull request

In [20]:
def metric22(pair, label):
    """
    Number of labels attached to one pull request.

    Parameters
    ----------
    pair : tuple
        ``(repository, pull)``: the repository name passed to ``g.get_repo``
        and the pull-request number passed to ``get_pull``.
    label : object
        Label of the pull request; returned unchanged.

    Returns
    -------
    tuple
        ``(repository, pull, label, count)`` where ``count`` is
        ``get_labels().totalCount`` of the pull request as an ``int``
        (``0`` when it has no labels), or ``None`` when the GitHub lookup
        fails.
    """
    repository, pull = pair
    try:
        pull_request = g.get_repo(repository).get_pull(pull)
        # Queried once; a label count is never negative, so no guard is needed.
        result = int(pull_request.get_labels().totalCount)
    except Exception:
        # Best effort: a failed lookup yields a missing value. Interrupts
        # (KeyboardInterrupt/SystemExit) are deliberately not caught.
        result = None
    return pair[0], pair[1], label, result

# Try the metric on the sampled pull request.
metric22(pair, label)

('clear-solutions/subtitles-parser', 4, 'refactoring', 0)