# Mineração de dados do Github

### Bibliotecas e aqreuivos utilizados
Caso não exista o arquivo 'github_token.py' no diretorio, deve-se criar um arquivo com o seguinte conteúdo: 
 
```GITHUB_TOKEN = [PERSONAL_TOKEN]```

Esse token é gerado pelo proprio github, e será utilizado para minerar os dados do github.

Para mais informações sobre como gerar o token, acesse: https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/

In [177]:
from github import Github
from github_token import GITHUB_TOKEN
import pandas as pd
import requests
from datetime import datetime
import re

### Inicialmente criamos as instâncias da API do Github
Vamos utilizar o repositorio zulip para nossa pesquisa.

In [2]:
# criar instancia do github com o token
g = Github(GITHUB_TOKEN)

# criar uma instancia da organizacao
org = g.get_organization('zulip')

# pegar o repositorio zulip/zulip
repo = org.get_repo('zulip')

### Pegando os dados dos PR's e das issues do repositorio

In [178]:
# numero dos PR's fechados e abertos, cujo as labels são dificeis
ClosedPR = [18239, 16308, 15721, 15683, 15329, 15248, 15065, 14538, 14244, 14028, 13953, 12999, 9425, 8725]
OpenPR = [19975, 16313, 15505, 12096, 10056, 9448, 8780]

data = []

# pegar os PR's fechados
for numberPR in ClosedPR:
    pr = repo.get_pull(numberPR)

    changes = 0
    deletions = 0
    aditions = 0

    for file in pr.get_files():
        changes += file.changes
        deletions += file.deletions
        aditions += file.additions
    
    size = 0

    for comment in pr.get_comments():
        if comment.user.login != 'timabbott':
            size += len(comment.body)

    html = requests.get(pr.diff_url)

    # python e typescript
    pythonType = re.findall('import \w+', html.text)
    js = re.findall('[+]const \w+ = require', html.text)

    strTime = pr.last_modified.split(' ')
    strTime = strTime[2] + ' ' + strTime[1] + ' ' + strTime[3] + ' ' + strTime[4]

    lastModified = datetime.strptime(strTime, '%b %d %Y %H:%M:%S')

    if None != pr.merged_at:
        timeDiff = lastModified - pr.merged_at
    else:
        timeDiff = None

    sizeIssueComment = 0
    qtdIssueComment = 0

    for comments in pr.get_issue_comments():
        if 'bot' not in comments.user.login:
            sizeIssueComment + len(comments.body)
            qtdIssueComment += 1

    sizeReviewComment = 0
    qtdReviewComment = 0

    for reviewComments in pr.get_review_comments():
        if 'bot' not in reviewComments.user.login:
            sizeReviewComment += len(reviewComments.body)
            qtdReviewComment += 1

    data.append(['Closed', numberPR, len(pythonType) + len(js), pr.changed_files, changes, deletions, aditions, len(pr.body), pr.created_at, pr.closed_at, pr.merged_at, timeDiff, datetime.now()-pr.created_at, pr.comments, size, qtdReviewComment, sizeReviewComment, qtdIssueComment, sizeIssueComment, len(pr.labels), pr.commits])

# pegar os PR's abertos
for numberPR in OpenPR:
    pr = repo.get_pull(numberPR)

    changes = 0
    deletions = 0
    aditions = 0

    for file in pr.get_files():
        changes += file.changes
        deletions += file.deletions
        aditions += file.additions

    size = 0

    for comment in pr.get_comments():
        if 'bot' not in comment.user.login:
            size += len(comment.body)

    html = requests.get(pr.diff_url)

    # python e typescript
    pythonType = re.findall('import \w+', html.text)
    js = re.findall('[+]const \w+ = require', html.text)

    strTime = pr.last_modified.split(' ')
    strTime = strTime[2] + ' ' + strTime[1] + ' ' + strTime[3] + ' ' + strTime[4]

    lastModified = datetime.strptime(strTime, '%b %d %Y %H:%M:%S')

    if None != pr.merged_at:
        timeDiff = lastModified - pr.merged_at
    else:
        timeDiff = None

    sizeIssueComment = 0
    qtdIssueComment = 0

    for comments in pr.get_issue_comments():
        if 'bot' not in comments.user.login:
            sizeIssueComment + len(comments.body)
            qtdIssueComment += 1

    sizeReviewComment = 0
    qtdReviewComment = 0

    for reviewComments in pr.get_review_comments():
        if 'bot' not in reviewComments.user.login:
            sizeReviewComment += len(reviewComments.body)
            qtdReviewComment += 1

    data.append(['Open', numberPR, len(pythonType) + len(js), pr.changed_files, changes, deletions, aditions, len(pr.body), pr.created_at, pr.closed_at, pr.merged_at, timeDiff, datetime.now()-pr.created_at, pr.comments, size, qtdReviewComment, sizeReviewComment, qtdIssueComment, sizeIssueComment, len(pr.labels), pr.commits])


# pegar issues difíceis
for issue in repo.get_issues(labels=['difficult']):
    size = 0

    for comment in issue.get_coSmments():
        if 'bot' not in comment.user.login:
            size += len(comment.body)
    
    data.append(['Issue', issue.number, 0, 0, 0, 0, 0, len(issue.body), issue.created_at, issue.closed_at, None, None, datetime.now()-issue.created_at, None, None, None, None, issue.comments, size, len(issue.labels), 0])

df = pd.DataFrame(data, columns=['Situation', 'Number', 'Imports', 'Changed_files', 'Changes', 'Deletion', 'Aditions', 'Size_body', 'Create_date','Closed_date', 'Merge_date', 'Diff_MergeLastModify', 'Diff_CreateNow', 'PR_comments', 'PR_size_comments', 'Review_comments', 'Review_size_comments', 'Issue_comments', 'Issue_size_comments', 'Qtd_labels', 'Commits'])
df

Unnamed: 0,Situation,Number,Imports,Changed_files,Changes,Deletion,Aditions,Size_body,Create_date,Closed_date,...,Diff_MergeLastModify,Diff_CreateNow,PR_comments,PR_size_comments,Review_comments,Review_size_comments,Issue_comments,Issue_size_comments,Qtd_labels,Commits
0,Closed,18239,6,2,70,0,70,4804,2021-04-21 22:36:19,2021-04-28 04:50:33,...,200 days 22:11:06,207 days 12:56:54.039096,9.0,1940.0,8.0,1940.0,3,0,5,1
1,Closed,16308,8,9,201,12,189,1437,2020-09-06 12:01:35,2020-09-28 23:15:33,...,NaT,434 days 23:31:42.146807,19.0,405.0,5.0,405.0,9,0,8,5
2,Closed,15721,3,3,36,3,33,742,2020-07-08 14:56:14,2020-07-13 19:45:33,...,NaT,494 days 20:37:05.970302,8.0,23.0,1.0,23.0,4,0,5,1
3,Closed,15683,0,2,55,0,55,98,2020-07-06 07:48:30,2020-07-06 22:48:56,...,490 days 06:35:00,497 days 03:44:52.114616,2.0,0.0,0.0,0.0,0,0,5,1
4,Closed,15329,0,7,106,21,85,211,2020-06-12 08:54:28,2020-06-15 17:54:01,...,NaT,521 days 02:38:57.070514,8.0,598.0,4.0,598.0,3,0,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,Issue,4263,0,0,0,0,0,3147,2017-03-23 04:04:20,NaT,...,NaT,1698 days 07:30:20.541722,,,,,2,265,3,0
108,Issue,4108,0,0,0,0,0,418,2017-03-17 09:49:45,NaT,...,NaT,1704 days 01:44:55.819766,,,,,3,286,3,0
109,Issue,3030,0,0,0,0,0,795,2016-12-31 05:51:09,NaT,...,NaT,1780 days 05:43:32.107844,,,,,6,328,3,0
110,Issue,2755,0,0,0,0,0,2428,2016-12-17 06:17:13,NaT,...,NaT,1794 days 05:17:28.397012,,,,,3,255,3,0
