# Mineração de dados do Github

### Bibliotecas e arquivos utilizados
Caso não exista o arquivo 'github_token.py' no diretório, deve-se criar esse arquivo com o seguinte conteúdo: 
 
```GITHUB_TOKEN = [PERSONAL_TOKEN]```

Esse token é gerado pelo próprio GitHub e será utilizado para minerar os dados do GitHub.

Para mais informações sobre como gerar o token, acesse: https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/

In [1]:
from github import Github
from github_token import GITHUB_TOKEN
import pandas as pd
import requests
from datetime import datetime
import re
import plotly.express as px
from sklearn.cluster import KMeans

### Inicialmente criamos as instâncias da API do Github
Vamos utilizar o repositório zulip para nossa pesquisa.

In [2]:
# Create the GitHub API client, authenticated with the token imported from github_token.py
g = Github(GITHUB_TOKEN)

# Get a handle on the 'zulip' organization
org = g.get_organization('zulip')

# Get the zulip/zulip repository; later cells read pull requests and issues from it
repo = org.get_repo('zulip')

In [3]:
# Closed PRs: hard-coded pull-request numbers, fetched one by one in the mining cell
ClosedPR = [18239, 16308, 15721, 15683, 15329, 15248, 15065, 14538, 14244, 14028, 13953, 12999, 9425, 8725]
# ClosedPR = []

# Open PRs: hard-coded pull-request numbers, fetched one by one in the mining cell
OpenPR = [19975, 16313, 15505, 12096, 10056, 9448, 8780]
# OpenPR = []

# Disabled alternative: fill both lists from the API by keeping every pull
# request that carries the 'difficult' label (start from the empty lists above).
# NOTE(review): presumably this is how the hard-coded numbers were obtained - confirm.
# label = repo.get_label('difficult')

# for issue in repo.get_pulls(state='closed'):
#     if label in issue.labels:
#         ClosedPR.append(issue.number)

# for issue in repo.get_pulls(state='open'):
#     if label in issue.labels:
#         OpenPR.append(issue.number)

In [4]:
# Column order shared by every row appended to `data`; the row builders
# below must emit their values in exactly this order.
COLUMNS = [
    'Situation', 'Number', 'Imports', 'Changed_files', 'Changes', 'Deletion',
    'Aditions', 'Size_body', 'Create_date', 'Closed_date', 'Merge_date',
    'Diff_MergeLastModify', 'Diff_CreateNow', 'PR_comments',
    'PR_size_comments', 'Review_comments', 'Review_size_comments',
    'Issue_comments', 'Issue_size_comments', 'Qtd_labels', 'Commits',
]

# Patterns counted in the raw diff text (Python and JS/TypeScript).
# The Python pattern matches anywhere in the diff (added, removed and context
# lines alike); the JS pattern only matches text preceded by a literal '+'.
PY_IMPORT_RE = re.compile(r'import \w+')
JS_REQUIRE_RE = re.compile(r'[+]const \w+ = require')


def _human_comment_stats(comments):
    """Return ``(count, total_chars)`` for the comments not written by bots.

    A comment is skipped when its author's login contains the substring
    ``'bot'`` (this is a plain substring test, so any login containing
    those letters is excluded).
    """
    count = 0
    total_chars = 0
    for comment in comments:
        if 'bot' in comment.user.login:
            continue
        count += 1
        total_chars += len(comment.body)
    return count, total_chars


def _parse_last_modified(raw):
    """Parse the space-separated ``last_modified`` string into a naive datetime.

    Uses tokens 1-4 as day, month abbreviation, year and HH:MM:SS.
    Assumes an HTTP-date layout such as 'Wed, 21 Oct 2015 07:28:00 GMT'
    - TODO confirm against the PyGithub version in use.
    """
    tokens = raw.split(' ')
    reordered = ' '.join((tokens[2], tokens[1], tokens[3], tokens[4]))
    return datetime.strptime(reordered, '%b %d %Y %H:%M:%S')


def _pull_request_row(situation, number):
    """Fetch pull request ``number`` from ``repo`` and build its feature row.

    ``situation`` is the label stored in the 'Situation' column
    ('Closed' or 'Open'). The returned list follows ``COLUMNS``.
    """
    pr = repo.get_pull(number)

    # Line-level churn summed over every file touched by the PR.
    changes = 0
    deletions = 0
    additions = 0
    for changed_file in pr.get_files():
        changes += changed_file.changes
        deletions += changed_file.deletions
        additions += changed_file.additions

    # NOTE(review): get_comments() and get_review_comments() may hit the same
    # endpoint in PyGithub, which would make the two size columns identical -
    # confirm before interpreting them as distinct features.
    _, pr_comment_chars = _human_comment_stats(pr.get_comments())

    # Count import-like statements in the raw diff.
    diff_text = requests.get(pr.diff_url).text
    import_count = (len(PY_IMPORT_RE.findall(diff_text))
                    + len(JS_REQUIRE_RE.findall(diff_text)))

    last_modified = _parse_last_modified(pr.last_modified)
    if pr.merged_at is not None:
        merge_to_last_modified = last_modified - pr.merged_at
    else:
        merge_to_last_modified = None

    # Fix: the original wrote `sizeIssueComment + len(...)`, discarding the
    # result, so the issue-comment size stayed 0 for every pull request.
    issue_comment_count, issue_comment_chars = _human_comment_stats(
        pr.get_issue_comments())
    review_comment_count, review_comment_chars = _human_comment_stats(
        pr.get_review_comments())

    return [
        situation,
        number,
        import_count,
        pr.changed_files,
        changes,
        deletions,
        additions,
        len(pr.body or ''),  # body is None when the description is empty
        pr.created_at,
        pr.closed_at,
        pr.merged_at,
        merge_to_last_modified,
        # NOTE(review): datetime.now() is naive local time; verify it is
        # comparable with created_at for the PyGithub version in use.
        datetime.now() - pr.created_at,
        pr.comments,
        pr_comment_chars,
        review_comment_count,
        review_comment_chars,
        issue_comment_count,
        issue_comment_chars,
        len(pr.labels),
        pr.commits,
    ]


def _issue_row(issue):
    """Build the feature row for an issue; PR-only columns get 0 or -1 placeholders."""
    _, comment_chars = _human_comment_stats(issue.get_comments())
    return [
        'Issue',
        issue.number,
        0, 0, 0, 0, 0,
        len(issue.body or ''),  # body is None when the description is empty
        issue.created_at,
        issue.closed_at,
        -1,
        -1,
        datetime.now() - issue.created_at,
        -1, -1, -1, -1,
        issue.comments,
        comment_chars,
        len(issue.labels),
        0,
    ]


data = []

# Closed pull requests
for numberPR in ClosedPR:
    data.append(_pull_request_row('Closed', numberPR))

# Open pull requests
for numberPR in OpenPR:
    data.append(_pull_request_row('Open', numberPR))

# Issues labelled 'difficult'
for issue in repo.get_issues(labels=['difficult']):
    data.append(_issue_row(issue))

df = pd.DataFrame(data, columns=COLUMNS)


In [5]:
# Print a summary of df: one line per column with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   Situation             112 non-null    object         
 1   Number                112 non-null    int64          
 2   Imports               112 non-null    int64          
 3   Changed_files         112 non-null    int64          
 4   Changes               112 non-null    int64          
 5   Deletion              112 non-null    int64          
 6   Aditions              112 non-null    int64          
 7   Size_body             112 non-null    int64          
 8   Create_date           112 non-null    datetime64[ns] 
 9   Closed_date           14 non-null     datetime64[ns] 
 10  Merge_date            96 non-null     object         
 11  Diff_MergeLastModify  96 non-null     object         
 12  Diff_CreateNow        112 non-null    timedelta64[ns]
 13  PR_co

In [6]:
# Replace each textual 'Situation' value with a numeric code, in place.
situation_codes = {'Closed': 0, 'Open': 1, 'Issue': 3}
for situation_name, situation_code in situation_codes.items():
    df.loc[df['Situation'] == situation_name, 'Situation'] = situation_code

# Overwrite every date / duration column with the constant 0, so these
# columns stay in the matrix but no longer vary between rows.
for time_column in ('Create_date', 'Closed_date', 'Merge_date',
                    'Diff_MergeLastModify', 'Diff_CreateNow'):
    df[time_column] = 0

# Matrix form of the whole frame (all 21 columns), used as clustering input.
X = df.values

In [7]:
numberClusters = 2

# Create the estimator; init='random' picks the starting centroids at random.
kmeans = KMeans(n_clusters = numberClusters, init = 'random')

# Fit the model once and keep the cluster id assigned to each row of X.
# The original called fit(X) and then fit_predict(X), which trained the model
# twice; the result of the first run was overwritten by the second.
label = kmeans.fit_predict(X)

In [8]:
# One bucket per cluster id, each collecting the rows of X assigned to it.
clusters = [[] for _ in range(numberClusters)]

# Index the bucket directly with the label so that any value of
# numberClusters works (the original if/elif chain only handled ids 0-2
# and silently dropped rows belonging to any higher cluster id).
for row, cluster in zip(X, kmeans.labels_):
    clusters[cluster].append(row)

In [9]:
# Number of rows that ended up in each cluster, in cluster-id order.
tamanhoCluster = [len(members) for members in clusters]

In [10]:
# Table with one row per cluster: a 1-based display name and the cluster size.
# Built in a single step; the original grew the frame row by row with a manual
# index counter and used `data` as loop variable, which overwrote the
# notebook-level `data` list holding the mined rows.
qtdAlunos = pd.DataFrame({
    'Cluster': ['Cluster ' + str(position)
                for position in range(1, len(tamanhoCluster) + 1)],
    'Quantidade': tamanhoCluster,
})

# Bar chart of cluster sizes.
fig = px.bar(qtdAlunos, x='Cluster', y='Quantidade', title='Quantidade de Issues')
fig.show()