# Connect to `GitHub GraphQL` API

In [1]:
import requests
import pandas as pd

In [2]:
# Read the personal access token from a key file kept outside the project tree
with open('../../../Auth_Keys/graphql_api_auth.txt') as file:
    # Strip surrounding whitespace: a trailing newline saved by an editor would
    # otherwise become part of the Authorization header value
    token = file.read().strip()

# Authorization header sent with every GraphQL request
headers = {"Authorization": 'Bearer ' + token}

In [3]:
# Function to use requests.post to make an API call
def run_query(query):
    """POST a GraphQL query to the GitHub API and return the decoded JSON body.

    Parameters
    ----------
    query : str
        Complete GraphQL query text.

    Returns
    -------
    dict
        The decoded JSON response.

    Raises
    ------
    Exception
        If the HTTP status is not 200, or if the response carries an
        ``errors`` list and no ``data`` at all.
    """
    response = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if response.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(response.status_code, query))

    payload = response.json()

    # A GraphQL server can answer HTTP 200 and still report a failed query through
    # an "errors" list. When no data came back at all, raise here with the server's
    # message instead of letting callers fail later while indexing into None.
    if isinstance(payload, dict) and payload.get('errors') and payload.get('data') is None:
        raise Exception("Query returned errors: {}. {}".format(payload['errors'], query))

    return payload

---

# Ten Random Repos

In [39]:
# Load the previously saved sample of repositories into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you created yourself.
df = pd.read_pickle('../../../Files/df_repo_10.pk1')

In [40]:
# Show the loaded repositories
df

Unnamed: 0,createdAt,description,totalForks,hasIssuesEnabled,hasWikiEnabled,id,totalIssues,licenseInfo,name,nameWithOwner,readMe,owner,primaryLanguage,totalPullRequests,totalCommits,totalStargazers,updatedAt
0,2013-01-19 02:21:47+00:00,A game-theoretic poker player (written in 2005),30,True,True,MDEwOlJlcG9zaXRvcnk3Njk4MTkx,0,,game-theory-poker,adamsmith/game-theory-poker,game-theory-poker\n=====\n\nThis is a game-the...,adamsmith,Java,1,2,167,2019-08-13 15:14:46+00:00
1,2014-04-01 22:40:40+00:00,The Google I/O 2019 Android App,5931,True,False,MDEwOlJlcG9zaXRvcnkxODM0NzQ3Ng==,192,NOASSERTION,iosched,google/iosched,Google I/O Android App\n======================...,google,Kotlin,134,2635,18987,2019-09-03 15:52:27+00:00
2,2014-07-17 04:50:32+00:00,"SEX IS ZERO (0), so, who wanna be the ONE (1),...",1959,True,True,MDEwOlJlcG9zaXRvcnkyMTkyOTAyOQ==,76,GPL-2.0,hardseed,yangyangwithgnu/hardseed,"<h1 align=""center"">给不了你梦中情人，至少还有硬盘女神：hardseed<...",yangyangwithgnu,C++,12,197,9102,2019-09-02 12:40:54+00:00
3,2017-03-21 11:05:58+00:00,在线地址: http://www.jobinfo.cc:8000/,91,True,True,MDEwOlJlcG9zaXRvcnk4NTY5MjA1Mg==,8,MIT,webspider,iven-he/webspider,# \n\n[![Build Status](https://travis-ci.org/G...,iven-he,Python,8,51,260,2019-08-29 15:43:08+00:00
4,2012-09-14 14:19:46+00:00,Using Spring from within a Play 2.0 application,24,False,False,MDEwOlJlcG9zaXRvcnk1ODEwMjM4,0,,play20-spring-demo,guillaumebort/play20-spring-demo,Using Spring from within a Play 2.0 applicatio...,guillaumebort,Java,3,3,88,2019-05-31 11:35:43+00:00
5,2013-08-14 10:22:25+00:00,File based encrypted key-value store,46,True,True,MDEwOlJlcG9zaXRvcnkxMjEwNjE5Mg==,151,MIT,trousseau,oleiade/trousseau,"![Trousseau, a portable encrypted keyring](tro...",oleiade,Go,46,547,881,2019-08-26 16:50:04+00:00
6,2014-09-16 03:58:52+00:00,Embulk: Pluggable Bulk Data Loader.,166,True,False,MDEwOlJlcG9zaXRvcnkyNDA4NDczMA==,391,NOASSERTION,embulk,embulk/embulk,# What's Embulk?\n\nEmbulk is a parallel bulk ...,embulk,Java,786,2646,1324,2019-09-03 07:27:25+00:00
7,2018-01-28 08:18:24+00:00,🌕 🌖 🌗 🌘 🌑 🌒 🌓 🌔Imitate Cosmos - a special thir...,72,True,True,MDEwOlJlcG9zaXRvcnkxMTkyNDA0NTM=,10,GPL-2.0,ZHNCosmos,zhnnnnn/ZHNCosmos,# Imitate Cosmos - a special third party weibo...,zhnnnnn,Objective-C,0,37,451,2019-08-30 06:26:09+00:00
8,2016-04-27 23:40:22+00:00,C++ library and programs that demonstrate mesh...,103,True,True,MDEwOlJlcG9zaXRvcnk1NzI1NTk3Mg==,2,NOASSERTION,Mesh-processing-library,microsoft/Mesh-processing-library,# Mesh-processing-library\nSee [README.html](h...,microsoft,C++,0,42,399,2019-09-03 03:55:23+00:00


In [42]:
# Owner and name of the repository stored under index label 0
df.loc[0, 'owner'], df.loc[0, 'name']

('adamsmith', 'game-theory-poker')

---

---

# `Pull Requests`

## Query x2

In [59]:
# Template for the first page of pull requests (no cursor yet).
# Doubled braces are literal GraphQL braces; {owner} and {name} are filled in with str.format.
pr_query = '''
{{
  repositoryOwner(login: "{owner}") {{
    repository(name: "{name}") {{
      pullRequests(first: 100) {{
        pageInfo {{
          endCursor
          hasNextPage
        }}
        nodes {{
          createdAt
          updatedAt
          closedAt
          title
          mergedBy {{
            login
          }}
          author {{
            login
            ... on User {{
              company
            }}
          }}
          authorAssociation
          files {{
            totalCount
          }}
          state
          resourcePath
          bodyText
          comments(first: 25) {{
            totalCount
            nodes {{
              author {{
                login
                ... on User {{
                  company
                }}
              }}
              authorAssociation
              bodyText
            }}
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Template for every following page: the same selection, continued after {end_cursor}.
# It is derived from pr_query so the two field lists cannot drift apart; the extra
# trailing newline is insignificant whitespace in GraphQL.
pr_query_2 = pr_query.replace(
    'pullRequests(first: 100)',
    'pullRequests(first: 100, after:"{end_cursor}")',
) + '\n'

# Values substituted into the templates; pr_result_query overwrites end_cursor after each page
pr_variables = dict(end_cursor="", owner="", name="")

## Functions to query pr

In [55]:
def pr_to_df(result):
    """Turn the pull-request nodes of a query response into a DataFrame.

    Parameters
    ----------
    result : dict
        Decoded response holding
        ``data.repositoryOwner.repository.pullRequests.nodes``.

    Returns
    -------
    pandas.DataFrame
        Built directly from the list of nodes.
    """
    pull_requests = result['data']['repositoryOwner']['repository']['pullRequests']
    return pd.DataFrame(pull_requests['nodes'])

def pr_get_page_info(result):
    """Return ``(endCursor, hasNextPage)`` from the pull-request pageInfo of a response."""
    page_info = result['data']['repositoryOwner']['repository']['pullRequests']['pageInfo']
    return page_info['endCursor'], page_info['hasNextPage']

def pr_result_query(query):
    """Run one pull-request query and unpack what the paging loop needs.

    Side effect: stores the page's end cursor in ``pr_variables['end_cursor']``.

    Parameters
    ----------
    query : str
        Fully formatted GraphQL query.

    Returns
    -------
    tuple
        ``(DataFrame of this page, hasNextPage flag, remaining rate limit)``.
    """
    response = run_query(query)

    # Rows of this page
    page_df = pr_to_df(response)

    # Remember where the next page starts and whether one exists
    next_cursor, more_pages = pr_get_page_info(response)
    pr_variables['end_cursor'] = next_cursor

    # Requests left in the current rate-limit window
    remaining = response['data']['rateLimit']['remaining']

    return page_df, more_pages, remaining

---

---

# `Issues`

## Query x2

In [87]:
# Template for the first page of issues (no cursor yet).
# Doubled braces are literal GraphQL braces; {owner} and {name} are filled in with str.format.
is_query = '''
{{
  repositoryOwner(login:"{owner}") {{
    repository(name:"{name}") {{
      issues(first:100) {{
        pageInfo {{
          endCursor
          hasNextPage
        }}
        nodes {{
          createdAt
          closedAt
          updatedAt
          title
          number
          author {{
            login
            ... on User {{
              company
            }}
          }}
          authorAssociation
          state
          bodyText
          comments(first:20) {{
            totalCount
            nodes {{
              author {{
                login
                ... on User {{
                  company
                }}
              }}
              createdAt
              authorAssociation
              bodyText
            }}
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Template for every following page: the same selection, continued after {end_cursor}.
# Derived from is_query so the two field lists cannot drift apart.
is_query_2 = is_query.replace(
    'issues(first:100)',
    'issues(first:100, after:"{end_cursor}")',
)

# Values substituted into the templates; is_result_query overwrites end_cursor after each page
is_variables = dict(end_cursor="", owner="", name="")

## Functions to query issues

In [88]:
def is_to_df(result):
    """Turn the issue nodes of a query response into a DataFrame.

    Parameters
    ----------
    result : dict
        Decoded response holding
        ``data.repositoryOwner.repository.issues.nodes``.

    Returns
    -------
    pandas.DataFrame
        Built directly from the list of nodes.
    """
    issues = result['data']['repositoryOwner']['repository']['issues']
    return pd.DataFrame(issues['nodes'])

def is_get_page_info(result):
    """Return ``(endCursor, hasNextPage)`` from the issues pageInfo of a response."""
    page_info = result['data']['repositoryOwner']['repository']['issues']['pageInfo']
    return page_info['endCursor'], page_info['hasNextPage']

def is_result_query(query):
    """Run one issues query and unpack what the paging loop needs.

    Side effect: stores the page's end cursor in ``is_variables['end_cursor']``.

    Parameters
    ----------
    query : str
        Fully formatted GraphQL query.

    Returns
    -------
    tuple
        ``(DataFrame of this page, hasNextPage flag, remaining rate limit)``.
    """
    response = run_query(query)

    # Rows of this page
    page_df = is_to_df(response)

    # Remember where the next page starts and whether one exists
    next_cursor, more_pages = is_get_page_info(response)
    is_variables['end_cursor'] = next_cursor

    # Requests left in the current rate-limit window
    remaining = response['data']['rateLimit']['remaining']

    return page_df, more_pages, remaining

---

---

# `Commits`

## Query x2

In [100]:
# Template for the first page of commit history (no cursor yet).
# Doubled braces are literal GraphQL braces; {owner}, {name} and {branch} are filled in
# with str.format. The branch is a placeholder so that repositories whose history lives
# on a branch other than "master" can be queried by changing cm_variables['branch'].
cm_query = '''
{{
  repository(owner: "{owner}", name: "{name}") {{
    ref(qualifiedName: "{branch}") {{
      target {{
        ... on Commit {{
          history(first: 100) {{
            pageInfo {{
              hasNextPage
              endCursor
            }}
            nodes {{
              oid
              messageHeadline
              committedDate
              committer {{
                user {{
                  login
                  company
                }}
              }}
              status {{
                id
                state
              }}
              associatedPullRequests(first: 3) {{
                nodes {{
                  id
                  title
                  author {{
                    login
                  }}
                  authorAssociation
                  createdAt
                  updatedAt
                  closedAt
                  number
                  state
                }}
              }}
            }}
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Template for every following page: the same selection, continued after {end_cursor}.
# Derived from cm_query so the two field lists cannot drift apart.
cm_query_2 = cm_query.replace(
    'history(first: 100)',
    'history(first: 100, after:"{end_cursor}")',
)

# Values substituted into the templates. cm_result_query overwrites end_cursor after
# each page; branch defaults to "master", the value the queries used to hard-code.
cm_variables = {
    "end_cursor": "",
    "owner": "",
    "name": "",
    "branch": "master"
}

## Functions for Commits

In [94]:
def cm_to_df(result):
    """Turn the commit-history nodes of a query response into a DataFrame.

    Parameters
    ----------
    result : dict
        Decoded response holding
        ``data.repository.ref.target.history.nodes``.

    Returns
    -------
    pandas.DataFrame
        Built directly from the list of nodes.
    """
    history = result['data']['repository']['ref']['target']['history']
    return pd.DataFrame(history['nodes'])

def cm_get_page_info(result):
    """Return ``(endCursor, hasNextPage)`` from the commit-history pageInfo of a response."""
    page_info = result['data']['repository']['ref']['target']['history']['pageInfo']
    return page_info['endCursor'], page_info['hasNextPage']

def cm_result_query(query):
    """Run one commit-history query and unpack what the paging loop needs.

    Side effect: stores the page's end cursor in ``cm_variables['end_cursor']``.

    Parameters
    ----------
    query : str
        Fully formatted GraphQL query.

    Returns
    -------
    tuple
        ``(DataFrame of this page, hasNextPage flag, remaining rate limit)``.
    """
    response = run_query(query)

    # Rows of this page
    page_df = cm_to_df(response)

    # Remember where the next page starts and whether one exists
    next_cursor, more_pages = cm_get_page_info(response)
    cm_variables['end_cursor'] = next_cursor

    # Requests left in the current rate-limit window
    remaining = response['data']['rateLimit']['remaining']

    return page_df, more_pages, remaining

## Run commits query

In [101]:
%%time

# Set index
cm_variables['owner'] = df['owner'][0]
cm_variables['name'] = df['name'][0]

##### Comment Out this section if there's an error #####

# Run first query
df_commits, has_cursor, limit = cm_result_query(cm_query.format(**cm_variables))

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor, '\n')

##### Comment Out this section if there's an error #####

# Run While Loop below
while has_cursor and (limit >= 0):
    
    # Run next queries
    df_commits_2, has_cursor, limit = result_query(cm_query_2.format(**cm_variables))

    # Concat to existing df_star
    df_commits = pd.concat([df_commits, df_commits_2])
    
    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor, '\n')

Starting limit is:  4999
Start cursor:  False 

Wall time: 540 ms


In [102]:
# Show the commits that were fetched
df_commits

Unnamed: 0,associatedPullRequests,committedDate,committer,messageHeadline,oid,status
0,{'nodes': []},2013-01-19T02:25:04Z,"{'user': {'login': 'adamsmith', 'company': '@k...",Initial commit,6fa9d57ffb25b1b3bf9d714d26f0d4691881436b,
1,{'nodes': []},2013-01-19T02:21:47Z,"{'user': {'login': 'adamsmith', 'company': '@k...",Initial commit,cb10cb8b2c7d194f75930f423714b5054420734f,


## Save commits to pickle

In [103]:
# Write the commits to Files/<owner>/<owner>_<name>_commits.pk1.
# The commits-specific suffix keeps this file from overwriting the pull-request
# pickle, which is saved under the same owner/name with the '_pr' suffix.
df_commits.to_pickle('../../../Files/' + cm_variables['owner'] + '/' + cm_variables['owner'] 
                + '_' + cm_variables['name'] + '_commits' + '.pk1')

---

---

# `Stargazers`

## Query x2

In [121]:
# Template for the first page of stargazers (no cursor yet).
# Doubled braces are literal GraphQL braces; {owner} and {name} are filled in with str.format.
st_query = '''
{{
  repositoryOwner(login: "{owner}") {{
    id
    login
    repository(name: "{name}") {{
      id
      name
      createdAt
      updatedAt
      description
      licenseInfo {{
        spdxId
      }}
      stargazers(first:100) {{
        totalCount
        pageInfo {{
          endCursor
          hasNextPage
        }}
        edges {{
          starredAt
          node {{
            createdAt
            updatedAt
            id
            login
            company
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Template for every following page: the same selection, continued after {end_cursor}.
# Derived from st_query so the two field lists cannot drift apart.
st_query_2 = st_query.replace(
    'stargazers(first:100)',
    'stargazers(first:100, after:"{end_cursor}")',
)

# Values substituted into the templates; st_result_query overwrites end_cursor after each page
st_variables = dict(end_cursor="", owner="", name="")

## Functions for stargazers

In [122]:
def st_to_df(result):
    """Turn the stargazer edges of a query response into a DataFrame.

    Parameters
    ----------
    result : dict
        Decoded response holding
        ``data.repositoryOwner.repository.stargazers.edges``.

    Returns
    -------
    pandas.DataFrame
        Built directly from the list of edges.
    """
    stargazers = result['data']['repositoryOwner']['repository']['stargazers']
    return pd.DataFrame(stargazers['edges'])

def st_get_page_info(result):
    """Return ``(endCursor, hasNextPage)`` from the stargazers pageInfo of a response."""
    page_info = result['data']['repositoryOwner']['repository']['stargazers']['pageInfo']
    return page_info['endCursor'], page_info['hasNextPage']

def st_result_query(query):
    """Run one stargazers query and unpack what the paging loop needs.

    Side effect: stores the page's end cursor in ``st_variables['end_cursor']``.

    Parameters
    ----------
    query : str
        Fully formatted GraphQL query.

    Returns
    -------
    tuple
        ``(DataFrame of this page, hasNextPage flag, remaining rate limit)``.
    """
    response = run_query(query)

    # Rows of this page
    page_df = st_to_df(response)

    # Remember where the next page starts and whether one exists
    next_cursor, more_pages = st_get_page_info(response)
    st_variables['end_cursor'] = next_cursor

    # Requests left in the current rate-limit window
    remaining = response['data']['rateLimit']['remaining']

    return page_df, more_pages, remaining

---

---

# `Run Queries`

## Run PR Query

In [None]:
%%time

# Set owner and name for pr_variables by changing the index
pr_variables['owner'] = df['owner'][0]
pr_variables['name'] = df['name'][0]

##### Comment Out this section if there's a error #####
# Run first query
df_pr, has_cursor, limit = pr_result_query(pr_query.format(**pr_variables))

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor, '\n')

##### Comment Out this section if there's a error #####

# Run While Loop below
while has_cursor and (limit >= 0):
    
    # Run next queries
    df_pr_2, has_cursor, limit = pr_result_query(pr_query_2.format(**pr_variables))

    # Concat to existing df_star
    df_pr = pd.concat([df_pr, df_pr_2])
    
    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor, '\n')

### Save PR as a pickle file

In [None]:
# Persist the pull requests as Files/<owner>/<owner>_<name>_pr.pk1
df_pr.to_pickle(
    '../../../Files/{owner}/{owner}_{name}_pr.pk1'.format(**pr_variables)
)

---

## Run Issues Query

In [None]:
%%time

# Set owner and name for pr_variables by changing the index
is_variables['owner'] = df['owner'][0]
is_variables['name'] = df['name'][0]

##### Comment Out this section if there's a error #####
# Run first query
df_is, has_cursor, limit = is_result_query(is_query.format(**is_variables))

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor, '\n')

##### Comment Out this section if there's a error #####

# Run While Loop below
while has_cursor and (limit >= 0):
    
    # Run next queries
    df_is_2, has_cursor, limit = is_result_query(is_query_2.format(**is_variables))

    # Concat to existing df_star
    df_is = pd.concat([df_is, df_is_2])
    
    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor, '\n')

### Save Issues as a pickle file

In [None]:
# Write the issues to Files/<owner>/<owner>_<name>_issues.pk1.
# The issues-specific suffix keeps this file from overwriting the pull-request
# pickle, which is saved under the same owner/name with the '_pr' suffix.
df_is.to_pickle('../../../Files/' + is_variables['owner'] + '/' + is_variables['owner'] 
                + '_' + is_variables['name'] + '_issues' + '.pk1')

---

## Run Commits Query

In [None]:
%%time

# Set index
cm_variables['owner'] = df['owner'][0]
cm_variables['name'] = df['name'][0]

##### Comment Out this section if there's an error #####

# Run first query
df_commits, has_cursor, limit = cm_result_query(cm_query.format(**cm_variables))

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor, '\n')

##### Comment Out this section if there's an error #####

# Run While Loop below
while has_cursor and (limit >= 0):
    
    # Run next queries
    df_commits_2, has_cursor, limit = result_query(cm_query_2.format(**cm_variables))

    # Concat to existing df_star
    df_commits = pd.concat([df_commits, df_commits_2])
    
    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor, '\n')

### Save Commits as a pickle file

In [None]:
# Write the commits to Files/<owner>/<owner>_<name>_commits.pk1.
# The commits-specific suffix keeps this file from overwriting the pull-request
# pickle, which is saved under the same owner/name with the '_pr' suffix.
df_commits.to_pickle('../../../Files/' + cm_variables['owner'] + '/' + cm_variables['owner'] 
                + '_' + cm_variables['name'] + '_commits' + '.pk1')

---

## Run Stargazer Query

In [125]:
%%time

# Set index
st_variables['owner'] = df['owner'][0]
st_variables['name'] = df['name'][0]

##### Comment Out this section if there's an error #####

# Run first query
df_star, has_cursor, limit = st_result_query(st_query.format(**st_variables))

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor, '\n')

##### Comment Out this section if there's an error #####

# Run While Loop below
while has_cursor and (limit >= 0):
    
    # Run next queries
    df_star_2, has_cursor, limit = st_result_query(st_query_2.format(**st_variables))

    # Concat to existing df_star
    df_star = pd.concat([df_star, df_star_2])
    
    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor, '\n')

Starting limit is:  4992
Start cursor:  True 

Next limit is:  4991
Next cursor:  False 

Wall time: 1.04 s


## Save stargazers to pickle

In [128]:
# Write the stargazers to Files/<owner>/<owner>_<name>_stargazers.pk1.
# The stargazers-specific suffix keeps this file from overwriting the pull-request
# pickle, which is saved under the same owner/name with the '_pr' suffix.
df_star.to_pickle('../../../Files/' + st_variables['owner'] + '/' + st_variables['owner'] 
                + '_' + st_variables['name'] + '_stargazers' + '.pk1')

---