# Connect to `GitHub GraphQL` API

In [1]:
import requests
import pandas as pd

In [2]:
# Read the GitHub API token from a local text file
with open('../../../Auth_Keys/graphql_api_auth.txt') as file:
    # Strip surrounding whitespace: a trailing newline saved by a text editor
    # would otherwise become part of the Authorization header value.
    token = file.read().strip()

# Authorization header sent with every GraphQL request
headers = {"Authorization": 'Bearer ' + token}

In [3]:
# Function to use requests.post to make an API call
def run_query(query, timeout=30):
    """POST a GraphQL query to the GitHub API and return the decoded JSON body.

    Parameters
    ----------
    query : str
        Complete GraphQL query text.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error; without it a stalled connection would block the cell forever.

    Returns
    -------
    dict
        The parsed JSON response.

    Raises
    ------
    Exception
        If the HTTP status is not 200, or if the response carries GraphQL
        ``errors`` and no ``data`` at all.
    """
    response = requests.post(
        'https://api.github.com/graphql',
        json={'query': query},
        headers=headers,
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(response.status_code, query))

    payload = response.json()

    # GraphQL can answer HTTP 200 and still report a failed query through an
    # "errors" list. Fail here with the server's message when there is no data;
    # partial results (data plus errors) are returned unchanged.
    if payload.get('errors') and payload.get('data') is None:
        raise Exception("Query returned GraphQL errors: {}. {}".format(payload['errors'], query))

    return payload

---

# Random Repos

In [4]:
# Load the pickled table of repositories; its 'owner' and 'name' columns drive the queries below.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle('../../../Files/more_repos/df_repo_5000.pk1')

In [159]:
# Preview the first 12 repositories
df.head(12)

Unnamed: 0,createdAt,description,totalForks,hasIssuesEnabled,hasWikiEnabled,id,totalIssues,licenseInfo,name,nameWithOwner,readMe,owner,primaryLanguage,totalPullRequests,totalCommits,totalStargazers,updatedAt
0,2016-08-21 05:31:51+00:00,An android process bar library associated with...,37,True,True,MDEwOlJlcG9zaXRvcnk2NjE4Mjg1MA==,0,,android_ProcessBar,hzw1199/android_ProcessBar,# Process Bar\n[![](https://jitpack.io/v/hzw11...,hzw1199,Java,0,10.0,199,2019-08-28 06:14:45+00:00
1,2015-10-20 18:22:34+00:00,A ScrollView component that handles keyboard a...,377,True,True,MDEwOlJlcG9zaXRvcnk0NDYyNjI1MA==,272,MIT,react-native-keyboard-aware-scroll-view,APSL/react-native-keyboard-aware-scroll-view,# react-native-keyboard-aware-scroll-view\n\n<...,APSL,JavaScript,110,166.0,3118,2019-09-09 21:17:49+00:00
2,2015-11-24 12:43:44+00:00,Sample app to demonstrate multidex,51,True,True,MDEwOlJlcG9zaXRvcnk0Njc5MTAyMg==,1,,multidex-sample,mmadev/multidex-sample,# multidex-sample\nSample app to demonstrate m...,mmadev,Java,0,2.0,118,2019-03-22 01:44:33+00:00
3,2015-10-05 09:34:22+00:00,multiNetX is a python package for the manipula...,31,True,True,MDEwOlJlcG9zaXRvcnk0MzY3NTc1OQ==,6,,multinetx,nkoub/multinetx,![png](logo.png) multiNetX v2.0 \n=========\n\...,nkoub,Jupyter Notebook,3,70.0,104,2019-08-22 15:04:41+00:00
4,2013-07-18 14:39:58+00:00,Painlessly create beautiful matplotlib plots.,140,True,True,MDEwOlJlcG9zaXRvcnkxMTUwNTIxOA==,64,MIT,prettyplotlib,olgabot/prettyplotlib,# Announcement\n\nThank you to everyone who ha...,olgabot,Python,37,252.0,1482,2019-09-09 13:16:14+00:00
5,2010-08-15 22:59:33+00:00,"Github Repository Finder, now powered by GitHu...",8,True,True,MDEwOlJlcG9zaXRvcnk4Mzk5NTQ=,15,,GithubFinder,sr3d/GithubFinder,# Github Finder\nGithubFinder is a fast Github...,sr3d,JavaScript,2,128.0,126,2019-07-14 10:02:17+00:00
6,2016-04-01 13:53:58+00:00,🍡 LeetCode Online Judge刷题题解(Java/C++/Python/Ru...,113,True,True,MDEwOlJlcG9zaXRvcnk1NTIzNjA1Ng==,6,,LeetCode,liuchuo/LeetCode,- 博客地址：https://www.liuchuo.net/\n\n| ID | ...,liuchuo,C++,2,354.0,327,2019-09-09 03:50:24+00:00
7,2012-03-20 16:17:18+00:00,【鼠鬚管】Rime for macOS,263,True,True,MDEwOlJlcG9zaXRvcnkzNzc3MjEw,316,GPL-3.0,squirrel,rime/squirrel,鼠鬚管\n 爲物雖微情不淺\n 新詩醉墨時一揮\n 別後寄我無辭遠...,rime,Objective-C,42,389.0,2269,2019-09-09 18:56:21+00:00
8,2018-12-01 21:10:44+00:00,Reversible Reproducible Documents,23,True,True,MDEwOlJlcG9zaXRvcnkxNTk5ODc1NjA=,43,NOASSERTION,redoc,noamross/redoc,\n<!-- README.md is generated from README.Rmd....,noamross,R,11,103.0,391,2019-09-09 09:52:39+00:00
9,2019-02-07 03:27:12+00:00,Provable adversarial robustness at ImageNet scale,12,True,True,MDEwOlJlcG9zaXRvcnkxNjk1MTM4MzA=,0,,smoothing,locuslab/smoothing,# Certified Adversarial Robustness via Randomi...,locuslab,Python,4,5.0,83,2019-09-08 08:08:57+00:00


---

---

# `Pull Requests`

## Query x2

In [6]:
# Shared GraphQL text for both pull-request queries.
# Literal braces are doubled because the text is later passed through
# str.format(); "<AFTER>" marks where the optional pagination argument goes.
_PR_QUERY_TEMPLATE = '''
{{
  repositoryOwner(login: "{owner}") {{
    repository(name: "{name}") {{
      pullRequests(first: 100<AFTER>) {{
        pageInfo {{
          endCursor
          hasNextPage
        }}
        nodes {{
          createdAt
          updatedAt
          closedAt
          title
          mergedBy {{
            login
          }}
          author {{
            login
            ... on User {{
              company
            }}
          }}
          authorAssociation
          files {{
            totalCount
          }}
          state
          resourcePath
          bodyText
          comments(first: 25) {{
            totalCount
            nodes {{
              author {{
                login
                ... on User {{
                  company
                }}
              }}
              authorAssociation
              bodyText
            }}
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Initial query: first page, no cursor argument
pr_query = _PR_QUERY_TEMPLATE.replace('<AFTER>', '')

# Subsequent queries: continue after the cursor returned with the previous page
pr_query_2 = _PR_QUERY_TEMPLATE.replace('<AFTER>', ', after:"{end_cursor}"')

# Values substituted into the queries with str.format(**pr_variables)
pr_variables = {
    "end_cursor": "",
    "owner": "",
    "name": ""
}

## Functions to query pull requests

In [7]:
def pr_to_df(result):
    """Build a DataFrame from the pull-request nodes of one response page.

    `result` is the decoded JSON of a pull-request query; each entry of the
    `nodes` list becomes one row.
    """
    pull_requests = result['data']['repositoryOwner']['repository']['pullRequests']
    return pd.DataFrame(pull_requests['nodes'])

def pr_get_page_info(result):
    """Return `(endCursor, hasNextPage)` from the pull-request `pageInfo` of a response."""
    page_info = result['data']['repositoryOwner']['repository']['pullRequests']['pageInfo']
    return page_info['endCursor'], page_info['hasNextPage']

def pr_result_query(query):
    """Run one pull-request query and unpack what the paging loop needs.

    Side effect: stores the page's end cursor in `pr_variables['end_cursor']`
    so the follow-up query can be formatted with it.

    Returns a tuple `(page DataFrame, hasNextPage flag, remaining rate limit)`.
    """
    response = run_query(query)

    # Rows of this page
    page_df = pr_to_df(response)

    # Cursor for the next request and whether another page exists
    next_cursor, more_pages = pr_get_page_info(response)
    pr_variables['end_cursor'] = next_cursor

    # Rate-limit points still available after this call
    remaining = response['data']['rateLimit']['remaining']

    return page_df, more_pages, remaining

---

---

# `Issues`

## Query x2

In [8]:
# Shared GraphQL text for both issue queries.
# Literal braces are doubled because the text is later passed through
# str.format(); "<AFTER>" marks where the optional pagination argument goes.
_IS_QUERY_TEMPLATE = '''
{{
  repositoryOwner(login:"{owner}") {{
    repository(name:"{name}") {{
      issues(first:100<AFTER>) {{
        pageInfo {{
          endCursor
          hasNextPage
        }}
        nodes {{
          createdAt
          closedAt
          updatedAt
          title
          number
          author {{
            login
            ... on User {{
              company
            }}
          }}
          authorAssociation
          state
          bodyText
          comments(first:20) {{
            totalCount
            nodes {{
              author {{
                login
                ... on User {{
                  company
                }}
              }}
              createdAt
              authorAssociation
              bodyText
            }}
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Initial query: first page, no cursor argument
is_query = _IS_QUERY_TEMPLATE.replace('<AFTER>', '')

# Subsequent queries: continue after the cursor returned with the previous page
is_query_2 = _IS_QUERY_TEMPLATE.replace('<AFTER>', ', after:"{end_cursor}"')

# Values substituted into the queries with str.format(**is_variables)
is_variables = {
    "end_cursor": "",
    "owner": "",
    "name": ""
}

## Functions to query issues

In [9]:
def is_to_df(result):
    """Build a DataFrame from the issue nodes of one response page.

    `result` is the decoded JSON of an issues query; each entry of the
    `nodes` list becomes one row.
    """
    repository = result['data']['repositoryOwner']['repository']
    issue_nodes = repository['issues']['nodes']
    return pd.DataFrame(issue_nodes)

def is_get_page_info(result):
    """Return `(endCursor, hasNextPage)` from the issues `pageInfo` of a response."""
    page_info = result['data']['repositoryOwner']['repository']['issues']['pageInfo']
    return page_info['endCursor'], page_info['hasNextPage']

def is_result_query(query):
    """Run one issues query and unpack what the paging loop needs.

    Side effect: stores the page's end cursor in `is_variables['end_cursor']`
    so the follow-up query can be formatted with it.

    Returns a tuple `(page DataFrame, hasNextPage flag, remaining rate limit)`.
    """
    response = run_query(query)

    # Rows of this page
    page_df = is_to_df(response)

    # Cursor for the next request and whether another page exists
    next_cursor, more_pages = is_get_page_info(response)
    is_variables['end_cursor'] = next_cursor

    # Rate-limit points still available after this call
    remaining = response['data']['rateLimit']['remaining']

    return page_df, more_pages, remaining

---

---

# `Commits`

## Query x2

In [10]:
# Shared GraphQL text for both commit-history queries.
# Literal braces are doubled because the text is later passed through
# str.format(); "<AFTER>" marks where the optional pagination argument goes.
#
# The history is read from the repository's default branch rather than a
# hard-coded "master", which yields a null ref for repositories whose default
# branch has another name. The `ref:` alias keeps the response key "ref", so
# code reading result['data']['repository']['ref'] is unaffected.
_CM_QUERY_TEMPLATE = '''
{{
  repository(owner: "{owner}", name: "{name}") {{
    ref: defaultBranchRef {{
      target {{
        ... on Commit {{
          history(first: 100<AFTER>) {{
            pageInfo {{
              hasNextPage
              endCursor
            }}
            nodes {{
              oid
              messageHeadline
              committedDate
              committer {{
                user {{
                  login
                  company
                }}
              }}
              status {{
                id
                state
              }}
              associatedPullRequests(first: 3) {{
                nodes {{
                  id
                  title
                  author {{
                    login
                  }}
                  authorAssociation
                  createdAt
                  updatedAt
                  closedAt
                  number
                  state
                }}
              }}
            }}
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Initial query: first page, no cursor argument
cm_query = _CM_QUERY_TEMPLATE.replace('<AFTER>', '')

# Subsequent queries: continue after the cursor returned with the previous page
cm_query_2 = _CM_QUERY_TEMPLATE.replace('<AFTER>', ', after:"{end_cursor}"')

# Values substituted into the queries with str.format(**cm_variables)
cm_variables = {
    "end_cursor": "",
    "owner": "",
    "name": ""
}

## Functions for Commits

In [11]:
def cm_to_df(result):
    """Build a DataFrame from the commit nodes of one response page.

    `result` is the decoded JSON of a commits query; each entry of the
    history's `nodes` list becomes one row.
    """
    history = result['data']['repository']['ref']['target']['history']
    return pd.DataFrame(history['nodes'])

def cm_get_page_info(result):
    """Return `(endCursor, hasNextPage)` from the commit history `pageInfo` of a response."""
    page_info = result['data']['repository']['ref']['target']['history']['pageInfo']
    return page_info['endCursor'], page_info['hasNextPage']

def cm_result_query(query):
    """Run one commits query and unpack what the paging loop needs.

    Side effect: stores the page's end cursor in `cm_variables['end_cursor']`
    so the follow-up query can be formatted with it.

    Returns a tuple `(page DataFrame, hasNextPage flag, remaining rate limit)`.
    """
    response = run_query(query)

    # Rows of this page
    page_df = cm_to_df(response)

    # Cursor for the next request and whether another page exists
    next_cursor, more_pages = cm_get_page_info(response)
    cm_variables['end_cursor'] = next_cursor

    # Rate-limit points still available after this call
    remaining = response['data']['rateLimit']['remaining']

    return page_df, more_pages, remaining

---

---

# `Stargazers`

## Query x2

In [12]:
# Shared GraphQL text for both stargazer queries.
# Literal braces are doubled because the text is later passed through
# str.format(); "<AFTER>" marks where the optional pagination argument goes.
_ST_QUERY_TEMPLATE = '''
{{
  repositoryOwner(login: "{owner}") {{
    id
    login
    repository(name: "{name}") {{
      id
      name
      createdAt
      updatedAt
      description
      licenseInfo {{
        spdxId
      }}
      stargazers(first:100<AFTER>) {{
        totalCount
        pageInfo {{
          endCursor
          hasNextPage
        }}
        edges {{
          starredAt
          node {{
            createdAt
            updatedAt
            id
            login
            company
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Initial query: first page of stargazers, no cursor argument
st_query = _ST_QUERY_TEMPLATE.replace('<AFTER>', '')

# Subsequent queries: continue after the cursor returned with the previous page
st_query_2 = _ST_QUERY_TEMPLATE.replace('<AFTER>', ', after:"{end_cursor}"')

# Values substituted into the queries with str.format(**st_variables)
st_variables = {
    "end_cursor": "",
    "owner": "",
    "name": ""
}

## Functions for stargazers

In [13]:
def st_to_df(result):
    """Build a DataFrame from the stargazer edges of one response page.

    `result` is the decoded JSON of a stargazers query; each entry of the
    `edges` list becomes one row.
    """
    stargazers = result['data']['repositoryOwner']['repository']['stargazers']
    return pd.DataFrame(stargazers['edges'])

def st_get_page_info(result):
    """Return `(endCursor, hasNextPage)` from the stargazers `pageInfo` of a response."""
    page_info = result['data']['repositoryOwner']['repository']['stargazers']['pageInfo']
    return page_info['endCursor'], page_info['hasNextPage']

def st_result_query(query):
    """Run one stargazers query and unpack what the paging loop needs.

    Side effect: stores the page's end cursor in `st_variables['end_cursor']`
    so the follow-up query can be formatted with it.

    Returns a tuple `(page DataFrame, hasNextPage flag, remaining rate limit)`.
    """
    response = run_query(query)

    # Rows of this page
    page_df = st_to_df(response)

    # Cursor for the next request and whether another page exists
    next_cursor, more_pages = st_get_page_info(response)
    st_variables['end_cursor'] = next_cursor

    # Rate-limit points still available after this call
    remaining = response['data']['rateLimit']['remaining']

    return page_df, more_pages, remaining

---

---

# `Run Queries`

## Set index from the random repos

In [271]:
# Row label in df of the repository to collect; the cells below read df['owner'][i] and df['name'][i]
i = 23

## Make new Folder to store the repo data

In [272]:
import os

# Folder that receives the pickle files of the selected repository
repo_dir = '../../../Files/more_repos/' + df['owner'][i] + '-' + df['name'][i]

try:
    os.mkdir(repo_dir)
    print('Made Folder')
except FileExistsError:
    # Only an already-existing folder is an expected outcome. Any other failure
    # (missing parent directory, permissions, bad index) now surfaces instead
    # of being reported as "Folder exists".
    print('Folder exists')

Made Folder


## Run PR Query

In [273]:
%%time

# Point the pull-request queries at the repository in row i of df
pr_variables['owner'] = df['owner'][i]
pr_variables['name'] = df['name'][i]

##### Comment out this section if there's an error #####
# Run first query
df_pr, has_cursor_pr, limit = pr_result_query(pr_query.format(**pr_variables))

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor_pr, '\n')

##### Comment out this section if there's an error #####

# Keep paging while another page exists and rate-limit points remain.
# The guard must be `> 0`: `remaining` does not go below zero, so `>= 0`
# would never stop the loop once the quota is used up.
while has_cursor_pr and (limit > 0):

    # Run next queries
    df_pr_2, has_cursor_pr, limit = pr_result_query(pr_query_2.format(**pr_variables))

    # Concat to existing df_pr on every iteration, so the pages fetched so far
    # are kept if a later request raises and the loop is re-run on its own
    df_pr = pd.concat([df_pr, df_pr_2])

    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor_pr, '\n')

# Make an incomplete download visible instead of ending silently
if has_cursor_pr:
    print('Stopped before the last page: rate limit exhausted. Resume after cursor', pr_variables['end_cursor'])

Starting limit is:  4564
Start cursor:  False 

Wall time: 371 ms


### Save PR as a pickle file

In [274]:
# Assemble <repo folder>/<owner>_<name>_pr.pk1 from the same pieces, then write the frame
pr_pickle_path = ''.join([
    '../../../Files/more_repos/',
    pr_variables['owner'], '-', pr_variables['name'], '/',
    pr_variables['owner'], '_', pr_variables['name'], '_pr', '.pk1',
])
df_pr.to_pickle(pr_pickle_path)

---

## Run Issues Query

In [275]:
%%time

# Point the issue queries at the repository in row i of df
is_variables['owner'] = df['owner'][i]
is_variables['name'] = df['name'][i]

##### Comment out this section if there's an error #####
# Run first query
df_is, has_cursor_is, limit = is_result_query(is_query.format(**is_variables))

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor_is, '\n')

##### Comment out this section if there's an error #####

# Keep paging while another page exists and rate-limit points remain.
# The guard must be `> 0`: `remaining` does not go below zero, so `>= 0`
# would never stop the loop once the quota is used up.
while has_cursor_is and (limit > 0):

    # Run next queries
    df_is_2, has_cursor_is, limit = is_result_query(is_query_2.format(**is_variables))

    # Concat to existing df_is on every iteration, so the pages fetched so far
    # are kept if a later request raises and the loop is re-run on its own
    df_is = pd.concat([df_is, df_is_2])

    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor_is, '\n')

# Make an incomplete download visible instead of ending silently
if has_cursor_is:
    print('Stopped before the last page: rate limit exhausted. Resume after cursor', is_variables['end_cursor'])

Starting limit is:  4563
Start cursor:  False 

Wall time: 659 ms


### Save Issues as a pickle file

In [276]:
# Assemble <repo folder>/<owner>_<name>_is.pk1 from the same pieces, then write the frame
is_pickle_path = ''.join([
    '../../../Files/more_repos/',
    is_variables['owner'], '-', is_variables['name'], '/',
    is_variables['owner'], '_', is_variables['name'], '_is', '.pk1',
])
df_is.to_pickle(is_pickle_path)

---

## Run Commits Query

In [None]:
%%time

# Point the commit queries at the repository in row i of df
cm_variables['owner'] = df['owner'][i]
cm_variables['name'] = df['name'][i]

##### Comment Out this section if there's an error #####

# Run first query
df_commits, has_cursor_cm, limit = cm_result_query(cm_query.format(**cm_variables))

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor_cm, '\n')

##### Comment Out this section if there's an error #####

# Keep paging while another page exists and rate-limit points remain.
# The guard must be `> 0`: `remaining` does not go below zero, so `>= 0`
# would never stop the loop once the quota is used up.
while has_cursor_cm and (limit > 0):

    # Run next queries
    df_commits_2, has_cursor_cm, limit = cm_result_query(cm_query_2.format(**cm_variables))

    # Concat to existing df_commits on every iteration, so the pages fetched so
    # far are kept if a later request raises and the loop is re-run on its own
    df_commits = pd.concat([df_commits, df_commits_2])

    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor_cm, '\n')

# Make an incomplete download visible instead of ending silently
if has_cursor_cm:
    print('Stopped before the last page: rate limit exhausted. Resume after cursor', cm_variables['end_cursor'])

Starting limit is:  4562
Start cursor:  True 



### Save Commits as a pickle file

In [None]:
# Assemble <repo folder>/<owner>_<name>_cm.pk1 from the same pieces, then write the frame
cm_pickle_path = ''.join([
    '../../../Files/more_repos/',
    cm_variables['owner'], '-', cm_variables['name'], '/',
    cm_variables['owner'], '_', cm_variables['name'], '_cm', '.pk1',
])
df_commits.to_pickle(cm_pickle_path)

---

## Run Stargazer Query

In [None]:
%%time

# Point the stargazer queries at the repository in row i of df
st_variables['owner'] = df['owner'][i]
st_variables['name'] = df['name'][i]

##### Comment Out this section if there's an error #####

# Run first query
df_star, has_cursor_st, limit = st_result_query(st_query.format(**st_variables))

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor_st, '\n')

##### Comment Out this section if there's an error #####

# Keep paging while another page exists and rate-limit points remain.
# The guard must be `> 0`: `remaining` does not go below zero, so `>= 0`
# would never stop the loop once the quota is used up.
while has_cursor_st and (limit > 0):

    # Run next queries
    df_star_2, has_cursor_st, limit = st_result_query(st_query_2.format(**st_variables))

    # Concat to existing df_star on every iteration, so the pages fetched so far
    # are kept if a later request raises and the loop is re-run on its own
    df_star = pd.concat([df_star, df_star_2])

    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor_st, '\n')

# Make an incomplete download visible instead of ending silently
if has_cursor_st:
    print('Stopped before the last page: rate limit exhausted. Resume after cursor', st_variables['end_cursor'])

### Save stargazers to pickle

In [None]:
# Assemble <repo folder>/<owner>_<name>_st.pk1 from the same pieces, then write the frame
st_pickle_path = ''.join([
    '../../../Files/more_repos/',
    st_variables['owner'], '-', st_variables['name'], '/',
    st_variables['owner'], '_', st_variables['name'], '_st', '.pk1',
])
df_star.to_pickle(st_pickle_path)

---