# Rate-Limited Querying of Github's GraphQL API

In [1]:
import pandas as pd
import requests

In [2]:
# Read the personal access token from disk. The context manager closes the
# file handle as soon as the token has been read, instead of leaving it open
# for the lifetime of the kernel.
# NOTE(review): the path is an absolute, machine-specific location; consider
# reading the token from an environment variable so the notebook is portable.
with open("/home/joseph/graphql_token.txt", "r") as file:
    api_token = file.read().strip()

# Endpoint and authorization header shared by every request in this notebook.
url = "https://api.github.com/graphql"
headers = {"Authorization": "token %s" % api_token}

In [3]:
def query(json: dict, timeout: float = 30) -> dict:
    """POST a GraphQL payload to the configured endpoint and return the decoded body.

    Parameters
    ----------
    json : dict
        Request payload, e.g. ``{"query": "<GraphQL document>"}``. It is
        serialised to JSON by ``requests``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` may block indefinitely.

    Returns
    -------
    dict
        The response body parsed from JSON.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error body is
        not mistaken for query data further down the notebook.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    # ``url`` and ``headers`` are the module-level values defined in the setup cell.
    r = requests.post(url=url, json=json, headers=headers, timeout=timeout)
    r.raise_for_status()
    return r.json()

### Sample Query

The response body is JSON; `query()` decodes it with `r.json()`, so the result is a Python `dict`.

In [4]:
# Sample payload: asks for the first repository of the authenticated viewer,
# together with the total repository count and the pagination info
# (hasNextPage / endCursor).
json = {
    "query": "{ viewer { repositories(first: 1) { totalCount pageInfo { hasNextPage endCursor } edges { node { name } } } } }"
}

# Last expression of the cell, so the decoded response is displayed as output.
query(json)

{'data': {'viewer': {'repositories': {'totalCount': 38,
    'pageInfo': {'hasNextPage': True, 'endCursor': 'Y3Vyc29yOnYyOpHOAvjC2Q=='},
    'edges': [{'node': {'name': 'Yendors-Analysis'}}]}}}}

This query returns the number of remaining node queries, as well as the time at which the limit will be reset. The response can be passed into a pandas `DataFrame` using `pd.DataFrame()`, and from there the remaining limit and reset time can be parsed to allow for rate-limited programmatic scraping of GitHub's GraphQL API.

In [5]:
# GraphQL document requesting the viewer's login plus the rateLimit fields
# (limit, cost, remaining, resetAt) used for throttling later in the notebook.
query_text = """query {
  viewer {
    login
  }
  rateLimit {
    limit
    cost
    remaining
    resetAt
  }
}"""

# Wrap the document in the payload shape expected by ``query`` and display
# the decoded response.
json = {"query": query_text}
query(json)

{'data': {'viewer': {'login': 'beverast'},
  'rateLimit': {'limit': 5000,
   'cost': 1,
   'remaining': 4961,
   'resetAt': '2019-08-24T00:26:27Z'}}}

### Ingest GraphQL Responses Into a DataFrame
1. Query the endpoint, ingest as a DataFrame from JSON

In [6]:
# Build the frame from the decoded response and, in the same expression, move
# the row labels into a regular "index" column.
limit_df = pd.DataFrame(query(json)).reset_index()

In [7]:
# Preview the frame built from the rate-limit response.
limit_df.head()

Unnamed: 0,index,data
0,rateLimit,"{'limit': 5000, 'cost': 1, 'remaining': 4960, ..."
1,viewer,{'login': 'beverast'}


2. Create columns for the necessary data: `remaining` and `resetAt`

In [8]:
# Select the rateLimit payload by its label rather than by position: the row
# order of a frame built from nested dicts is not guaranteed to put rateLimit
# first, and integer-as-position lookup on a string-labelled Series
# (``row[1]``) is deprecated in recent pandas releases.
rate_limit = limit_df.loc[limit_df["index"] == "rateLimit", "data"].iloc[0]

# Broadcast the two values needed for throttling into their own columns.
limit_df["remaining"] = rate_limit["remaining"]
limit_df["resetAt"] = pd.Timestamp(rate_limit["resetAt"])

3. Drop unnecessary `viewer` data

In [9]:
# Remove the viewer row by its label instead of by a hard-coded row number.
# This does not depend on row order and is safe to re-run (dropping label 1 a
# second time would raise a KeyError). The index is renumbered so the
# remaining rateLimit row sits at label 0.
limit_df = limit_df[limit_df["index"] != "viewer"].reset_index(drop=True)

In [10]:
# Preview the frame after the viewer row has been removed.
limit_df.head()

Unnamed: 0,index,data,remaining,resetAt
0,rateLimit,"{'limit': 5000, 'cost': 1, 'remaining': 4960, ...",4960,2019-08-24 00:26:27+00:00


### Automating Queries

In [11]:
def update_limit() -> pd.DataFrame:
    """Return a one-row DataFrame describing the current GraphQL API rate limit.

    Sends a single rateLimit query through ``query`` and reshapes the answer.

    Returns
    -------
    pd.DataFrame
        One row at index label 0 with the columns:

        - ``index``: the literal string ``"rateLimit"``
        - ``data``: the raw ``rateLimit`` dict from the response
        - ``remaining``: value of ``rateLimit.remaining``
        - ``resetAt``: ``rateLimit.resetAt`` converted to ``pd.Timestamp``

    Raises
    ------
    KeyError
        If the response does not contain ``data.rateLimit``.
    """
    query_text = """query {
                      viewer {
                        login
                      }
                      rateLimit {
                        limit
                        cost
                        remaining
                        resetAt
                      }
                    }"""

    json = {"query": query_text}

    # Issue the request exactly once: every call to ``query`` is charged
    # against the rate limit this function is trying to measure.
    response = query(json)

    # Read the payload by key instead of by row/column position, so the result
    # does not depend on how pandas orders rows built from nested dicts.
    rate_limit = response["data"]["rateLimit"]

    return pd.DataFrame(
        {
            "index": ["rateLimit"],
            "data": [rate_limit],
            "remaining": [rate_limit["remaining"]],
            "resetAt": [pd.Timestamp(rate_limit["resetAt"])],
        }
    )


def get_remaining(df: pd.DataFrame) -> int:
    """Return the remaining count of node queries as a built-in ``int``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``remaining`` column; the value in its first row is used.

    Returns
    -------
    int
        The first ``remaining`` value.

    Raises
    ------
    IndexError
        If ``df`` has no rows.
    """
    # ``iloc[0]`` takes the first row whatever the index labels are, and
    # ``int()`` accepts both numpy integers and plain Python ints (the latter
    # have no ``.astype`` method).
    return int(df['remaining'].iloc[0])


def get_resetAt(df: pd.DataFrame) -> pd.Timestamp:
    """Return the reset time stored under label 0 of the ``resetAt`` column.

    The stored value is passed through ``pd.Timestamp`` before being returned.
    """
    reset_column = df['resetAt']
    stored_value = reset_column[0]
    return pd.Timestamp(stored_value)


def is_resetAt_reached(df: pd.DataFrame) -> bool:
    """Return True when the current UTC time is strictly later than ``resetAt``."""
    reset_time = get_resetAt(df)
    now_utc = pd.Timestamp.now(tz='UTC')
    return reset_time < now_utc

In [12]:
## LOOPING QUERIES UNTIL `remaining` REACHES 0 ##
# 1. Get value of `remaining` and `resetAt`
# 2. Select repo, get `createdAt` data
# 3. Create a query sampling a month's worth of pull request data
# 4. Perform this request repeatedly until `remaining` reaches 0, or all months have been sampled
# 5. When sampling has finished for a repo's entire history, export this data to CSV.

In [13]:
# Refresh the rate-limit snapshot, then pull out both values in one step.
limit_df = update_limit()
remaining, resetAt = get_remaining(limit_df), get_resetAt(limit_df)

# Display both values as the cell output.
(remaining, resetAt)

(4958, Timestamp('2019-08-24 00:26:27+0000', tz='UTC'))

#### The following two code cells are from [this notebook](https://github.com/labs15-github-commit/data-science/blob/patrick/toDataFrameFunction.ipynb) by Patrick Wilky.

In [14]:
def _get_or_none(mapping, key):
    """Return ``mapping.get(key)`` when ``mapping`` is a dict, otherwise ``None``."""
    return mapping.get(key) if isinstance(mapping, dict) else None


def better_df(x):
    """Flatten a raw pull-request search response into a tidy DataFrame.

    Parameters
    ----------
    x : dict
        Decoded GraphQL response; the rows are read from
        ``x['data']['search']['nodes']``.

    Returns
    -------
    pd.DataFrame
        One row per node. Compared with the raw nodes:

        - ``commentCount``: ``comments.totalCount``
        - ``comments``: list of the ``node`` dicts of the returned comment edges
        - ``company``: ``author.company``
        - ``author``: ``author.login``
        - ``filesCommited``: ``files.totalCount`` (column name kept as-is for
          existing consumers)
        - ``mergedBy``: ``mergedBy.login``, or ``None`` when not merged
        - ``files`` is dropped

        An empty node list yields the empty frame unchanged.
    """
    nodes = x.get('data').get('search').get('nodes')
    frame = pd.DataFrame(nodes)

    # Nothing to flatten: a search window without results has no columns.
    if frame.empty:
        return frame

    # Snapshot the nested dicts before the column is overwritten below.
    raw_comments = list(frame['comments'])
    frame['commentCount'] = [_get_or_none(c, 'totalCount') for c in raw_comments]

    # Keep every returned comment node. The number of edges can be smaller
    # than totalCount because the query only fetches a limited page of
    # comments, so the two must not be required to match.
    frame['comments'] = [
        [edge.get('node') for edge in (_get_or_none(c, 'edges') or [])]
        for c in raw_comments
    ]

    # ``author`` may be null in the response; fall back to None rather than
    # failing on ``None.get``.
    raw_authors = list(frame['author'])
    frame['company'] = [_get_or_none(a, 'company') for a in raw_authors]
    frame['author'] = [_get_or_none(a, 'login') for a in raw_authors]

    frame['filesCommited'] = [_get_or_none(f, 'totalCount') for f in frame['files']]

    # Unmerged pull requests carry ``mergedBy = None``.
    frame['mergedBy'] = [_get_or_none(m, 'login') for m in frame['mergedBy']]

    return frame.drop(columns='files')

In [15]:
# GraphQL search for pull requests in pandas-dev/pandas created between
# 2018-08-01 and 2018-08-12 (at most 100 results). For each pull request it
# requests timestamps, title, state, author/merger logins, the changed-file
# count, the body text, and up to 50 comments with their authors.
pull_request_query = """
{
  search(first:100, query:"repo:pandas-dev/pandas created:2018-08-01..2018-08-12 type:pr", type:ISSUE) {
    nodes {
      ... on PullRequest {
        createdAt
        updatedAt
        title
        mergedBy {
          login
        }
        authorAssociation
        author {
          login
          ... on User {
            company
          }
        }
        files {
          totalCount
        }
        state
        resourcePath
        bodyText
        comments(first: 50) {
          totalCount
          edges {
            node {
              authorAssociation
              author{
                login
              }
              bodyText
            }
          }
        }
      }
    }
  }
}
"""

In [16]:
# Send the pull-request search and flatten the decoded response into a frame.
json = dict(query=pull_request_query)
pull_request_df = better_df(query(json))

In [17]:
# Preview the first rows of the flattened pull-request data.
pull_request_df.head()

Unnamed: 0,author,authorAssociation,bodyText,comments,createdAt,mergedBy,resourcePath,state,title,updatedAt,commentCount,company,filesCommited
0,dsaxton,NONE,DataFrame.corr currently returns a KeyError fo...,"[{'authorAssociation': 'NONE', 'author': {'log...",2018-08-12T23:53:02Z,TomAugspurger,/pandas-dev/pandas/pull/22298,MERGED,Use a more helpful error message for invalid c...,2018-08-18T19:53:34Z,8,,5
1,realead,CONTRIBUTOR,it is more or less the clean-up after PR #2190...,"[{'authorAssociation': 'CONTRIBUTOR', 'author'...",2018-08-12T20:10:14Z,jreback,/pandas-dev/pandas/pull/22296,MERGED,BUG: don't mangle NaN-float-values and pd.NaT...,2018-09-19T11:00:36Z,13,,7
2,nmusolino,CONTRIBUTOR,Added tests of DataFrame.xs() method for multi...,"[{'authorAssociation': 'NONE', 'author': {'log...",2018-08-12T16:35:54Z,jreback,/pandas-dev/pandas/pull/22294,MERGED,TST: Add test of DataFrame.xs() with duplicate...,2018-09-15T15:42:02Z,4,,1
3,makbigc,CONTRIBUTOR,"closes #22092\n 1 test added for &, | and ^ lo...","[{'authorAssociation': 'NONE', 'author': {'log...",2018-08-12T14:15:20Z,jbrockmendel,/pandas-dev/pandas/pull/22293,MERGED,Bug: Logical operator of Series with Index (#2...,2018-09-18T13:54:15Z,14,,3
4,alimcmaster1,CONTRIBUTOR,"[y] passes git diff upstream/master -u -- ""*.p...","[{'authorAssociation': 'NONE', 'author': {'log...",2018-08-12T00:39:32Z,,/pandas-dev/pandas/pull/22289,CLOSED,Remove unused param/method,2018-08-13T20:01:43Z,3,,4


**Notes:**

- Modify pull_request_query to get 'createdAt' data for the entire repo. Use this for getting the first pull requests, one month at a time.
- After getting 1 month of pull request data, check `remaining`.
- If `remaining` > 0, query the next month: this means changing the date values in the query, e.g. by string concatenation or substitution.