In [1]:
from string import Template

import pandas as pd
import requests

### Set `file`, `repo_owner`, and `repo_name` Before Running Notebook

In [2]:
file = open("/home/joseph/graphql_token.txt", "r")
api_token = file.read().strip()
headers = {"Authorization": "token %s" % api_token}

In [3]:
repo_owner = "pandas-dev"
repo_name = "pandas"

In [4]:
# Query Pull Request
pr_query = Template('''
{
  repositoryOwner(login: "$repo_owner") {
    repository(name: "$repo_name") {
      pullRequests(first: 100) {
        pageInfo {
          endCursor
          hasNextPage
        }
        nodes {
          createdAt
          updatedAt
          closedAt
          title
          mergedBy {
            login
          }
          author {
            login
            ... on User {
              company
            }
          }
          authorAssociation
          files {
            totalCount
          }
          state
          resourcePath
          bodyText
          comments(first: 25) {
            totalCount
            nodes {
              author {
                login
                ... on User {
                  company
                }
              }
              authorAssociation
              bodyText
            }
          }
        }
      }
    }
  }
  rateLimit {
    limit
    cost
    remaining
    resetAt
  }
}
''')

pr_query_2 = Template('''
{
  repositoryOwner(login: "$repo_owner") {
    repository(name: "$repo_name") {
      pullRequests(first: 100, after: "$end_cursor") {
        pageInfo {
          endCursor
          hasNextPage
        }
        nodes {
          createdAt
          updatedAt
          closedAt
          title
          mergedBy {
            login
          }
          author {
            login
            ... on User {
              company
            }
          }
          authorAssociation
          files {
            totalCount
          }
          state
          resourcePath
          bodyText
          comments(first: 25) {
            totalCount
            nodes {
              author {
                login
                ... on User {
                  company
                }
              }
              authorAssociation
              bodyText
            }
          }
        }
      }
    }
  }
  rateLimit {
    limit
    cost
    remaining
    resetAt
  }
}
''')


variables = {
    "repo_owner": "",
    "repo_name": "",
    "end_cursor": ""
}

In [5]:
# Function to use requests.post to make an API call
def run_query(query): 
    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        print("Query failed to run by returning code of {}. {}".format(request.status_code, query))

In [6]:
def to_df(result):
    # Get the keys of nodes.
    data = result['data']['repositoryOwner']['repository']['pullRequests']['nodes']
    
    # Put the data into a dataframe
    df_pr = pd.DataFrame(data)
    
    return df_pr
def get_page_info(result):
    cur = result['data']['repositoryOwner']['repository']['pullRequests']['pageInfo']['endCursor']
    has_cur = result['data']['repositoryOwner']['repository']['pullRequests']['pageInfo']['hasNextPage']
    return cur, has_cur
def result_query(query):
    # Run first query
    result = run_query(query)
    # Save to DF
    df_pr = to_df(result)
    # Get end cursor and has cursor
    variables['end_cursor'], has_cursor = get_page_info(result)
    # Limit
    limit = result['data']['rateLimit']['remaining']
    
    return df_pr, has_cursor, limit

In [7]:
# Get `createdAt` date for repo, use that as cursor
repo_query = Template('''
{
  repositoryOwner(login:"$repo_owner") {
    id
    login
    repository(name:"$repo_name") {
      id
      name
      createdAt
      updatedAt
      description
    }
  }
}
''')

# Query for repo info
repo_query_sub = repo_query.substitute(repo_owner=repo_owner, repo_name=repo_name)

query_results = run_query(repo_query_sub)
query_dict = query_results['data']['repositoryOwner']['repository']
repo_createdAt = pd.Timestamp(query_dict['createdAt']).date()

# Update variables dict
#variables["end_cursor"] = str(repo_createdAt)
variables["end_cursor"] = str(repo_createdAt)
variables["repo_owner"] = repo_owner
variables["repo_name"] = repo_name

In [8]:
# Run first query
print(variables)
pr_query_sub = pr_query.substitute(**variables)
print(pr_query_2.substitute(**variables))
df_pr, has_cursor, limit = result_query(pr_query_sub)
print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor, '\n')

# Run remaining queries
while has_cursor and (limit >= 0):
    try:
        # Run next queries
        df_pr_2, has_cursor, limit = result_query(pr_query_2.substitute(**variables))
        # Concat to existing df_star
        df_pr = pd.concat([df_pr, df_pr_2])
    except:
        # The print stmnt in `run_query()` will print error messages
        continue
    
    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor)
    print(df_pr.shape, '\n')

{'repo_owner': 'pandas-dev', 'repo_name': 'pandas', 'end_cursor': '2010-08-24'}

{
  repositoryOwner(login: "pandas-dev") {
    repository(name: "pandas") {
      pullRequests(first: 100, after: "2010-08-24") {
        pageInfo {
          endCursor
          hasNextPage
        }
        nodes {
          createdAt
          updatedAt
          closedAt
          title
          mergedBy {
            login
          }
          author {
            login
            ... on User {
              company
            }
          }
          authorAssociation
          files {
            totalCount
          }
          state
          resourcePath
          bodyText
          comments(first: 25) {
            totalCount
            nodes {
              author {
                login
                ... on User {
                  company
                }
              }
              authorAssociation
              bodyText
            }
          }
        }
      }
    }
  }
  rateL

In [9]:
df_pr.head()

Unnamed: 0,author,authorAssociation,bodyText,closedAt,comments,createdAt,files,mergedBy,resourcePath,state,title,updatedAt
0,,NONE,I have added hist method to frame.py and serie...,2011-02-25T01:41:56Z,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-02-22T08:07:35Z,{'totalCount': 3},,/pandas-dev/pandas/pull/35,CLOSED,Added hist method and added auto_x argument to...,2011-02-27T05:17:56Z
1,,NONE,"Added two methods in frame.py: describe, which...",2011-03-18T20:00:26Z,"{'totalCount': 4, 'nodes': [{'author': {'login...",2011-03-06T01:35:53Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/37,MERGED,added describe and scoreatpercentile in frame.py,2014-07-03T19:26:14Z
2,,NONE,So the results of DataFrame.toString() without...,2011-03-22T20:01:36Z,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-03-22T19:27:42Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/38,MERGED,changed _pfixed in common.py to add a leading ...,2011-03-22T23:47:23Z
3,"{'login': 'dieterv77', 'company': None}",CONTRIBUTOR,"Hi Wes, i've been following your pandas work f...",2011-06-29T02:06:48Z,"{'totalCount': 1, 'nodes': [{'author': {'login...",2011-06-29T00:59:24Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/55,MERGED,Minor change to CleanCommand so build works wi...,2011-06-29T02:07:07Z
4,"{'login': 'jberka', 'company': None}",CONTRIBUTOR,Starting small...\nIt looks like the .h5 files...,2011-08-07T16:15:41Z,"{'totalCount': 1, 'nodes': [{'author': {'login...",2011-08-06T08:02:19Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/89,MERGED,Fix for issue #80,2014-07-17T17:26:50Z


In [10]:
pickle_filename = './'+repo_owner+'_'+repo_name+'_df.pk1' 

df_pr.to_pickle(pickle_filename)