In [1]:
import requests
import pandas as pd

In [2]:
# Load the GitHub API token from a local file instead of hardcoding it in the
# notebook.
with open('../../../Auth_Keys/graphql_api_auth.txt') as file:
    # Strip surrounding whitespace: a trailing newline left by a text editor
    # would otherwise become part of the Authorization header value.
    token = file.read().strip()

# Authorization header sent with every GraphQL request made by run_query.
headers = {"Authorization": 'Bearer ' + token}

In [3]:
# Function to use requests.post to make an API call
def run_query(query, timeout=60):
    """POST a GraphQL query to the GitHub API and return the decoded JSON body.

    Parameters
    ----------
    query : str
        GraphQL document to send.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely. Defaults to 60.

    Returns
    -------
    dict
        Parsed JSON body of the response.

    Raises
    ------
    Exception
        If the HTTP status code is not 200, or if the response carries
        GraphQL ``errors`` and no ``data``.
    requests.exceptions.RequestException
        Propagated from ``requests.post`` (for example on timeout).
    """
    response = requests.post(
        'https://api.github.com/graphql',
        json={'query': query},
        headers=headers,
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(response.status_code, query))

    result = response.json()

    # A GraphQL server can report a failed query in an "errors" list while the
    # HTTP status is still 200. Fail here with the server's message rather than
    # letting callers hit an opaque error when they index into result['data'].
    # Responses that contain both errors and data are passed through unchanged.
    if isinstance(result, dict) and result.get('errors') and result.get('data') is None:
        raise Exception("Query returned errors: {}. {}".format(result['errors'], query))

    return result

In [4]:
# First-page query: asks for the first 100 commits in the history of the
# "master" ref of pandas-dev/pandas, with committer, status and up to three
# associated pull requests per commit. pageInfo supplies the cursor read by
# get_page_info, and rateLimit.remaining is read by result_query.
# NOTE(review): the branch name is hard-coded; presumably `ref` comes back null
# if the repository has no "master" branch -- confirm before re-running.
query = '''
{
  repository(owner: "pandas-dev", name: "pandas") {
    ref(qualifiedName: "master") {
      target {
        ... on Commit {
          history(first: 100) {
            pageInfo {
              hasNextPage
              endCursor
            }
            nodes {
              oid
              messageHeadline
              committedDate
              committer {
                user {
                  login
                  company
                }
              }
              status {
                id
                state
              }
              associatedPullRequests(first: 3) {
                nodes {
                  id
                  title
                  author {
                    login
                  }
                  authorAssociation
                  createdAt
                  updatedAt
                  closedAt
                  number
                  state
                }
              }
            }
          }
        }
      }
    }
  }
  rateLimit {
    limit
    cost
    remaining
    resetAt
  }
}
'''

In [5]:
# Follow-up page template: the same selection as `query`, but history() also
# takes after:"{end_cursor}" so the request continues from a previous page.
# The text is passed through str.format (query_2.format(**variables) in the
# paging loop), which is why every literal GraphQL brace is doubled.
query_2 = '''
{{
  repository(owner: "pandas-dev", name: "pandas") {{
    ref(qualifiedName: "master") {{
      target {{
        ... on Commit {{
          history(first: 100, after:"{end_cursor}") {{
            pageInfo {{
              hasNextPage
              endCursor
            }}
            nodes {{
              oid
              messageHeadline
              committedDate
              committer {{
                user {{
                  login
                  company
                }}
              }}
              status {{
                id
                state
              }}
              associatedPullRequests(first: 3) {{
                nodes {{
                  id
                  title
                  author {{
                    login
                  }}
                  authorAssociation
                  createdAt
                  updatedAt
                  closedAt
                  number
                  state
                }}
              }}
            }}
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Format arguments for query_2; result_query overwrites "end_cursor" with the
# endCursor of each page it fetches.
variables = {
    "end_cursor": ""
}

In [6]:
def to_df(result):
    """Build a DataFrame from the commit nodes of one query result.

    Parameters
    ----------
    result : dict
        Decoded GraphQL response containing
        data -> repository -> ref -> target -> history -> nodes.

    Returns
    -------
    pandas.DataFrame
        One row per entry in ``nodes``.
    """
    history = result['data']['repository']['ref']['target']['history']
    return pd.DataFrame(history['nodes'])

In [7]:
def get_page_info(result):
    """Return the pagination state of one query result.

    Parameters
    ----------
    result : dict
        Decoded GraphQL response containing
        data -> repository -> ref -> target -> history -> pageInfo.

    Returns
    -------
    tuple
        ``(endCursor, hasNextPage)`` taken from ``pageInfo``.
    """
    page_info = result['data']['repository']['ref']['target']['history']['pageInfo']
    return page_info['endCursor'], page_info['hasNextPage']

In [8]:
def result_query(query):
    """Run one query and unpack the pieces the paging loop needs.

    Side effect: stores the page's end cursor in the module-level
    ``variables['end_cursor']``.

    Parameters
    ----------
    query : str
        GraphQL document passed to ``run_query``.

    Returns
    -------
    tuple
        ``(df_commits, has_cursor, limit)``: the page's commits as a DataFrame,
        the page's ``hasNextPage`` flag, and ``rateLimit.remaining`` from the
        response.
    """
    response = run_query(query)

    # Commits of this page as a frame
    commits_page = to_df(response)

    # Record where this page ended and whether another page follows
    next_cursor, has_cursor = get_page_info(response)
    variables['end_cursor'] = next_cursor

    # Rate-limit budget left after this call
    remaining = response['data']['rateLimit']['remaining']

    return commits_page, has_cursor, remaining

In [27]:
%%time
##### Comment Out this section if there's a 403 error #####

# Run first query
# df_commits, has_cursor, limit = result_query(query)

# print('Starting limit is: ', limit)
# print('Start cursor: ', has_cursor, '\n')

##### Comment Out this section if there's a 403 error #####

# Run While Loop below
while has_cursor and (limit >= 0):
    
    # Run next queries
    df_commits_2, has_cursor, limit = result_query(query_2.format(**variables))

    # Concat to existing df_star
    df_commits = pd.concat([df_commits, df_commits_2])
    
    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor, '\n')

Next limit is:  4788
Next cursor:  True 

Next limit is:  4787
Next cursor:  True 

Next limit is:  4786
Next cursor:  True 

Next limit is:  4785
Next cursor:  True 

Next limit is:  4784
Next cursor:  True 

Next limit is:  4783
Next cursor:  True 

Next limit is:  4782
Next cursor:  True 

Next limit is:  4781
Next cursor:  True 

Next limit is:  4780
Next cursor:  True 

Next limit is:  4779
Next cursor:  True 

Next limit is:  4778
Next cursor:  False 

Wall time: 1min 31s


In [30]:
# Sanity check: (rows, columns) of the collected commit frame.
df_commits.shape

(20036, 6)

In [29]:
# Write the collected commits to CSV in the working directory; index=False
# leaves the row index out of the file.
df_commits.to_csv('df_commits_pandas.csv', index=False)

In [32]:
# Also persist the frame as a pickle under the shared Files directory.
# NOTE(review): the '.pk1' suffix (digit one) looks like a typo for '.pkl' --
# confirm what reads this file before renaming it.
df_commits.to_pickle('../../../Files/df_commits_pandas.pk1')