In [3]:
import requests
import pandas as pd

In [2]:
# Open and read file
with open ('../../../Auth_Keys/graphql_api_auth.txt') as file:
    token = file.read()
    
# Specify the Authorization code
headers = {"Authorization": 'Bearer ' + token}

In [3]:
# Function to use requests.post to make an API call
def run_query(query): 
    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))

In [17]:
# Query Pull Request
query = '''
{
  repositoryOwner(login: "pandas-dev") {
    repository(name: "pandas") {
      pullRequests(first: 100) {
        pageInfo {
          endCursor
          hasNextPage
        }
        nodes {
          createdAt
          updatedAt
          closedAt
          title
          mergedBy {
            login
          }
          author {
            login
            ... on User {
              company
            }
          }
          authorAssociation
          files {
            totalCount
          }
          state
          resourcePath
          bodyText
          comments(first: 25) {
            totalCount
            nodes {
              author {
                login
                ... on User {
                  company
                }
              }
              authorAssociation
              bodyText
            }
          }
        }
      }
    }
  }
  rateLimit {
    limit
    cost
    remaining
    resetAt
  }
}
'''

query_2 = '''
{{
  repositoryOwner(login: "pandas-dev") {{
    repository(name: "pandas") {{
      pullRequests(first: 100, after:"{end_cursor}") {{
        pageInfo {{
          endCursor
          hasNextPage
        }}
        nodes {{
          createdAt
          updatedAt
          closedAt
          title
          mergedBy {{
            login
          }}
          author {{
            login
            ... on User {{
              company
            }}
          }}
          authorAssociation
          files {{
            totalCount
          }}
          state
          resourcePath
          bodyText
          comments(first: 25) {{
            totalCount
            nodes {{
              author {{
                login
                ... on User {{
                  company
                }}
              }}
              authorAssociation
              bodyText
            }}
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}

'''

variables = {
    "end_cursor": ""
}

### Get closedAt date

In [58]:
query = '''
{
  repositoryOwner(login: "pandas-dev") {
    repository(name: "pandas") {
      pullRequests(first: 100) {
        pageInfo {
          endCursor
          hasNextPage
        }
        nodes {
          closedAt
          title
        }
      }
    }
  }
  rateLimit {
    limit
    cost
    remaining
    resetAt
  }
}
'''
query_2 = '''
{{
  repositoryOwner(login: "pandas-dev") {{
    repository(name: "pandas") {{
      pullRequests(first: 100, after:"{end_cursor}") {{
        pageInfo {{
          endCursor
          hasNextPage
        }}
        nodes {{
          closedAt
          title
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''
variables = {
    "end_cursor": ""
}

In [11]:
def to_df(result):
    
    # Get the keys of nodes.
    data = result['data']['repositoryOwner']['repository']['pullRequests']['nodes']
    
    # Put the data into a dataframe
    df_pr = pd.DataFrame(data)
    
    return df_pr

def get_page_info(result):
    cur = result['data']['repositoryOwner']['repository']['pullRequests']['pageInfo']['endCursor']
    has_cur = result['data']['repositoryOwner']['repository']['pullRequests']['pageInfo']['hasNextPage']
    return cur, has_cur

def result_query(query):
    # Run first query
    result = run_query(query)

    # Save to DF
    df_pr = to_df(result)

    # Get end cursor and has cursor
    variables['end_cursor'], has_cursor = get_page_info(result)

    # Limit
    limit = result['data']['rateLimit']['remaining']
    
    return df_pr, has_cursor, limit

In [59]:
%%time
##### Comment Out this section if there's a error #####

# Run first query
df_pr, has_cursor, limit = result_query(query)

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor, '\n')

##### Comment Out this section if there's a error #####

# Run While Loop below
while has_cursor and (limit >= 0):
    
    # Run next queries
    df_pr_2, has_cursor, limit = result_query(query_2.format(**variables))

    # Concat to existing df_star
    df_pr = pd.concat([df_pr, df_pr_2])
    
    # Print limit and cursor
    print('Next limit is: ', limit)
    print('Next cursor: ', has_cursor, '\n')

Starting limit is:  4914
Start cursor:  True 

Next limit is:  4913
Next cursor:  True 

Next limit is:  4912
Next cursor:  True 

Next limit is:  4911
Next cursor:  True 

Next limit is:  4910
Next cursor:  True 

Next limit is:  4909
Next cursor:  True 

Next limit is:  4908
Next cursor:  True 

Next limit is:  4907
Next cursor:  True 

Next limit is:  4906
Next cursor:  True 

Next limit is:  4905
Next cursor:  True 

Next limit is:  4904
Next cursor:  True 

Next limit is:  4903
Next cursor:  True 

Next limit is:  4902
Next cursor:  True 

Next limit is:  4901
Next cursor:  True 

Next limit is:  4900
Next cursor:  True 

Next limit is:  4899
Next cursor:  True 

Next limit is:  4898
Next cursor:  True 

Next limit is:  4897
Next cursor:  True 

Next limit is:  4896
Next cursor:  True 

Next limit is:  4895
Next cursor:  True 

Next limit is:  4894
Next cursor:  True 

Next limit is:  4893
Next cursor:  True 

Next limit is:  4892
Next cursor:  True 

Next limit is:  4891
Next cur

In [56]:
df_pr.shape

(12405, 11)

In [57]:
df_pr.to_pickle('../../../Files/df_pr_pandas.pk1')

## Combine the two dfs together - df with closedAt and original df

In [61]:
df1 = pd.read_pickle('../../../Files/df_pr_pandas.pk1')

In [60]:
df2 = df_pr

In [65]:
print(df1.shape)
df1.head()

(12405, 11)


Unnamed: 0,author,authorAssociation,bodyText,comments,createdAt,files,mergedBy,resourcePath,state,title,updatedAt
0,,NONE,I have added hist method to frame.py and serie...,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-02-22T08:07:35Z,{'totalCount': 3},,/pandas-dev/pandas/pull/35,CLOSED,Added hist method and added auto_x argument to...,2011-02-27T05:17:56Z
1,,NONE,"Added two methods in frame.py: describe, which...","{'totalCount': 4, 'nodes': [{'author': {'login...",2011-03-06T01:35:53Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/37,MERGED,added describe and scoreatpercentile in frame.py,2014-07-03T19:26:14Z
2,,NONE,So the results of DataFrame.toString() without...,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-03-22T19:27:42Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/38,MERGED,changed _pfixed in common.py to add a leading ...,2011-03-22T23:47:23Z
3,"{'login': 'dieterv77', 'company': None}",CONTRIBUTOR,"Hi Wes, i've been following your pandas work f...","{'totalCount': 1, 'nodes': [{'author': {'login...",2011-06-29T00:59:24Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/55,MERGED,Minor change to CleanCommand so build works wi...,2011-06-29T02:07:07Z
4,"{'login': 'jberka', 'company': None}",CONTRIBUTOR,Starting small...\nIt looks like the .h5 files...,"{'totalCount': 1, 'nodes': [{'author': {'login...",2011-08-06T08:02:19Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/89,MERGED,Fix for issue #80,2014-07-17T17:26:50Z


In [66]:
print(df2.shape)
df2.head()

(12405, 2)


Unnamed: 0,closedAt,title
0,2011-02-25T01:41:56Z,Added hist method and added auto_x argument to...
1,2011-03-18T20:00:26Z,added describe and scoreatpercentile in frame.py
2,2011-03-22T20:01:36Z,changed _pfixed in common.py to add a leading ...
3,2011-06-29T02:06:48Z,Minor change to CleanCommand so build works wi...
4,2011-08-07T16:15:41Z,Fix for issue #80


In [78]:
df3 = df1.merge(df2, on=['title'])
print(df3.shape)
df3.head()

(12797, 12)


Unnamed: 0,author,authorAssociation,bodyText,comments,createdAt,files,mergedBy,resourcePath,state,title,updatedAt,closedAt
0,,NONE,I have added hist method to frame.py and serie...,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-02-22T08:07:35Z,{'totalCount': 3},,/pandas-dev/pandas/pull/35,CLOSED,Added hist method and added auto_x argument to...,2011-02-27T05:17:56Z,2011-02-25T01:41:56Z
1,,NONE,"Added two methods in frame.py: describe, which...","{'totalCount': 4, 'nodes': [{'author': {'login...",2011-03-06T01:35:53Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/37,MERGED,added describe and scoreatpercentile in frame.py,2014-07-03T19:26:14Z,2011-03-18T20:00:26Z
2,,NONE,So the results of DataFrame.toString() without...,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-03-22T19:27:42Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/38,MERGED,changed _pfixed in common.py to add a leading ...,2011-03-22T23:47:23Z,2011-03-22T20:01:36Z
3,"{'login': 'dieterv77', 'company': None}",CONTRIBUTOR,"Hi Wes, i've been following your pandas work f...","{'totalCount': 1, 'nodes': [{'author': {'login...",2011-06-29T00:59:24Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/55,MERGED,Minor change to CleanCommand so build works wi...,2011-06-29T02:07:07Z,2011-06-29T02:06:48Z
4,"{'login': 'jberka', 'company': None}",CONTRIBUTOR,Starting small...\nIt looks like the .h5 files...,"{'totalCount': 1, 'nodes': [{'author': {'login...",2011-08-06T08:02:19Z,{'totalCount': 1},{'login': 'wesm'},/pandas-dev/pandas/pull/89,MERGED,Fix for issue #80,2014-07-17T17:26:50Z,2011-08-07T16:15:41Z


In [83]:
df3.to_pickle('../../../Files/df_pr_pandas_2.pk1')