# Queried Repo: Public, Stars >= 5, and Forks >= 5

In [1]:
import requests
import pandas as pd

In [2]:
# Read the GitHub API token from a file kept outside the repository.
with open('../../../Auth_Keys/graphql_api_auth.txt') as file:
    # Strip surrounding whitespace: text files commonly end with a newline,
    # and a newline inside the Authorization header value makes it invalid.
    token = file.read().strip()

# Authorization header sent with every GraphQL request (used by run_query).
headers = {"Authorization": 'Bearer ' + token}

In [3]:
def run_query(query, timeout=60):
    """POST a GraphQL query to the GitHub API and return the decoded JSON body.

    Parameters
    ----------
    query : str
        The GraphQL document to execute.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests.post` can block indefinitely on a stalled connection.

    Returns
    -------
    dict
        The parsed JSON response; its 'data' key is guaranteed to be present
        and not None.

    Raises
    ------
    Exception
        If the HTTP status is not 200, or if the response carries no 'data'
        (a GraphQL-level failure reported with status 200).
    """
    request = requests.post(
        'https://api.github.com/graphql',
        json={'query': query},
        headers=headers,
        timeout=timeout,
    )
    if request.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))

    result = request.json()
    # A 200 response can still carry only an 'errors' entry and no data;
    # fail here with the server's message instead of a KeyError/TypeError later.
    if result.get('data') is None:
        raise Exception("Query returned no data. Errors: {}. {}".format(result.get('errors'), query))
    return result

## Randomize Date time

In [4]:
import random
import numpy as np
import datetime
import time

In [5]:
# Bounds of the sampling window for repository creation dates
date_time_start = datetime.date(2008,8,23)
date_time_end = datetime.date(2019,8,29)

# Convert start and end dates to whole-second unix timestamps.
# time.mktime returns a float, but random.randrange only accepts integers
# (float arguments are deprecated since Python 3.10 and raise TypeError in 3.12).
# Note: time.mktime interprets the date in the machine's local timezone.
unix_start = int(time.mktime(date_time_start.timetuple()))
unix_end = int(time.mktime(date_time_end.timetuple()))

# Get a random timestamp between the two dates (end exclusive)
rand_range = random.randrange(unix_start, unix_end)

# https://stackoverflow.com/questions/3682748/converting-unix-timestamp-string-to-readable-date
# Render the unix timestamp as a UTC date truncated to the hour (YYYY-MM-DDTHH).
# fromtimestamp(..., timezone.utc) replaces the deprecated utcfromtimestamp
# and produces the same formatted string.
date_query = datetime.datetime.fromtimestamp(rand_range, datetime.timezone.utc).strftime('%Y-%m-%dT%H')

In [8]:
def get_date(start=None, end=None):
    """Return a random UTC date-hour string formatted as 'YYYY-MM-DDTHH'.

    Parameters
    ----------
    start, end : int or float, optional
        Unix timestamps bounding the draw (start inclusive, end exclusive).
        When omitted, the notebook-level `unix_start` / `unix_end` are used.

    Returns
    -------
    str
        A timestamp drawn uniformly from [start, end), rendered in UTC and
        truncated to the hour.
    """
    # random.randrange requires integers; time.mktime produces floats, so
    # coerce here (float arguments raise TypeError on Python 3.12+).
    lower = int(unix_start if start is None else start)
    upper = int(unix_end if end is None else end)
    rand_range = random.randrange(lower, upper)
    # Same output as the deprecated datetime.utcfromtimestamp(...)
    return datetime.datetime.fromtimestamp(rand_range, datetime.timezone.utc).strftime('%Y-%m-%dT%H')

def to_df(result):
    """Convert the search nodes of a GraphQL response into a DataFrame.

    `result` is the decoded JSON response; the list found under
    result['data']['search']['nodes'] becomes the rows of the frame.
    """
    nodes = result['data']['search']['nodes']
    return pd.DataFrame(nodes)

In [9]:
# GraphQL search template for one public repository (stars >= 5, forks >= 5)
# created at a given date-hour. The template is rendered with str.format, so
# doubled braces are literal GraphQL braces and {date_query} is the only
# substitution field. Besides repository metadata it requests the README text
# and the commit count, both resolved against the `master` branch.
query_2 = '''
{{
  search(query: "is:public stars:>=5 forks:>=5 created:{date_query}", type: REPOSITORY, first:1) {{
    repositoryCount
    pageInfo {{
      endCursor
      hasNextPage
    }}
    nodes {{
      ... on Repository {{
        id
        createdAt
        updatedAt
        owner {{
          login
        }}
        name
        nameWithOwner
        primaryLanguage {{
          name
        }}
        description
        hasIssuesEnabled
        hasWikiEnabled
        licenseInfo {{
          spdxId
        }}
        object(expression:"master:README.md") {{
          ... on Blob {{
            text
          }}
        }}
        second_object: object(expression: "master") {{
          ... on Commit {{
            history {{
              totalCount
            }}
          }}
        }}
        forkCount
        stargazers {{
          totalCount
        }}
        issues {{
          totalCount
        }}
        pullRequests {{
          totalCount
        }}
      }}
    }}
  }}
    rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Values substituted into query_2; 'date_query' is overwritten with a fresh
# random date before each request.
variables = {
    "date_query": ""
}

In [10]:
# Sample repositories by issuing one search per randomly drawn creation date.
# NOTE(review): counting down from 10 to 0 inclusive issues 11 queries;
# confirm whether 10 was intended.
i=10

# Collect the per-query frames in a list and concatenate once at the end.
# Calling pd.concat inside the loop (seeded with an empty DataFrame) re-copies
# every accumulated row on each iteration.
frames = []

try:
    while i>=0:
        print('Iteration number: ', i)
        # Get new random date
        variables['date_query'] = get_date()
        print('Random Date Generated')

        # Run next queries
        result = run_query(query_2.format(**variables))
        print('Query Finish')

        # Save to df
        df_repo_2 = to_df(result)
        print('Saved Query to DF')

        # Queue this result for the final concatenation
        frames.append(df_repo_2)
        print('Concat repo successful')

        # Iterate next random repo
        i -= 1

        print('Next Iteration number: ', i, '\n')
finally:
    # Runs even if a request fails or the cell is interrupted, so the
    # repositories fetched so far are still available in df_repo.
    df_repo = pd.concat(frames) if frames else pd.DataFrame()

Iteration number:  10
Random Date Generated
Query Finish
Saved Query to DF
Concat repo successful
Next Iteration number:  9 

Iteration number:  9
Random Date Generated
Query Finish
Saved Query to DF
Concat repo successful
Next Iteration number:  8 

Iteration number:  8
Random Date Generated
Query Finish
Saved Query to DF
Concat repo successful
Next Iteration number:  7 

Iteration number:  7
Random Date Generated
Query Finish
Saved Query to DF
Concat repo successful
Next Iteration number:  6 

Iteration number:  6
Random Date Generated
Query Finish
Saved Query to DF
Concat repo successful
Next Iteration number:  5 

Iteration number:  5
Random Date Generated
Query Finish
Saved Query to DF
Concat repo successful
Next Iteration number:  4 

Iteration number:  4
Random Date Generated
Query Finish
Saved Query to DF
Concat repo successful
Next Iteration number:  3 

Iteration number:  3
Random Date Generated
Query Finish
Saved Query to DF
Concat repo successful
Next Iteration number:  2 


In [11]:
# Display the repositories collected by the sampling loop.
df_repo

Unnamed: 0,createdAt,description,forkCount,hasIssuesEnabled,hasWikiEnabled,id,issues,licenseInfo,name,nameWithOwner,object,owner,primaryLanguage,pullRequests,second_object,stargazers,updatedAt
0,2013-01-19T02:21:47Z,A game-theoretic poker player (written in 2005),30,True,True,MDEwOlJlcG9zaXRvcnk3Njk4MTkx,{'totalCount': 0},,game-theory-poker,adamsmith/game-theory-poker,{'text': 'game-theory-poker ===== This is a g...,{'login': 'adamsmith'},{'name': 'Java'},{'totalCount': 1},{'history': {'totalCount': 2}},{'totalCount': 167},2019-08-13T15:14:46Z
0,2014-04-01T22:40:40Z,The Google I/O 2019 Android App,5931,True,False,MDEwOlJlcG9zaXRvcnkxODM0NzQ3Ng==,{'totalCount': 192},{'spdxId': 'NOASSERTION'},iosched,google/iosched,{'text': 'Google I/O Android App =============...,{'login': 'google'},{'name': 'Kotlin'},{'totalCount': 134},{'history': {'totalCount': 2635}},{'totalCount': 18987},2019-09-03T15:52:27Z
0,2014-07-17T04:50:32Z,"SEX IS ZERO (0), so, who wanna be the ONE (1),...",1959,True,True,MDEwOlJlcG9zaXRvcnkyMTkyOTAyOQ==,{'totalCount': 76},{'spdxId': 'GPL-2.0'},hardseed,yangyangwithgnu/hardseed,"{'text': '<h1 align=""center"">给不了你梦中情人，至少还有硬盘女神...",{'login': 'yangyangwithgnu'},{'name': 'C++'},{'totalCount': 12},{'history': {'totalCount': 197}},{'totalCount': 9102},2019-09-02T12:40:54Z
0,2017-03-21T11:05:58Z,在线地址: http://www.jobinfo.cc:8000/,91,True,True,MDEwOlJlcG9zaXRvcnk4NTY5MjA1Mg==,{'totalCount': 8},{'spdxId': 'MIT'},webspider,iven-he/webspider,{'text': '# [![Build Status](https://travis-...,{'login': 'iven-he'},{'name': 'Python'},{'totalCount': 8},{'history': {'totalCount': 51}},{'totalCount': 260},2019-08-29T15:43:08Z
0,2012-09-14T14:19:46Z,Using Spring from within a Play 2.0 application,24,False,False,MDEwOlJlcG9zaXRvcnk1ODEwMjM4,{'totalCount': 0},,play20-spring-demo,guillaumebort/play20-spring-demo,{'text': 'Using Spring from within a Play 2.0 ...,{'login': 'guillaumebort'},{'name': 'Java'},{'totalCount': 3},{'history': {'totalCount': 3}},{'totalCount': 88},2019-05-31T11:35:43Z
0,2013-08-14T10:22:25Z,File based encrypted key-value store,46,True,True,MDEwOlJlcG9zaXRvcnkxMjEwNjE5Mg==,{'totalCount': 151},{'spdxId': 'MIT'},trousseau,oleiade/trousseau,"{'text': '![Trousseau, a portable encrypted ke...",{'login': 'oleiade'},{'name': 'Go'},{'totalCount': 46},{'history': {'totalCount': 547}},{'totalCount': 881},2019-08-26T16:50:04Z
0,2014-09-16T03:58:52Z,Embulk: Pluggable Bulk Data Loader.,166,True,False,MDEwOlJlcG9zaXRvcnkyNDA4NDczMA==,{'totalCount': 391},{'spdxId': 'NOASSERTION'},embulk,embulk/embulk,{'text': '# What's Embulk? Embulk is a parall...,{'login': 'embulk'},{'name': 'Java'},{'totalCount': 786},{'history': {'totalCount': 2646}},{'totalCount': 1324},2019-09-03T07:27:25Z
0,2018-01-28T08:18:24Z,🌕 🌖 🌗 🌘 🌑 🌒 🌓 🌔Imitate Cosmos - a special thir...,72,True,True,MDEwOlJlcG9zaXRvcnkxMTkyNDA0NTM=,{'totalCount': 10},{'spdxId': 'GPL-2.0'},ZHNCosmos,zhnnnnn/ZHNCosmos,{'text': '# Imitate Cosmos - a special third p...,{'login': 'zhnnnnn'},{'name': 'Objective-C'},{'totalCount': 0},{'history': {'totalCount': 37}},{'totalCount': 451},2019-08-30T06:26:09Z
0,2016-04-27T23:40:22Z,C++ library and programs that demonstrate mesh...,103,True,True,MDEwOlJlcG9zaXRvcnk1NzI1NTk3Mg==,{'totalCount': 2},{'spdxId': 'NOASSERTION'},Mesh-processing-library,microsoft/Mesh-processing-library,{'text': '# Mesh-processing-library See [READM...,{'login': 'microsoft'},{'name': 'C++'},{'totalCount': 0},{'history': {'totalCount': 42}},{'totalCount': 399},2019-09-03T03:55:23Z


### Clean data

In [14]:
# Flatten the nested {'totalCount': n} dict into the plain issue count.
df_repo['issues'] = df_repo['issues'].map(lambda counts: counts['totalCount'])

In [19]:
# Keep only the SPDX identifier; repositories without a license stay None.
df_repo['licenseInfo'] = df_repo['licenseInfo'].map(lambda info: None if info is None else info['spdxId'])

In [23]:
# Extract the README text. The API returns None when `master:README.md`
# does not exist, so guard against non-dict entries instead of crashing.
df_repo['object'] = df_repo['object'].apply(lambda x: x.get('text') if isinstance(x, dict) else None)

In [24]:
# Replace the nested owner dict with the owner's login name.
df_repo['owner'] = df_repo['owner'].map(lambda account: account['login'])

In [25]:
# Extract the language name. primaryLanguage is None for repositories with
# no detected language, so guard against non-dict entries instead of crashing.
df_repo['primaryLanguage'] = df_repo['primaryLanguage'].apply(lambda x: x.get('name') if isinstance(x, dict) else None)

In [26]:
# Flatten the nested {'totalCount': n} dict into the plain pull-request count.
df_repo['pullRequests'] = df_repo['pullRequests'].map(lambda counts: counts['totalCount'])

In [27]:
# Extract the commit count of `master`. The API returns None when that ref
# does not resolve, so guard against missing data instead of crashing.
# Rows without a count become None/NaN, which makes the column non-integer.
df_repo['second_object'] = df_repo['second_object'].apply(
    lambda x: x['history']['totalCount'] if isinstance(x, dict) and x.get('history') else None
)

In [28]:
# Flatten the nested {'totalCount': n} dict into the plain star count.
df_repo['stargazers'] = df_repo['stargazers'].map(lambda counts: counts['totalCount'])

In [29]:
# Every concatenated frame carried its own index, so labels repeat; replace
# them with a fresh 0..n-1 range. drop=True discards the old index directly
# instead of materialising it as an 'index' column and dropping it again.
df_repo = df_repo.reset_index(drop=True)

### Rename columns

In [68]:
# Map the raw GraphQL field names to descriptive column names.
column_names = {
    'object': 'readMe',
    'issues': 'totalIssues',
    'pullRequests': 'totalPullRequests',
    'second_object': 'totalCommits',
    'stargazers': 'totalStargazers',
    'forkCount': 'totalForks',
}
df_repo = df_repo.rename(columns=column_names)

In [59]:
# Parse the ISO-8601 timestamp strings into timezone-aware datetimes.
# pd.to_datetime is vectorized, so pass the whole column rather than
# converting one element at a time through apply.
df_repo['createdAt'] = pd.to_datetime(df_repo['createdAt'])
df_repo['updatedAt'] = pd.to_datetime(df_repo['updatedAt'])

In [69]:
# Preview the first rows of df_repo.
df_repo.head()

Unnamed: 0,createdAt,description,totalForks,hasIssuesEnabled,hasWikiEnabled,id,totalIssues,licenseInfo,name,nameWithOwner,readMe,owner,primaryLanguage,totalPullRequests,totalCommits,totalStargazers,updatedAt
0,2013-01-19 02:21:47+00:00,A game-theoretic poker player (written in 2005),30,True,True,MDEwOlJlcG9zaXRvcnk3Njk4MTkx,0,,game-theory-poker,adamsmith/game-theory-poker,game-theory-poker\n=====\n\nThis is a game-the...,adamsmith,Java,1,2,167,2019-08-13 15:14:46+00:00
1,2014-04-01 22:40:40+00:00,The Google I/O 2019 Android App,5931,True,False,MDEwOlJlcG9zaXRvcnkxODM0NzQ3Ng==,192,NOASSERTION,iosched,google/iosched,Google I/O Android App\n======================...,google,Kotlin,134,2635,18987,2019-09-03 15:52:27+00:00
2,2014-07-17 04:50:32+00:00,"SEX IS ZERO (0), so, who wanna be the ONE (1),...",1959,True,True,MDEwOlJlcG9zaXRvcnkyMTkyOTAyOQ==,76,GPL-2.0,hardseed,yangyangwithgnu/hardseed,"<h1 align=""center"">给不了你梦中情人，至少还有硬盘女神：hardseed<...",yangyangwithgnu,C++,12,197,9102,2019-09-02 12:40:54+00:00
3,2017-03-21 11:05:58+00:00,在线地址: http://www.jobinfo.cc:8000/,91,True,True,MDEwOlJlcG9zaXRvcnk4NTY5MjA1Mg==,8,MIT,webspider,iven-he/webspider,# \n\n[![Build Status](https://travis-ci.org/G...,iven-he,Python,8,51,260,2019-08-29 15:43:08+00:00
4,2012-09-14 14:19:46+00:00,Using Spring from within a Play 2.0 application,24,False,False,MDEwOlJlcG9zaXRvcnk1ODEwMjM4,0,,play20-spring-demo,guillaumebort/play20-spring-demo,Using Spring from within a Play 2.0 applicatio...,guillaumebort,Java,3,3,88,2019-05-31 11:35:43+00:00


In [61]:
# Summarize column dtypes and non-null counts.
df_repo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 17 columns):
createdAt           9 non-null datetime64[ns, UTC]
description         9 non-null object
forkCount           9 non-null int64
hasIssuesEnabled    9 non-null bool
hasWikiEnabled      9 non-null bool
id                  9 non-null object
issues              9 non-null int64
licenseInfo         7 non-null object
name                9 non-null object
nameWithOwner       9 non-null object
object              9 non-null object
owner               9 non-null object
primaryLanguage     9 non-null object
pullRequests        9 non-null int64
second_object       9 non-null int64
stargazers          9 non-null int64
updatedAt           9 non-null datetime64[ns, UTC]
dtypes: bool(2), datetime64[ns, UTC](2), int64(5), object(8)
memory usage: 1.2+ KB


In [62]:
# Preview the first rows of df_repo.
df_repo.head()

Unnamed: 0,createdAt,description,forkCount,hasIssuesEnabled,hasWikiEnabled,id,issues,licenseInfo,name,nameWithOwner,object,owner,primaryLanguage,pullRequests,second_object,stargazers,updatedAt
0,2013-01-19 02:21:47+00:00,A game-theoretic poker player (written in 2005),30,True,True,MDEwOlJlcG9zaXRvcnk3Njk4MTkx,0,,game-theory-poker,adamsmith/game-theory-poker,game-theory-poker\n=====\n\nThis is a game-the...,adamsmith,Java,1,2,167,2019-08-13 15:14:46+00:00
1,2014-04-01 22:40:40+00:00,The Google I/O 2019 Android App,5931,True,False,MDEwOlJlcG9zaXRvcnkxODM0NzQ3Ng==,192,NOASSERTION,iosched,google/iosched,Google I/O Android App\n======================...,google,Kotlin,134,2635,18987,2019-09-03 15:52:27+00:00
2,2014-07-17 04:50:32+00:00,"SEX IS ZERO (0), so, who wanna be the ONE (1),...",1959,True,True,MDEwOlJlcG9zaXRvcnkyMTkyOTAyOQ==,76,GPL-2.0,hardseed,yangyangwithgnu/hardseed,"<h1 align=""center"">给不了你梦中情人，至少还有硬盘女神：hardseed<...",yangyangwithgnu,C++,12,197,9102,2019-09-02 12:40:54+00:00
3,2017-03-21 11:05:58+00:00,在线地址: http://www.jobinfo.cc:8000/,91,True,True,MDEwOlJlcG9zaXRvcnk4NTY5MjA1Mg==,8,MIT,webspider,iven-he/webspider,# \n\n[![Build Status](https://travis-ci.org/G...,iven-he,Python,8,51,260,2019-08-29 15:43:08+00:00
4,2012-09-14 14:19:46+00:00,Using Spring from within a Play 2.0 application,24,False,False,MDEwOlJlcG9zaXRvcnk1ODEwMjM4,0,,play20-spring-demo,guillaumebort/play20-spring-demo,Using Spring from within a Play 2.0 applicatio...,guillaumebort,Java,3,3,88,2019-05-31 11:35:43+00:00


In [70]:
# Persist the sampled repositories to disk.
# NOTE(review): the extension is '.pk1' (digit one), possibly a typo for
# '.pkl'; the other pickle paths in this notebook use the same spelling.
df_repo.to_pickle('../../../Files/df_repo_10.pk1')

## A search returns only 1000 repos - must filter with created:{date} and retrieve the results in chunks (days or weeks)

## Original query

In [39]:
# First-page search query: the first 15 public repositories with at least
# 5 stars and 5 forks, including metadata, README text and commit count
# (both resolved against `master`). Sent as-is, so braces are not doubled.
query = '''
{
  search(query: "is:public stars:>=5 forks:>=5", type: REPOSITORY, first:15) {
    repositoryCount
    pageInfo {
      endCursor
      hasNextPage
    }
    nodes {
      ... on Repository {
        id
        createdAt
        updatedAt
        owner {
          login
        }
        name
        nameWithOwner
        primaryLanguage {
          name
        }
        description
        hasIssuesEnabled
        hasWikiEnabled
        licenseInfo {
          spdxId
        }
        object(expression:"master:README.md") {
          ... on Blob {
            text
          }
        }
        second_object: object(expression: "master") {
          ... on Commit {
            history {
              totalCount
            }
          }
        }
        forkCount
        stargazers {
          totalCount
        }
        issues {
          totalCount
        }
        pullRequests {
          totalCount
        }
      }
    }
  }
    rateLimit {
    limit
    cost
    remaining
    resetAt
  }
}
'''
# Follow-up page template: same selection as the first-page query, fetching
# one repository after the cursor of the previous page. Rendered with
# str.format, so doubled braces are literal GraphQL braces and {end_cursor}
# is the only substitution field.
# Fix: the search string was missing its closing quote after `forks:>=5`,
# which made the whole GraphQL document malformed.
query_2 = '''
{{
  search(query: "is:public stars:>=5 forks:>=5", type: REPOSITORY, first:1, after:"{end_cursor}") {{
    repositoryCount
    pageInfo {{
      endCursor
      hasNextPage
    }}
    nodes {{
      ... on Repository {{
        id
        createdAt
        updatedAt
        owner {{
          login
        }}
        name
        nameWithOwner
        primaryLanguage {{
          name
        }}
        description
        hasIssuesEnabled
        hasWikiEnabled
        licenseInfo {{
          spdxId
        }}
        object(expression:"master:README.md") {{
          ... on Blob {{
            text
          }}
        }}
        second_object: object(expression: "master") {{
          ... on Commit {{
            history {{
              totalCount
            }}
          }}
        }}
        forkCount
        stargazers {{
          totalCount
        }}
        issues {{
          totalCount
        }}
        pullRequests {{
          totalCount
        }}
      }}
    }}
  }}
    rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Values substituted into query_2; 'end_cursor' is filled in from the
# pageInfo of the previous response before each follow-up request.
variables = {
    "end_cursor": ""
}

## Separate Queries

### Query 1/3: Repo info

In [96]:
# Query 1/3, first page: repository metadata only (no README or commit
# history) for the first 100 public repositories with at least 5 stars and
# 5 forks. Sent as-is, so braces are not doubled.
query = '''
{
  search(query: "is:public stars:>=5 forks:>=5", type: REPOSITORY, first:100) {
    repositoryCount
    pageInfo {
      endCursor
      hasNextPage
    }
    nodes {
      ... on Repository {
        id
        createdAt
        updatedAt
        owner {
          login
        }
        name
        nameWithOwner
        primaryLanguage {
          name
        }
        description
        hasIssuesEnabled
        hasWikiEnabled
        licenseInfo {
          spdxId
        }
        forkCount
        stargazers {
          totalCount
        }
        issues {
          totalCount
        }
        pullRequests {
          totalCount
        }
      }
    }
  }
    rateLimit {
    limit
    cost
    remaining
    resetAt
  }
}
'''

# Query 1/3, follow-up pages: same selection, continuing after {end_cursor}.
# Rendered with str.format, so doubled braces are literal GraphQL braces.
query_2 = '''
{{
  search(query: "is:public stars:>=5 forks:>=5", type: REPOSITORY, first:100, after:"{end_cursor}") {{
    repositoryCount
    pageInfo {{
      endCursor
      hasNextPage
    }}
    nodes {{
      ... on Repository {{
        id
        createdAt
        updatedAt
        owner {{
          login
        }}
        name
        nameWithOwner
        primaryLanguage {{
          name
        }}
        description
        hasIssuesEnabled
        hasWikiEnabled
        licenseInfo {{
          spdxId
        }}
        forkCount
        stargazers {{
          totalCount
        }}
        issues {{
          totalCount
        }}
        pullRequests {{
          totalCount
        }}
      }}
    }}
  }}
    rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Values substituted into query_2; 'end_cursor' is filled in from the
# pageInfo of the previous response.
variables = {
    "end_cursor": ""
}

### Query 2/3: Readme

In [85]:
# Query 2/3, first page: repository id plus the text of `master:README.md`
# for the first 100 matching repositories. Sent as-is, so braces are not
# doubled.
query = '''
{
  search(query: "is:public stars:>=5 forks:>=5", type: REPOSITORY, first: 100) {
    pageInfo {
      endCursor
      hasNextPage
    }
    nodes {
      ... on Repository {
        id
        object(expression: "master:README.md") {
          ... on Blob {
            text
          }
        }
      }
    }
  }
  rateLimit {
    limit
    cost
    remaining
    resetAt
  }
}
'''

# Query 2/3, follow-up pages: same selection, continuing after {end_cursor}.
# Rendered with str.format, so doubled braces are literal GraphQL braces.
query_2 = '''
{{
  search(query: "is:public stars:>=5 forks:>=5", type: REPOSITORY, first: 100, after:"{end_cursor}") {{
    pageInfo {{
      endCursor
      hasNextPage
    }}
    nodes {{
      ... on Repository {{
        id
        object(expression: "master:README.md") {{
          ... on Blob {{
            text
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Values substituted into query_2; 'end_cursor' is filled in from the
# pageInfo of the previous response.
variables = {
    "end_cursor": ""
}

### Query 3/3: Commit Count

In [107]:
# Query 3/3, first page: repository id plus the commit count of `master`
# for the first 15 matching repositories. Sent as-is, so braces are not
# doubled.
query = '''
{
  search(query: "is:public stars:>=5 forks:>=5", type: REPOSITORY, first: 15) {
    pageInfo {
      endCursor
      hasNextPage
    }
    nodes {
      ... on Repository {
        id
        object(expression: "master") {
          ... on Commit {
            history {
              totalCount
            }
          }
        }
      }
    }
  }
  rateLimit {
    limit
    cost
    remaining
    resetAt
  }
}
'''

# Query 3/3, follow-up pages: same selection, continuing after {end_cursor}.
# Rendered with str.format, so doubled braces are literal GraphQL braces.
# NOTE(review): this template requests 10 repositories per page while the
# first-page query requests 15 - confirm the difference is intentional.
query_2 = '''
{{
  search(query: "is:public stars:>=5 forks:>=5", type: REPOSITORY, first: 10, after:"{end_cursor}") {{
    pageInfo {{
      endCursor
      hasNextPage
    }}
    nodes {{
      ... on Repository {{
        id
        object: object(expression: "master") {{
          ... on Commit {{
            history {{
              totalCount
            }}
          }}
        }}
      }}
    }}
  }}
  rateLimit {{
    limit
    cost
    remaining
    resetAt
  }}
}}
'''

# Values substituted into query_2; 'end_cursor' is filled in from the
# pageInfo of the previous response.
variables = {
    "end_cursor": ""
}

In [6]:
def to_df(result):
    """Return the search result nodes of a GraphQL response as a DataFrame.

    Each entry of result['data']['search']['nodes'] becomes one row.
    """
    search_block = result['data']['search']
    return pd.DataFrame(search_block['nodes'])

def get_page_info(result):
    """Return (endCursor, hasNextPage) from a search response's pageInfo."""
    page_info = result['data']['search']['pageInfo']
    return page_info['endCursor'], page_info['hasNextPage']

def result_query(query):
    """Run one search query and return (DataFrame, hasNextPage, remaining).

    Side effect: stores the response's end cursor in the notebook-level
    `variables['end_cursor']`, which the follow-up query template reads.
    """
    response = run_query(query)
    page_frame = to_df(response)

    # Remember where this page ended so the next request can continue from it
    end_cursor, has_next_page = get_page_info(response)
    variables['end_cursor'] = end_cursor

    # Rate-limit points still available after this request
    remaining = response['data']['rateLimit']['remaining']

    return page_frame, has_next_page, remaining

In [108]:
%%time
##### Comment Out this section if there's a 403 error #####

# Run first query
df_repo, has_cursor, limit = result_query(query)

print('Starting limit is: ', limit)
print('Start cursor: ', has_cursor, '\n')

##### Comment Out this section if there's a 403 error #####

# Collect every page in a list and concatenate once at the end; calling
# pd.concat inside the loop re-copies all accumulated rows on each iteration.
# Seeding the list with df_repo keeps the resume workflow intact: when the
# section above is commented out, paging continues from the existing frame.
pages = [df_repo]

try:
    # Keep requesting pages while the API reports another one.
    # NOTE(review): `limit >= 0` looks always true, since `limit` is the
    # remaining rate-limit count; an exhausted quota therefore surfaces as a
    # failed request rather than ending the loop.
    while has_cursor and (limit >= 0):

        # Run next queries
        df_repo_2, has_cursor, limit = result_query(query_2.format(**variables))

        # Queue this page for the final concatenation
        pages.append(df_repo_2)

        # Print limit and cursor
        print('Next limit is: ', limit)
        print('Next cursor: ', has_cursor, '\n')
finally:
    # Runs even when a request fails, so every page fetched so far is kept
    # in df_repo and the loop can be resumed afterwards.
    df_repo = pd.concat(pages)

Starting limit is:  4847
Start cursor:  True 

Next limit is:  4846
Next cursor:  True 

Next limit is:  4845
Next cursor:  True 

Next limit is:  4844
Next cursor:  True 

Next limit is:  4843
Next cursor:  True 

Next limit is:  4842
Next cursor:  True 

Next limit is:  4841
Next cursor:  True 

Next limit is:  4840
Next cursor:  True 

Next limit is:  4839
Next cursor:  True 

Next limit is:  4838
Next cursor:  True 

Next limit is:  4837
Next cursor:  True 

Next limit is:  4836
Next cursor:  True 

Next limit is:  4835
Next cursor:  True 

Next limit is:  4834
Next cursor:  True 

Next limit is:  4833
Next cursor:  True 

Next limit is:  4832
Next cursor:  True 

Next limit is:  4831
Next cursor:  True 

Next limit is:  4830
Next cursor:  True 

Next limit is:  4829
Next cursor:  True 

Next limit is:  4828
Next cursor:  True 

Next limit is:  4827
Next cursor:  True 

Next limit is:  4826
Next cursor:  True 

Next limit is:  4825
Next cursor:  True 

Next limit is:  4824
Next cur

In [109]:
# Report how many rows/columns were collected, then preview the first rows.
print(df_repo.shape)
df_repo.head()

(995, 2)


Unnamed: 0,id,object
0,MDEwOlJlcG9zaXRvcnkyODQ1NzgyMw==,{'history': {'totalCount': 24700}}
1,MDEwOlJlcG9zaXRvcnkxNzc3MzY1MzM=,{'history': {'totalCount': 2960}}
2,MDEwOlJlcG9zaXRvcnkxMTczMDM0Mg==,{'history': {'totalCount': 2283}}
3,MDEwOlJlcG9zaXRvcnkyMTI2MjQ0,{'history': {'totalCount': 18978}}
4,MDEwOlJlcG9zaXRvcnkxMDI3MDI1MA==,{'history': {'totalCount': 12327}}


### Pickle df

In [110]:
# Persist the result of the commit-count query (query 3/3) to disk.
df_repo.to_pickle('../../../Files/df_repo_pandas_3.pk1')

In [112]:
# Load the three saved query results.
# presumably df1 = repo info, df2 = README text, df3 = commit counts, judging
# by the frames displayed below; only load pickle files you created yourself,
# since unpickling can execute arbitrary code.
df1 =pd.read_pickle('../../../Files/df_repo_pandas_1.pk1')
df2 =pd.read_pickle('../../../Files/df_repo_pandas_2.pk1')
df3 =pd.read_pickle('../../../Files/df_repo_pandas_3.pk1')

### Merge all 3 and pickle

In [116]:
# Check the size of df1 and preview its first rows.
print(df1.shape)
df1.head()

(1000, 15)


Unnamed: 0,createdAt,description,forkCount,hasIssuesEnabled,hasWikiEnabled,id,issues,licenseInfo,name,nameWithOwner,owner,primaryLanguage,pullRequests,stargazers,updatedAt
0,2014-12-24T17:49:19Z,The https://www.freeCodeCamp.org open source c...,22478,True,False,MDEwOlJlcG9zaXRvcnkyODQ1NzgyMw==,{'totalCount': 13752},{'spdxId': 'BSD-3-Clause'},freeCodeCamp,freeCodeCamp/freeCodeCamp,{'login': 'freeCodeCamp'},{'name': 'JavaScript'},{'totalCount': 22525},{'totalCount': 304556},2019-08-28T23:48:45Z
1,2019-03-26T07:31:14Z,Repo for counting stars and contributing. Pres...,21315,False,False,MDEwOlJlcG9zaXRvcnkxNzc3MzY1MzM=,{'totalCount': 0},{'spdxId': 'NOASSERTION'},996.ICU,996icu/996.ICU,{'login': '996icu'},{'name': 'Rust'},{'totalCount': 1802},{'totalCount': 247156},2019-08-28T21:26:17Z
2,2013-07-29T03:24:51Z,"🖖 Vue.js is a progressive, incrementally-adopt...",21536,True,True,MDEwOlJlcG9zaXRvcnkxMTczMDM0Mg==,{'totalCount': 8344},{'spdxId': 'MIT'},vue,vuejs/vue,{'login': 'vuejs'},{'name': 'JavaScript'},{'totalCount': 1596},{'totalCount': 147000},2019-08-28T23:04:19Z
3,2011-07-29T21:19:00Z,"The most popular HTML, CSS, and JavaScript fra...",66660,True,False,MDEwOlJlcG9zaXRvcnkyMTI2MjQ0,{'totalCount': 18731},{'spdxId': 'MIT'},bootstrap,twbs/bootstrap,{'login': 'twbs'},{'name': 'JavaScript'},{'totalCount': 10057},{'totalCount': 135471},2019-08-28T22:52:51Z
4,2013-05-24T16:15:54Z,"A declarative, efficient, and flexible JavaScr...",25294,True,True,MDEwOlJlcG9zaXRvcnkxMDI3MDI1MA==,{'totalCount': 7959},{'spdxId': 'MIT'},react,facebook/react,{'login': 'facebook'},{'name': 'JavaScript'},{'totalCount': 8478},{'totalCount': 135172},2019-08-28T21:21:48Z


In [137]:
# Check the size of df2, count missing values per column, and preview the
# first ten rows.
print(df2.shape)
print(df2.isna().sum())
df2[:10] 

(1000, 2)
id          0
object    114
dtype: int64


Unnamed: 0,id,object
0,MDEwOlJlcG9zaXRvcnkyODQ1NzgyMw==,{'text': '![freeCodeCamp.org Social Banner](ht...
1,MDEwOlJlcG9zaXRvcnkxNzc3MzY1MzM=,{'text': '[996.ICU](https://996.icu/#/en_US) =...
2,MDEwOlJlcG9zaXRvcnkxMTczMDM0Mg==,"{'text': '<p align=""center""><a href=""https://v..."
3,MDEwOlJlcG9zaXRvcnkxMDI3MDI1MA==,{'text': '# [React](https://reactjs.org/) &mid...
4,MDEwOlJlcG9zaXRvcnk0NTcxNzI1MA==,"{'text': '<div align=""center"">  <img src=""htt..."
5,MDEwOlJlcG9zaXRvcnkyMTczNzQ2NQ==,
6,MDEwOlJlcG9zaXRvcnkxNDQ0MDI3MA==,{'text': '# You Don't Know JS Yet (book series...
7,MDEwOlJlcG9zaXRvcnkyOTExMzc=,"{'text': '<p align=""center"">  <img src=""https..."
8,MDEwOlJlcG9zaXRvcnkxMDYyODk3,{'text': '# A collection of `.gitignore` templ...
9,MDEwOlJlcG9zaXRvcnk2NDk4NDky,{'text': '# Airbnb JavaScript Style Guide() { ...


In [138]:
# Check the size of df3, count missing values per column, and preview the
# first ten rows.
print(df3.shape)
print(df3.isna().sum())
df3[:10]

(995, 2)
id         0
object    18
dtype: int64


Unnamed: 0,id,object
0,MDEwOlJlcG9zaXRvcnkyODQ1NzgyMw==,{'history': {'totalCount': 24700}}
1,MDEwOlJlcG9zaXRvcnkxNzc3MzY1MzM=,{'history': {'totalCount': 2960}}
2,MDEwOlJlcG9zaXRvcnkxMTczMDM0Mg==,{'history': {'totalCount': 2283}}
3,MDEwOlJlcG9zaXRvcnkyMTI2MjQ0,{'history': {'totalCount': 18978}}
4,MDEwOlJlcG9zaXRvcnkxMDI3MDI1MA==,{'history': {'totalCount': 12327}}
5,MDEwOlJlcG9zaXRvcnk0NTcxNzI1MA==,{'history': {'totalCount': 63831}}
6,MDEwOlJlcG9zaXRvcnkxMzQ5MTg5NQ==,{'history': {'totalCount': 5060}}
7,MDEwOlJlcG9zaXRvcnkyMTczNzQ2NQ==,{'history': {'totalCount': 849}}
8,MDEwOlJlcG9zaXRvcnkxNDQ0MDI3MA==,{'history': {'totalCount': 1491}}
9,MDEwOlJlcG9zaXRvcnkyOTExMzc=,{'history': {'totalCount': 5218}}


In [153]:
# Combine df2 and df3 on repository id. Both frames have an 'object' column,
# so the merge suffixes them (_x from df2, _y from df3) and they are renamed.
merged_names = {'object_x':'readMeText', 'object_y':'commit'}
merge_2_3 = (
    df2.merge(df3, on=['id'], how='outer')
       .rename(columns=merged_names)
)

In [157]:
# Attach the README/commit columns to df1 by repository id, keeping ids that
# appear in only one of the two frames (outer join).
all_merge = pd.merge(df1, merge_2_3, on=['id'], how='outer')

Unnamed: 0,createdAt,description,forkCount,hasIssuesEnabled,hasWikiEnabled,id,issues,licenseInfo,name,nameWithOwner,owner,primaryLanguage,pullRequests,stargazers,updatedAt,readMeText,commit
0,2014-12-24T17:49:19Z,The https://www.freeCodeCamp.org open source c...,22478.0,True,False,MDEwOlJlcG9zaXRvcnkyODQ1NzgyMw==,{'totalCount': 13752},{'spdxId': 'BSD-3-Clause'},freeCodeCamp,freeCodeCamp/freeCodeCamp,{'login': 'freeCodeCamp'},{'name': 'JavaScript'},{'totalCount': 22525},{'totalCount': 304556},2019-08-28T23:48:45Z,{'text': '![freeCodeCamp.org Social Banner](ht...,{'history': {'totalCount': 24700}}
1,2019-03-26T07:31:14Z,Repo for counting stars and contributing. Pres...,21315.0,False,False,MDEwOlJlcG9zaXRvcnkxNzc3MzY1MzM=,{'totalCount': 0},{'spdxId': 'NOASSERTION'},996.ICU,996icu/996.ICU,{'login': '996icu'},{'name': 'Rust'},{'totalCount': 1802},{'totalCount': 247156},2019-08-28T21:26:17Z,{'text': '[996.ICU](https://996.icu/#/en_US) =...,{'history': {'totalCount': 2960}}
2,2013-07-29T03:24:51Z,"🖖 Vue.js is a progressive, incrementally-adopt...",21536.0,True,True,MDEwOlJlcG9zaXRvcnkxMTczMDM0Mg==,{'totalCount': 8344},{'spdxId': 'MIT'},vue,vuejs/vue,{'login': 'vuejs'},{'name': 'JavaScript'},{'totalCount': 1596},{'totalCount': 147000},2019-08-28T23:04:19Z,"{'text': '<p align=""center""><a href=""https://v...",{'history': {'totalCount': 2283}}
3,2011-07-29T21:19:00Z,"The most popular HTML, CSS, and JavaScript fra...",66660.0,True,False,MDEwOlJlcG9zaXRvcnkyMTI2MjQ0,{'totalCount': 18731},{'spdxId': 'MIT'},bootstrap,twbs/bootstrap,{'login': 'twbs'},{'name': 'JavaScript'},{'totalCount': 10057},{'totalCount': 135471},2019-08-28T22:52:51Z,,{'history': {'totalCount': 18978}}
4,2013-05-24T16:15:54Z,"A declarative, efficient, and flexible JavaScr...",25294.0,True,True,MDEwOlJlcG9zaXRvcnkxMDI3MDI1MA==,{'totalCount': 7959},{'spdxId': 'MIT'},react,facebook/react,{'login': 'facebook'},{'name': 'JavaScript'},{'totalCount': 8478},{'totalCount': 135172},2019-08-28T21:21:48Z,{'text': '# [React](https://reactjs.org/) &mid...,{'history': {'totalCount': 12327}}
5,2015-11-07T01:19:20Z,An Open Source Machine Learning Framework for ...,76997.0,True,False,MDEwOlJlcG9zaXRvcnk0NTcxNzI1MA==,{'totalCount': 19770},{'spdxId': 'Apache-2.0'},tensorflow,tensorflow/tensorflow,{'login': 'tensorflow'},{'name': 'C++'},{'totalCount': 12122},{'totalCount': 133206},2019-08-28T23:46:45Z,"{'text': '<div align=""center"">  <img src=""htt...",{'history': {'totalCount': 63831}}
6,2013-10-11T06:50:37Z,:books: Freely available programming books,31766.0,True,True,MDEwOlJlcG9zaXRvcnkxMzQ5MTg5NQ==,{'totalCount': 398},{'spdxId': 'NOASSERTION'},free-programming-books,EbookFoundation/free-programming-books,{'login': 'EbookFoundation'},,{'totalCount': 2875},{'totalCount': 127409},2019-08-28T23:30:49Z,,{'history': {'totalCount': 5060}}
7,2014-07-11T13:42:37Z,😎 Awesome lists about all kinds of interesting...,15214.0,True,False,MDEwOlJlcG9zaXRvcnkyMTczNzQ2NQ==,{'totalCount': 230},,awesome,sindresorhus/awesome,{'login': 'sindresorhus'},,{'totalCount': 1359},{'totalCount': 114878},2019-08-28T22:32:09Z,,{'history': {'totalCount': 849}}
8,2013-11-16T02:37:24Z,A book series on JavaScript. @YDKJS on twitter.,21400.0,True,True,MDEwOlJlcG9zaXRvcnkxNDQ0MDI3MA==,{'totalCount': 771},{'spdxId': 'NOASSERTION'},You-Dont-Know-JS,getify/You-Dont-Know-JS,{'login': 'getify'},,{'totalCount': 728},{'totalCount': 107442},2019-08-28T23:41:11Z,{'text': '# You Don't Know JS Yet (book series...,{'history': {'totalCount': 1491}}
9,2009-08-28T18:15:37Z,"🙃 A delightful community-driven (with 1,300+ c...",17368.0,True,True,MDEwOlJlcG9zaXRvcnkyOTExMzc=,{'totalCount': 2968},{'spdxId': 'NOASSERTION'},oh-my-zsh,robbyrussell/oh-my-zsh,{'login': 'robbyrussell'},{'name': 'Shell'},{'totalCount': 5135},{'totalCount': 94158},2019-08-28T22:44:24Z,"{'text': '<p align=""center"">  <img src=""https...",{'history': {'totalCount': 5218}}


In [158]:
# Persist the merged frame. The original pickled df_repo, which at this point
# holds only the commit-count query result already saved as
# df_repo_pandas_3.pk1, so the merge of all three frames was never written.
all_merge.to_pickle('../../../Files/df_repo_pandas_final.pk1')