In [153]:
from google.cloud import bigquery
from google.oauth2 import service_account
import json
import re
import datetime
from spacy_langdetect import LanguageDetector
from textblob import TextBlob
import pandas as pd
import numpy as np
import urllib
import zipfile
import os

In [117]:
# save key .json file in the github labeler root
# project id on bigquery account should match

credentials = service_account.Credentials.from_service_account_file(
'../../github-issue-data-extraction-key.json')

project_id = 'github-issue-data-extraction'
client = bigquery.Client(credentials= credentials, project=project_id)

In [163]:
# select memory you are willing to use on github issue data
# defined in gigabytes

max_memory_usage = 5

In [167]:
# simple preprocessing functions

def remove_quotes(string):
    """
    Remove quotes from the string (everything extracted from json has quotes)
    """
    if type(string) == str:
        return string[1:-1]
    else:
        return string

def is_bot(actor):
    """
    Identify users clearly tagged as bots
    """
    if type(actor) != str:
        return True
    if actor[-5:] == '[bot]':
        return True
    else:
        return False

In [165]:
def get_data_for_day(day):
    """
    Pass in a datetime object and a dataframe of all the issue data from that day will be returned
    """
    day.strftime('%Y%M%D')
    response = client.query(f"""SELECT JSON_EXTRACT(payload, '$.issue.title') as title,
                                JSON_EXTRACT(payload, '$.issue.body') as body,
                                JSON_EXTRACT(payload, '$.issue.html_url') as url,
                                JSON_EXTRACT(payload, '$.issue.user.login') as actor
                                FROM githubarchive.day.{date}
                                WHERE type = 'IssuesEvent' AND JSON_EXTRACT(payload, '$.action') = '"opened"' """)
    df = response.to_dataframe()
    return df

def process_df(df):
    for col in df.columns:
        df[col] = df[col].apply(remove_quotes)
        df = df[~df[col].apply(is_bot)]
    df = df[~df[col].apply(is_bot)]
    return df

In [149]:
def is_english(text):
    """
    Determine if a language is English
    """
    lang = TextBlob(text)
    return lang.detect_language()

In [150]:
### preprocess functions defined below

function_list = []

pattern = r"```.+?```"
code_block_regex = re.compile(pattern, re.DOTALL)


def code_block(string):
    """Replace code blocks with a CODE_BLOCK."""
    string = re.sub(code_block_regex, "CODE_BLOCK", string)
    return string


function_list.append(code_block)

pattern = r"`{1,2}.+?`{1,2}"
inline_code_regex = re.compile(pattern, re.DOTALL)


def code_variable(string):
    """Replace inline code with VARIABLE."""
    string = re.sub(inline_code_regex, " INLINE ", string)
    return string


function_list.append(code_variable)

pattern = r"\s@[^\s]+"
tagged_user_regex = re.compile(pattern)


def tagged_user(string):
    """Replace a user tagged with USER."""
    string = re.sub(tagged_user_regex, " USER ", string)
    return string


function_list.append(tagged_user)

pattern = r"[^\s]+\.(com|org|net|gov|edu)[^\s]*"
url_regex = re.compile(pattern)


def urls(string):
    """Replace URLs with URL."""
    string = re.sub(url_regex, " URL ", string)
    return string


function_list.append(urls)

pattern = r"[\r\n]+"
enter_regex = re.compile(pattern, re.DOTALL)


def enters(string):
    """Replace newline characters with ENTER."""
    string = re.sub(enter_regex, " ", string)
    return string


function_list.append(enters)

pattern = r"#####"
bold_regex = re.compile(pattern, re.DOTALL)


def bold(string):
    """Replace bold characters with bold word."""
    string = re.sub(bold_regex, " BOLD ", string)
    return string


function_list.append(bold)


def preprocess(string):
    """Put all preprocessing functions together."""
    for func in function_list:
        string = func(string)
    return string

In [154]:
# download pretrained english model

if not os.path.isfile('../models/wiki-news-300d-1M.vec'):
    urllib.urlretrieve("https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip",
                       "../models/wiki-news-300d-1M.vec.zip")
    with zipfile.ZipFile('../models/wiki-news-300d-1M.vec.zip', 'r') as zip_ref:
        zip_ref.extractall('../models')
    os.remove('../models/wiki-news-300d-1M.vec')

In [168]:
total_data = 0

curr_day = datetime.datetime.today().date() - datetime.timedelta(days = 7)

num_days = 1

while total_data < max_memory_usage:
    df = get_data_for_day(curr_day)
    df = process_df(df)
    total_data += sum(df.memory_usage(deep = True))/1000000000
    if num_days % 5 == 0:
        print(f'{num_days} days and {round(total_data, 2)} GB used')
    curr_day -= datetime.timedelta(days = 1)
    num_days += 1

5 days and 0.1 GB used


KeyboardInterrupt: 

In [169]:
df = get_data_for_day(curr_day)

In [174]:
df['actor'].isna().mean()

0.0

In [175]:
df.memory_u

Unnamed: 0,title,body,url,actor
0,"""Problem plotting even trivial data""","""On Mac OS High Sierra, I am having trouble wi...","""https://github.com/dmnfarrell/pandastable/iss...","""rpgoldman"""
1,"""Use gulp to automate deploy tasks in readme.md""","""""","""https://github.com/jkomoros/complexity-compen...","""jkomoros"""
2,"""Getting Error: TypeError: undefined is not an...","""I'm getting the above error from the iOS Simu...","""https://github.com/moaazsidat/react-native-qr...","""richoid"""
3,"""screenshot""","""![hyperstrip-hyperline-essentials](https://us...","""https://github.com/hetima/hyperstrip-hyperlin...","""hetima"""
4,"""lwan.pc compiled on aarch64 has an errant Lib...","""When compiling `lwan` on aarch64, the created...","""https://github.com/lpereira/lwan/issues/259""","""HalosGhost"""
...,...,...,...,...
13984,"""Blank space occurs if there are no sent notif...","""There should ideally be an empty state here s...","""https://github.com/real-time-footfall-analysi...","""ZeshanA"""
13985,"""TODO-test""","""###### TODO\r\n- [ ] zmienić typ pola w klasi...","""https://github.com/Team4stud/App/issues/2""","""mat-sop"""
13986,"""Product inventory does not update.""","""When a product is checked out, the back end i...","""https://github.com/Cezerin2/cezerin2/issues/18""","""neunygph"""
13987,"""Make each piece modular""","""- [x] Playable ro-sham-bo in console on one m...","""https://github.com/SetMatchGames/smg-proof-of...","""EzraWeller"""
