In [1]:
from google.cloud import bigquery
from google.oauth2 import service_account
import json
import re
import datetime
import pandas as pd
import numpy as np
import urllib
import zipfile
import os
import langid
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/atersaak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# save key .json file in the github labeler root
# project id on bigquery account should match

credentials = service_account.Credentials.from_service_account_file(
'../../github-issue-data-extraction-key.json')

project_id = 'github-issue-data-extraction'
client = bigquery.Client(credentials= credentials, project=project_id)

In [3]:
# select memory you are willing to use on github issue data
# defined in gigabytes

max_memory_usage = 5

In [4]:
# simple preprocessing functions

def remove_quotes(string):
    """
    Remove quotes from the string (everything extracted from json has quotes)
    """
    if type(string) == str:
        return string[1:-1]
    else:
        return string

def is_bot(actor):
    """
    Identify users clearly tagged as bots
    """
    if type(actor) != str:
        return True
    if actor[-5:] == '[bot]':
        return True
    else:
        return False

In [24]:
def get_data_for_day(day):
    """
    Pass in a datetime object and a dataframe of all the issue data from that day will be returned
    """
    date = day.strftime('%Y%m%d')
    response = client.query(f"""SELECT JSON_EXTRACT(payload, '$.issue.title') as title,
                                JSON_EXTRACT(payload, '$.issue.body') as body,
                                JSON_EXTRACT(payload, '$.issue.html_url') as url,
                                JSON_EXTRACT(payload, '$.issue.user.login') as actor
                                FROM githubarchive.day.{date}
                                WHERE type = 'IssuesEvent' AND JSON_EXTRACT(payload, '$.action') = '"opened"' 
                                """)
    df = response.to_dataframe()
    return df

def process_df(df):
    for col in df.columns:
        df[col] = df[col].apply(remove_quotes)
        df = df[~df[col].apply(is_bot)]
    df = df[~df[col].apply(is_bot)]
    return df

In [25]:
def is_english(text):
    """
    Determine if a language is English
    """
    return langid.classify(text)[0] == 'en'

In [26]:
### preprocess functions defined below

function_list = []

pattern = r"```.+?```"
code_block_regex = re.compile(pattern, re.DOTALL)


def code_block(string):
    """Replace code blocks with a CODE_BLOCK."""
    string = re.sub(code_block_regex, "CODE_BLOCK", string)
    return string


function_list.append(code_block)

pattern = r"`{1,2}.+?`{1,2}"
inline_code_regex = re.compile(pattern, re.DOTALL)


def code_variable(string):
    """Replace inline code with INLINE."""
    string = re.sub(inline_code_regex, " INLINE ", string)
    return string


function_list.append(code_variable)

pattern = r"\s@[^\s]+"
tagged_user_regex = re.compile(pattern)


def tagged_user(string):
    """Replace a user tagged with USER."""
    string = re.sub(tagged_user_regex, " USER ", string)
    return string


function_list.append(tagged_user)

pattern = r"[^\s]+\.(com|org|net|gov|edu|io|ai)[^\s]*"
url_regex = re.compile(pattern)


def urls(string):
    """Replace URLs with URL."""
    string = re.sub(url_regex, " URL ", string)
    return string


function_list.append(urls)

pattern = r"(\\r\\n)+"
enter_regex = re.compile(pattern, re.DOTALL)


def enters(string):
    """Replace newline characters with a space."""
    string = re.sub(enter_regex, " ", string)
    return string


function_list.append(enters)

pattern = r"#{3,}"
bold_regex = re.compile(pattern, re.DOTALL)


def bold(string):
    """Replace bold characters with a space."""
    string = re.sub(bold_regex, " ", string)
    return string


function_list.append(bold)


def preprocess(string):
    """Put all preprocessing functions together."""
    for func in function_list:
        string = func(string)
    return string

In [213]:
# download pretrained english model

# if not os.path.isfile('../models/wiki-news-300d-1M.vec'):
#     urllib.urlretrieve("https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip",
#                        "../models/wiki-news-300d-1M.vec.zip")
#     with zipfile.ZipFile('../models/wiki-news-300d-1M.vec.zip', 'r') as zip_ref:
#         zip_ref.extractall('../models')
#     os.remove('../models/wiki-news-300d-1M.vec')

In [27]:
# here we download some data spaced out over about two years to build vocabulary

data = []

total_data = 0

curr_day = datetime.datetime.today().date() - datetime.timedelta(days = 7)

num_days = 1

while num_days < 50:
    df = get_data_for_day(curr_day)
    df = process_df(df)
    inp = df['title'].fillna(' ') + ' <SEP> ' + df['body'].fillna(' ')
    inp = inp.apply(preprocess)
    inp = inp[inp.apply(is_english)]
    inp = inp.apply(word_tokenize).values
    data += list(inp)
    total_data += sum(df.memory_usage(deep = True))/1000000000
    print(f'{num_days} days and {round(total_data, 2)} GB used')
    curr_day -= datetime.timedelta(days = 14)
    num_days += 1

data = np.stack(data)

1 days and 0.06 GB used
2 days and 0.13 GB used
3 days and 0.19 GB used
4 days and 0.26 GB used
5 days and 0.35 GB used
6 days and 0.41 GB used
7 days and 0.49 GB used
8 days and 0.55 GB used
9 days and 0.62 GB used
10 days and 0.69 GB used


KeyboardInterrupt: 

In [29]:
data[0]

['Issue',
 'with',
 'functions',
 'are',
 'default',
 'arguments',
 '<',
 'SEP',
 '>',
 'The',
 'following',
 'code',
 ':',
 'CODE_BLOCK',
 'causes',
 'an',
 'analyzer',
 'error',
 ':',
 '>',
 '>',
 'dds.structures.DDSException',
 ':',
 'The',
 'type',
 '<',
 'class',
 "'function",
 "'",
 '>',
 'is',
 'currently',
 'not',
 'supported',
 '.',
 'The',
 'only',
 'supported',
 'types',
 'are',
 "'well-known",
 "'",
 'types',
 'that',
 'are',
 'part',
 'of',
 'the',
 'standard',
 'data',
 'structures',
 'in',
 'the',
 'python',
 'library',
 '.',
 'If',
 'you',
 'think',
 'your',
 'data',
 'type',
 'should',
 'be',
 'supported',
 'by',
 'DDS',
 ',',
 'please',
 'open',
 'a',
 'request',
 'ticket',
 '.',
 'General',
 'Python',
 'classes',
 'will',
 'not',
 'be',
 'supported',
 'since',
 'they',
 'can',
 'carry',
 'arbitrary',
 'state',
 'and',
 'can',
 'not',
 'be',
 'easily',
 'compared',
 '.',
 'Consider',
 'using',
 'a',
 'dataclass',
 ',',
 'a',
 'dictionary',
 'or',
 'a',
 'named',
 'tup

In [95]:
from gensim.models import Word2Vec

In [86]:
import gensim.downloader

In [61]:
model = gensim.downloader.load('glove-wiki-gigaword-50')

In [96]:
w = Word2Vec(vector_size=50, window=4, min_count=5, workers=4)

In [97]:
w.wv = model

In [89]:
w.save('model')

In [98]:
w.build_vocab(data[150:160], update = True)

AttributeError: 'Word2Vec' object has no attribute 'syn1neg'

In [77]:
len(w.wv)

400009

In [66]:
w = Word2Vec.load('model')

In [78]:
! pip list

Package                            Version
---------------------------------- -------------------
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 2.0.3
anaconda-project                   0.9.1
anyio                              2.2.0
appdirs                            1.4.4
argh                               0.26.2
argon2-cffi                        20.1.0
asn1crypto                         1.4.0
astroid                            2.5
astropy                            4.2.1
async-generator                    1.10
atomicwrites                       1.4.0
attrs                              20.3.0
autopep8                           1.5.6
Babel                              2.9.0
backcall                           0.2.0
backports.entry-points-selectable  1.1.0
backports.functools-lru-cache      1.6.4
backports.shutil-get-terminal-size 1.0.0
backports.tempfile                 1.0
backports.weakref  

In [91]:
! pip install gensim==3.8.3

Collecting gensim==3.8.3
  Downloading gensim-3.8.3-cp38-cp38-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 3.3 MB/s eta 0:00:01
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.0.0
    Uninstalling gensim-3.0.0:
      Successfully uninstalled gensim-3.0.0
Successfully installed gensim-3.8.3


In [93]:
imp.reload(Word2Vec)

TypeError: reload() argument must be a module

In [94]:
type(Word2Vec)

type