# Download data from BigQuery tables

In [None]:
import os
import sys

# Notebook-wide settings -- edit these three values to try the notebook
# against your own bucket, project and region.
BUCKET = 'cloud-training-demos'
PROJECT = 'smooth-splicer-354315'
REGION = 'us-central1'

# Mirror the settings into the process environment, where child processes
# (such as `!` shell commands) inherit them.
os.environ.update(BUCKET=BUCKET, PROJECT=PROJECT, REGION=REGION)

# Put ./utils on the import path (presumably where common_functions lives).
sys.path.append('./utils')

In [None]:
from common_functions import read_bigquery

In [None]:
# Titles of Hacker News stories hosted on github.com, nytimes.com or techcrunch.com.
#
# Inner query: extracts the host from each URL, keeps hosts ending in ".com"
# whose title is longer than 10 characters, and takes the second-to-last
# dot-separated label of the host as `source`.
# Outer query: lower-cases the title, replaces every character outside
# [a-zA-Z0-9 $.-] with a space, and keeps only the three sources of interest.
#
# The ".com" suffix test uses a BigQuery raw string (r'...') with an escaped
# dot so that only a literal ".com" matches; an unescaped '.' would match any
# character. The doubled backslash below is Python escaping for a single one.
#
# Other cells append " LIMIT ..." / " AND ..." to this text, so it must keep
# ending with the outer WHERE clause.
query = """
    SELECT source, LOWER(REGEXP_REPLACE(title, '[^a-zA-Z0-9 $.-]', ' ')) AS title FROM
    (SELECT
        ARRAY_REVERSE(SPLIT(REGEXP_EXTRACT(url, '.*://(.[^/]+)/'), '.'))[OFFSET(1)] AS source,
        title
    FROM
        `bigquery-public-data.hacker_news.stories`
    WHERE
        REGEXP_CONTAINS(REGEXP_EXTRACT(url, '.*://(.[^/]+)/'), r'\\.com$')
        AND LENGTH(title) > 10
    )
    WHERE (source = 'github' OR source = 'nytimes' OR source = 'techcrunch')
    """

In [None]:
# Sanity-check the query on five rows before pulling the full result.
preview_query = query + " LIMIT 5"
read_bigquery(preview_query).head()

Unnamed: 0,source,title
0,github,django outbox
1,github,webscrapper using node.js deferred cheerio...
2,techcrunch,flashnotes picks up another $3.6m
3,github,a git user s guide to svn because at least 10...
4,github,show hn cmake module to take care of git subm...


Next, we split the data into a training set and an evaluation set. A simple, repeatable way to do this is to use the hash of a well-distributed column in our data (see https://www.oreilly.com/learning/repeatable-sampling-of-data-sets-in-bigquery-for-machine-learning).

In [None]:
# Repeatable split: bucket each row by a hash of its title modulo 4.
# Bucket 0 (about 25% of rows) becomes evaldf; buckets 1-3 (about 75%) become traindf.
title_bucket = "ABS(MOD(FARM_FINGERPRINT(title), 4))"
traindf = read_bigquery(query + " AND " + title_bucket + " > 0")
evaldf  = read_bigquery(query + " AND " + title_bucket + " = 0")

In [None]:
# Class balance of the training split: number of rows per source.
train_source_counts = traindf['source'].value_counts()
train_source_counts

github        27445
techcrunch    23131
nytimes       21586
Name: source, dtype: int64

In [None]:
# Class balance of the evaluation split: number of rows per source.
eval_source_counts = evaldf['source'].value_counts()
eval_source_counts

github        9080
techcrunch    7760
nytimes       7201
Name: source, dtype: int64

In [None]:
# Write both splits to CSV without the DataFrame index.
# to_csv does not create missing directories, so make sure data/ exists
# first; otherwise this cell fails on a fresh checkout.
os.makedirs('data', exist_ok=True)
traindf.to_csv('data/hackernews-train.csv', index=False)
evaldf.to_csv('data/hackernews-eval.csv', index=False)

In [None]:
!wget --directory-prefix ./data https://nlp.stanford.edu/data/glove.6B.zip

--2022-06-26 15:00:43--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-06-26 15:00:43--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-06-26 15:03:23 (5.17 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [None]:
!unzip ./data/glove.6B.zip glove.6B.200d.txt -d ./data/

Archive:  ./data/glove.6B.zip
  inflating: ./data/glove.6B.200d.txt  


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=46e3e238-14db-4736-bba3-428176481d68' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>