In [1]:
import zipfile
import databricks.koalas as ks
import sys
sys.path.insert(0, '../')
from util import util_data_access
from pyspark.sql import SparkSession
import tqdm
import pandas as pd 
import os, glob
import re
from tqdm.auto import tqdm

# Download and Prepare Data

In [15]:
import spacy

In [None]:
# Load spaCy's large English pipeline (`en_core_web_lg`), the same model whose
# archive this notebook downloads and unpacks.
spacy.load('en_core_web_lg')

In [14]:
## Download the spaCy model archive
# Fetch `en_core_web_lg.tar.gz` through the project's `util_data_access` helper
# (presumably local filename first, remote object path second; confirm against
# util_data_access), then unpack the tarball in the working directory.
util_data_access.download_file('en_core_web_lg.tar.gz', 'edit-pathways/spacy/en_core_web_lg.tar.gz')
! tar -xf en_core_web_lg.tar.gz

True

In [3]:
# Download the gzipped NewsSniffer NYTimes SQLite database through the project's
# `util_data_access` helper, then decompress it in place; `gunzip` leaves
# `newssniffer-nytimes.db` in the working directory.
util_data_access.download_file('newssniffer-nytimes.db.gz', 'edit-pathways/dbs/newssniffer-nytimes.db.gz')
! gunzip newssniffer-nytimes.db.gz

True

In [7]:
## download data
# util_data_access.download_file('edit-data.zip', 'edit-data/edit-data.zip')
# myzip = zipfile.ZipFile('edit-data.zip')
# myzip.extractall(path='edit-data')

True

In [68]:
# Push the first article-output shard to remote storage under the `edit-data/` prefix.
local_path = 'edit-data/article_output/article-output-1-8.json'
remote_key = 'edit-data/article-output-1-8.json'
util_data_access.upload_file(local_path, object_name=remote_key)

True

In [71]:
# Upload article-output shards 2 through 8 (shard 1 is uploaded on its own),
# showing progress with tqdm.
for i in tqdm(range(2, 9)):
    local_path = 'edit-data/article_output/article-output-%d-8.json' % i
    remote_key = 'edit-data/article-output-%d-8.json' % i
    util_data_access.upload_file(local_path, object_name=remote_key)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [17]:
# Upload all eight version-output shards; they are read from the same local
# `article_output` directory as the article shards.
for i in tqdm(range(1, 9)):
    local_path = 'edit-data/article_output/version-output-%d-8.json' % i
    remote_key = 'edit-data/version-output-%d-8.json' % i
    util_data_access.upload_file(local_path, object_name=remote_key)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




# Turn on Spark

In [2]:
# Start (or reuse) the Spark session with this notebook's resource settings.
session_builder = SparkSession.builder
for option, value in (
    ("spark.executor.instances", "50"),
    ("spark.driver.memory", "15g"),
):
    session_builder = session_builder.config(option, value)
spark = session_builder.getOrCreate()

# Enable Arrow-based transfer between Spark and pandas.
# NOTE(review): this is the Spark 3.x spelling of the key; Spark 2.x uses
# `spark.sql.execution.arrow.enabled`. Confirm which version the cluster runs.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [3]:
# Make Koalas build its default index with the 'distributed' strategy instead of
# its default index type.
ks.set_option('compute.default_index_type', 'distributed')

In [4]:
# Display the session object to confirm it is up.
spark

In [5]:
import sqlite3

In [6]:
# Open the SQLite database file (same name as the archive gunzipped earlier in this notebook).
conn = sqlite3.connect('newssniffer-nytimes.db')

In [7]:
# Read the entire `entryversion` table into a pandas DataFrame held in driver memory.
df = pd.read_sql('select * from entryversion', con=conn)

In [8]:
# Convert the pandas frame into a Koalas (Spark-backed) DataFrame.
kdf = ks.from_pandas(df)

In [9]:
import sys

In [12]:
# Put ../util itself on the import path so modules inside it can be imported by bare name.
sys.path.append('../util')

In [13]:
import util_newssniffer_parsing as unp

downloading spacy...


KeyboardInterrupt: 

In [None]:
# Group the entry versions by `entry_id`; no aggregation is applied, so this
# cell only displays the groupby object.
kdf.groupby('entry_id')

In [None]:
# Intentionally empty: this cell previously held only an unfinished, undefined
# name, which raised NameError under Restart & Run All.

# Data Manipulation

In [5]:
cache_dir = 'edit-data/article_output/'

# Locally cached shard files, one glob per output family. `os.path.join` avoids
# the doubled slash that plain string concatenation with `cache_dir` produces.
article_page_files = glob.glob(os.path.join(cache_dir, 'article-output*'))
diff_files         = glob.glob(os.path.join(cache_dir, 'diff-output-*'))
version_files      = glob.glob(os.path.join(cache_dir, 'version-output*'))

# Shard names end in `-<shard>-<total>.json`; group 1 captures the shard number.
# Raw string so `\d` is a regex class rather than an invalid string escape, and
# the dot is escaped so it only matches a literal '.'.
_SHARD_NUM_RE = re.compile(r'-(\d+)-\d+\.json')


def sort_by_num(fs):
    """Return the paths in `fs` sorted by the shard number embedded in each name.

    Parameters
    ----------
    fs : iterable of str
        File paths containing a `-<shard>-<total>.json` suffix.

    Returns
    -------
    list of str
        The same paths, ordered numerically (so shard 10 sorts after shard 2).

    Raises
    ------
    TypeError
        If a path does not contain the expected suffix (the regex search
        returns None, which cannot be subscripted).
    """
    return sorted(fs, key=lambda x: int(_SHARD_NUM_RE.search(x)[1]))


article_page_files = sort_by_num(article_page_files)
diff_files = sort_by_num(diff_files)
version_files = sort_by_num(version_files)

In [6]:
# Read the eight article-output shards from S3 and combine them with a single
# concat instead of growing the frame one `append` at a time. Using a dedicated
# list also avoids overwriting any existing `kdf` variable.
article_shards = [
    ks.read_json('s3://aspangher/edit-data/article-output-%d-8.json' % i)
    for i in tqdm(range(1, 9))
]
# ignore_index=True: shard-local indexes are discarded, as with the per-shard appends.
article_kdfs = ks.concat(article_shards, ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [9]:
# (rows, columns) of the combined article frame.
article_kdfs.shape 

(2791957, 2)

In [7]:
# Remove the `_corrupt_record` column and then every row with a missing value.
# The membership check keeps the cell re-runnable: on a second execution the
# column is already gone and an unconditional drop would fail. `columns=` makes
# explicit that a column, not a row label, is being dropped.
if '_corrupt_record' in article_kdfs.columns:
    article_kdfs = article_kdfs.drop(columns='_corrupt_record')
article_kdfs = article_kdfs.dropna()

In [8]:
# NOTE(review): `version_files` was already sorted with `sort_by_num` when it was
# built, so this re-sort leaves the order unchanged; the cell looks redundant.
version_files = sort_by_num(version_files)

In [9]:
# Read the eight version-output shards from S3 and combine them with a single
# concat instead of growing the frame one `append` at a time. Using a dedicated
# list also avoids overwriting any existing `kdf` variable.
version_shards = [
    ks.read_json('s3://aspangher/edit-data/version-output-%d-8.json' % i)
    for i in tqdm(range(1, 9))
]
# ignore_index=True: shard-local indexes are discarded, as with the per-shard appends.
version_kdfs = ks.concat(version_shards, ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [10]:
# (rows, columns) of the combined version frame.
version_kdfs.shape 

(3470064, 9)

In [11]:
# Remove the `_corrupt_record` column and then every row with a missing value.
# The membership check keeps the cell re-runnable: on a second execution the
# column is already gone and an unconditional drop would fail. `columns=` makes
# explicit that a column, not a row label, is being dropped.
if '_corrupt_record' in version_kdfs.columns:
    version_kdfs = version_kdfs.drop(columns='_corrupt_record')
version_kdfs = version_kdfs.dropna()

In [12]:
# List the version frame's columns.
version_kdfs.columns

Index(['article_id', 'article_url', 'source', 'time_created', 'time_delta',
       'title', 'version', 'version_url'],
      dtype='object')

In [13]:
# Peek at one version row to sanity-check the parsed fields.
version_kdfs.head(1)

Unnamed: 0,article_id,article_url,source,time_created,time_delta,title,version,version_url
1,35,http://news.bbc.co.uk/go/rss/-/1/hi/world/midd...,bbc,"August 29, 2006 01:35",,Annan asks Israel to end blockade,0,https://www.newssniffer.co.uk/articles/35/vers...


In [14]:
# List the article frame's columns.
article_kdfs.columns

Index(['html', 'url'], dtype='object')

In [29]:
# Attach the scraped HTML to each version row by matching the version's URL
# against the article page URL. Inner join: unmatched rows on either side drop out.
full_kdfs = version_kdfs.merge(
    article_kdfs,
    how='inner',
    left_on='version_url',
    right_on='url',
)

In [16]:
# Peek at one merged row (version metadata plus html/url).
full_kdfs.head(1)

Unnamed: 0,article_id,article_url,source,time_created,time_delta,title,version,version_url,html,url
0,10001,http://news.bbc.co.uk/go/rss/-/1/hi/uk/6115912...,bbc,"November 05, 2006 21:20",1 day later,North East 'warms to devolution',2,https://www.newssniffer.co.uk/articles/10001/v...,"<p><a href=""http://news.bbc.co.uk/go/rss/-/1/h...",https://www.newssniffer.co.uk/articles/10001/v...


In [18]:
# Scratch: take the first two HTML strings for manual inspection.
t = full_kdfs['html'].head(2)

In [21]:
# Scratch: the first element of `t`, kept as a sample value.
s = t.iloc[0]

In [30]:
def f(s) -> str:
    """Return `s` with its first two '</p><p>'-delimited segments removed.

    The string is split on the literal separator '</p><p>'; everything after
    the second separator is returned unchanged (inner separators included).
    If `s` contains fewer than two separators the result is the empty string.

    The `-> str` return annotation lets Koalas `Series.apply` use the declared
    type instead of running the function on a sample to infer it.
    """
    # Only the first two separators matter, so stop splitting after them; the
    # third piece is then exactly the original remainder.
    parts = s.split('</p><p>', 2)
    return parts[2] if len(parts) > 2 else ''

# Derive `body_text` from each row's raw HTML with `f` and store it as a new column.
full_kdfs['body_text'] = full_kdfs['html'].apply(f)

In [33]:
# Split each body into paragraphs on the '</p><p>' separator (reuses the scratch name `t`).
t = full_kdfs['body_text'].str.split('</p><p>')

In [38]:
from collections import defaultdict, Counter

In [39]:
# Empty counter that will accumulate paragraph frequencies.
c = Counter()

In [None]:
# NOTE(review): `counter` is assigned here but never read in the visible notebook.
counter = defaultdict(int)
# Feed every element of `t` into `c`, so `c` ends up counting how often each
# item inside those elements occurs.
# NOTE(review): this iterates a Koalas Series on the driver. Confirm that is
# supported and affordable at this data size; the cell has no recorded execution.
for i in tqdm(t):
    c.update(i)

In [49]:
# List the merged frame's columns, including the derived `body_text`.
full_kdfs.columns

Index(['article_id', 'article_url', 'source', 'time_created', 'time_delta',
       'title', 'version', 'version_url', 'html', 'url', 'body_text'],
      dtype='object')

In [50]:
# Group by article and keep only the version, body_text and title columns
# (reuses the scratch name `t`).
t = full_kdfs.groupby('article_id')[['version', 'body_text', 'title']]

In [52]:
# First row of each article group. The recorded run of this cell was interrupted
# (the Spark job was cancelled), so no result is shown below.
t.head(1)

Py4JJavaError: An error occurred while calling o3814.collectToPython.
: org.apache.spark.SparkException: Job 32 cancelled 
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1824)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2076)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


KeyboardInterrupt: 

In [31]:
# Scratch: display the Koalas Series class.
ks.Series

databricks.koalas.series.Series

In [34]:
# Count null vs non-null `version_url` values in the merged frame.
full_kdfs['version_url'].isnull().value_counts()

False    3470048
Name: version_url, dtype: int64

In [None]:
# Shut down the Spark session.
spark.stop()