In [None]:
## Download the ALBERT model archive and unpack it locally.
# `sys` must be imported here: it is used below to extend the import path,
# and no earlier cell brings it into scope.
import sys
import zipfile

# Make the project-level `util` package (one directory up) importable.
sys.path.insert(0, '../')
from util import util_data_access

# Fetch the archive to ./albert.zip, then extract its contents into ./albert/.
util_data_access.download_file('albert.zip', 'spark-nlp/albert_xxlarge_uncased_en_2.5.0_2.4_1588073588232.zip')
with zipfile.ZipFile('albert.zip', 'r') as zip_ref:
    zip_ref.extractall('albert')

# Load Spark

In [1]:
import sparknlp
from pyspark.sql import SparkSession
# spark = sparknlp.start()

# Build (or reuse) the Spark session with the Spark NLP jar on the classpath.
# The surrounding parentheses handle line continuation, so no backslashes are needed.
spark = (
    SparkSession.builder
    .config("spark.executor.instances", "50")
    .config("spark.driver.memory", "15g")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.7.5")
    .getOrCreate()
)

In [2]:
# Display the session object to confirm it was created.
spark

# Set up Spark-NLP

In [3]:
from pyspark.ml import PipelineModel, Pipeline
import sparknlp.annotator as sa
import sparknlp.base as sb
# import sparknlp
from sparknlp import Finisher

In [4]:
# Stage 1: wrap the raw "summary" text column as a Spark NLP document annotation.
documenter = (
    sb.DocumentAssembler()
    .setInputCol("summary")
    .setOutputCol("document")
)

# Stage 2: rule-based sentence splitting of each document.
# (A pretrained sa.SentenceDetectorDLModel, loaded from a local spark-nlp cache
# directory, was tried previously as an alternative and is currently disabled.)
sentencer = sa.SentenceDetector().setInputCols(["document"]).setOutputCol("sentences")

# Stage 3: convert the sentence annotations into plain values.
# Metadata output (setIncludeMetadata(True)) is intentionally left off.
finisher = Finisher().setInputCols(["sentences"])

# All stages are used as transformers directly, so no fit step is required.
sd_pipeline = PipelineModel(stages=[documenter, sentencer, finisher])

# Try Sentence Tokenizing on Our Own Data

In [4]:
import sqlite3
import pandas as pd
from tqdm.auto import tqdm
import pyspark.sql.functions as F
# import unidecode

In [5]:
# Open the SQLite database of article versions from the working directory.
# (Alternative location: '../data/diffengine-diffs/db/newssniffer-nytimes.db')
conn = sqlite3.connect('newssniffer-nytimes.db')

# Development sample: every version row belonging to up to 200 distinct entry_ids.
# For the full table, select entry_id, summary, version from entryversion
# without the IN filter.
df = pd.read_sql(
    sql='''
     SELECT * from entryversion 
     WHERE entry_id IN (SELECT distinct entry_id FROM entryversion LIMIT 200)
 ''',
    con=conn,
)

In [6]:
# Replace the literal '</p><p>' markup with a single space.
# regex=False makes the literal match explicit: the default for `regex`
# differs between pandas versions, and this pattern is not a regular expression.
# The lambda parameter is named `frame` so it does not shadow the outer `df`.
df = df.assign(
    summary=lambda frame: frame['summary'].str.replace('</p><p>', ' ', regex=False)
)

In [7]:
# Convert the pandas frame into a Spark DataFrame so the pipeline can transform it.
sdf = spark.createDataFrame(df)

In [None]:
# Apply the sentence-detection pipeline (sd_pipeline) to the Spark DataFrame.
annotations_df = sd_pipeline.transform(sdf)

In [49]:
# One row per sentence: posexplode yields (position, value) pairs, named here
# directly as sent_idx / sentence instead of renaming the default pos / col afterwards.
sent_list_df = annotations_df.select(
    "entry_id",
    "version",
    F.posexplode("finished_sentences").alias("sent_idx", "sentence"),
)

# Pandas copy of the sentence table for local inspection.
tdf = sent_list_df.toPandas()

In [54]:
# Self-join the sentence table on (entry_id, version) so that every sentence of a
# version is paired with every sentence of that same version (including itself).
exploded_sent_df = (
    sent_list_df.alias("sent_list_df")
    .join(
        sent_list_df.alias("sent_list_df_2"),
        on=(
            (F.col("sent_list_df.entry_id") == F.col("sent_list_df_2.entry_id"))
            & (F.col("sent_list_df.version") == F.col("sent_list_df_2.version"))
        ),
        how="inner",
    )
    .select(
        F.col("sent_list_df.entry_id"),
        F.col("sent_list_df.version"),
        # *_x columns come from the left side of the join, *_y from the right.
        F.col("sent_list_df.sent_idx").alias("sent_idx_x"),
        F.col("sent_list_df_2.sent_idx").alias("sent_idx_y"),
        F.col("sent_list_df.sentence").alias("sentence_x"),
        F.col("sent_list_df_2.sentence").alias("sentence_y"),
    )
)

In [55]:
exploded_sent_df.show()


## TODO:
## 0. Do this same procedure for diffed sequential versions.

## 1a. Tokenize and use ALBERT, BERT, or Word2Vec to generate embedding vectors for each sentence.
## 1b. Lemmatize each sentence.

## 2. Take Sim_asym along each row, twice, using:
##    a. phi(x, y) = vec(x) \cdot vec(y)
##    b. phi(x, y) = lemmatization

## 3. For each sentence, select the argmax in both directions.
## 4. Choose some reasonable threshold.

## 5. For scores above this threshold, co

+--------+-------+----------+----------+-------------------+--------------------+
|entry_id|version|sent_idx_x|sent_idx_y|         sentence_x|          sentence_y|
+--------+-------+----------+----------+-------------------+--------------------+
|  548743|      1|         0|         0|FORT COLLINS, Colo.| FORT COLLINS, Colo.|
|  548743|      1|         0|         1|FORT COLLINS, Colo.|— Annie Hartnett ...|
|  548743|      1|         0|         2|FORT COLLINS, Colo.|Now 21 and a lead...|
|  548743|      1|         0|         3|FORT COLLINS, Colo.|“I would still sa...|
|  548743|      1|         0|         4|FORT COLLINS, Colo.|“When you’re voti...|
|  548743|      1|         0|         5|FORT COLLINS, Colo.|” So on Saturday ...|
|  548743|      1|         0|         6|FORT COLLINS, Colo.|Each party used t...|
|  548743|      1|         0|         7|FORT COLLINS, Colo.|But Mr. Obama, tr...|
|  548743|      1|         0|         8|FORT COLLINS, Colo.|“I’m counting on ...|
|  548743|      

In [56]:
import difflib

In [32]:
# Display the pandas copy of the sentence table.
tdf

Unnamed: 0,entry_id,version,pos,col
0,547988,0,0,"In Silicon Valley, Apple just won big against ..."
1,547988,0,1,"Across the country, in a federal court in Flor..."
2,547988,0,2,"Mr. Stadnyk, who holds a patent on a motorcycl..."
3,547988,0,3,"Represented by a prominent Washington lawyer, ..."
4,547988,0,4,Mr. Stadnyk and his lawyer — along with some a...
5,547988,0,5,"The present system, one of the nation’s oldest..."
6,547988,0,6,The impending law would overturn that by award...
7,547988,0,7,"Mr. Stadnyk, 48, a garage inventor who stumble..."
8,547988,0,8,He devised a system of brackets and gears to a...
9,547988,0,9,"With his system, he says, the rider feels a fl..."


In [10]:
# Run the sentence-detection pipeline over `df` in batches of entry_ids and
# collect each batch's result as a pandas DataFrame in `output_dfs`.
chunksize = 10000
unique_entryids = df['entry_id'].unique()

# Ceiling division: the final, partially filled chunk must be processed too.
# Flooring (int(n / chunksize)) silently dropped the trailing ids and produced
# zero chunks whenever there were fewer than `chunksize` distinct entry_ids.
num_chunks = (unique_entryids.shape[0] + chunksize - 1) // chunksize

output_dfs = []
for chunk_id in tqdm(range(num_chunks)):
    # Slicing past the end of the array is safe; the last batch is just shorter.
    batch_ids = unique_entryids[chunk_id * chunksize: (chunk_id + 1) * chunksize]
    small_df = df.loc[df['entry_id'].isin(batch_ids)]

    # pandas -> Spark, transform, then back to pandas for this batch only.
    sdf = spark.createDataFrame(small_df)
    annotations_df = sd_pipeline.transform(sdf)
    t_df = annotations_df.toPandas()
    output_dfs.append(t_df)

  0%|          | 0/8 [00:00<?, ?it/s]

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 43196)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/python3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/python3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/python3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:35723)
Traceback (most recent call last

Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:35723)

# Get Albert Embeddings

In [77]:
# Wrap the raw "summary" text column as a Spark NLP document annotation.
document_assembler = (
    sb.DocumentAssembler()
    .setInputCol("summary")
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = sa.TokenizerModel().setInputCols(["document"]).setOutputCol("token")

# Load the saved ALBERT embeddings model; it reads both the document and token columns.
word_embeddings = (
    sa.AlbertEmbeddings.load('s3://aspangher/spark-nlp/albert_xxlarge_uncased_en')
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

In [19]:
ls /notebooks/edit-project/notebooks/albert/metadata/part-0000*

/notebooks/edit-project/notebooks/albert/metadata/part-00000


ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:40323)
Traceback (most recent call last):
  File "/python3/lib/python3.7/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/python3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:40323)

In [None]:
# Chain the embedding stages in order: document -> token -> embeddings.
embedding_stages = [document_assembler, tokenizer, word_embeddings]
bert_pipeline = PipelineModel(stages=embedding_stages)

In [79]:
# Run the embedding stages on `sdf`. `bert_pipeline` is already a PipelineModel,
# so it is applied with transform() only; the earlier fit-then-transform call is
# kept below for reference.
# df_bert = bert_pipeline.fit(sdf).transform(sdf)
df_bert = bert_pipeline.transform(sdf)

IllegalArgumentException: "requirement failed: Wrong or missing inputCols annotators in ALBERT_EMBEDDINGS_8e99d8f61799.\n\nCurrent inputCols: token. Dataset's columns:\n(column_name=index,is_nlp_annotator=false)\n(column_name=version,is_nlp_annotator=false)\n(column_name=title,is_nlp_annotator=false)\n(column_name=created,is_nlp_annotator=false)\n(column_name=url,is_nlp_annotator=false)\n(column_name=source,is_nlp_annotator=false)\n(column_name=entry_id,is_nlp_annotator=false)\n(column_name=archive_url,is_nlp_annotator=false)\n(column_name=num_versions,is_nlp_annotator=false)\n(column_name=summary,is_nlp_annotator=false)\n(column_name=joint_key,is_nlp_annotator=false)\n(column_name=id,is_nlp_annotator=false)\n(column_name=document,is_nlp_annotator=true,type=document)\n(column_name=token,is_nlp_annotator=true,type=token).\nMake sure such annotators exist in your pipeline, with the right output names and that they have following annotator types: document, token"

In [None]:
# Convert the Spark result with the embeddings into a pandas DataFrame.
t2_df = df_bert.toPandas()