In [4]:
import sparknlp
from pyspark.ml import PipelineModel, Pipeline
import sparknlp.annotator as sa
import sparknlp.base as sb
import sparknlp
from sparknlp import Finisher

In [15]:
import sys
sys.path.insert(0, '../')
from util import util_data_access

In [18]:
# download data
util_access.download_file('newssniffer-nytimes.db.gz', 'edit-pathways/dbs/newssniffer-nytimes.db.gz')
! gunzip newssniffer-nytimes.db.gz

# Load Spark

In [6]:
from pyspark.sql import SparkSession
# spark = sparknlp.start()

spark = (
    SparkSession.builder
      .config("spark.executor.instances", "30")
      .config("spark.driver.memory", "20g")
      .config("spark.executor.memory", "20g")
      .config("spark.kryoserializer.buffer.max", "2000M")
      .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.7.5")
      .getOrCreate()
)

In [7]:
spark

# Load Our Own Data

In [8]:
import sqlite3
import pandas as pd
from tqdm.auto import tqdm
import pyspark.sql.functions as F
# import unidecode

# conn = sqlite3.connect('../data/diffengine-diffs/db/newssniffer-nytimes.db')
conn = sqlite3.connect('newssniffer-nytimes.db')

df = pd.read_sql('''
     SELECT * from entryversion 
     WHERE entry_id IN (SELECT distinct entry_id FROM entryversion LIMIT 2)
 ''', con=conn)

# df = pd.read_sql('''
#     SELECT entry_id, summary, version from entryversion 
# ''', con=conn)

df = df.assign(summary=lambda df: df['summary'].str.replace('</p><p>', ' '))

In [9]:
sdf = spark.createDataFrame(df)

# Try Sentence Tokenizing on Our Own Data

In [None]:
documenter = sb.DocumentAssembler()\
    .setInputCol("summary")\
    .setOutputCol("document")

sentencer = (sa.SentenceDetector()
                .setInputCols(["document"])
                .setOutputCol("sentences")            
            )

finisher = (
    Finisher()
    .setInputCols(["sentences"]) 
)

sd_pipeline = PipelineModel(stages=[documenter, sentencer, finisher])

In [None]:
annotations_df = sd_pipeline.transform(sdf)

In [None]:
sent_list_df = (annotations_df
                .select("entry_id", "version", F.posexplode("finished_sentences"))
                .withColumnRenamed('col', 'sentence')
                .withColumnRenamed('pos', 'sent_idx')
               )
# tdf = sent_list_df.toPandas()

In [54]:
exploded_sent_df = (sent_list_df
 .alias("sent_list_df")
 .join(
     sent_list_df.alias("sent_list_df_2"),
     [F.col("sent_list_df.entry_id") == F.col("sent_list_df_2.entry_id"), 
      F.col("sent_list_df.version") == F.col("sent_list_df_2.version"), 
     ], 
     "inner"
 )
 .select(
     F.col("sent_list_df.entry_id"),
     F.col("sent_list_df.version"),
     F.col("sent_list_df.sent_idx").alias("sent_idx_x"),
     F.col("sent_list_df_2.sent_idx").alias("sent_idx_y"),
     F.col("sent_list_df.sentence").alias("sentence_x"),
     F.col("sent_list_df_2.sentence").alias("sentence_y"),
#    .show(truncate=False)
    )
)

In [55]:
exploded_sent_df.show()


## todo: 
## 0. do this same procedure for diffed sequential versions

## 1a. use tokenize and Albert or BERT or Word2Vec to generate vectors of embeddings for each sentence.
## 1b. lemmatize each sentence

## 2. take Sim_asym along each row, two times using:
## a. phi(x, y) = vec(x) \cdot vec(y)
## b. phi(x ,y) = lemmatization

## 3. for each sentence, select the argmax in both directions.
## 4. choose some reasonable threshold.

## 5. For scores above this threshold, co

+--------+-------+----------+----------+-------------------+--------------------+
|entry_id|version|sent_idx_x|sent_idx_y|         sentence_x|          sentence_y|
+--------+-------+----------+----------+-------------------+--------------------+
|  548743|      1|         0|         0|FORT COLLINS, Colo.| FORT COLLINS, Colo.|
|  548743|      1|         0|         1|FORT COLLINS, Colo.|— Annie Hartnett ...|
|  548743|      1|         0|         2|FORT COLLINS, Colo.|Now 21 and a lead...|
|  548743|      1|         0|         3|FORT COLLINS, Colo.|“I would still sa...|
|  548743|      1|         0|         4|FORT COLLINS, Colo.|“When you’re voti...|
|  548743|      1|         0|         5|FORT COLLINS, Colo.|” So on Saturday ...|
|  548743|      1|         0|         6|FORT COLLINS, Colo.|Each party used t...|
|  548743|      1|         0|         7|FORT COLLINS, Colo.|But Mr. Obama, tr...|
|  548743|      1|         0|         8|FORT COLLINS, Colo.|“I’m counting on ...|
|  548743|      

In [None]:
chunksize = 10000
unique_entryids = df['entry_id'].unique()
num_chunks = int(unique_entryids.shape[0] / chunksize)

output_dfs = []
for chunk_id in tqdm(range(num_chunks)):
    batch_ids = unique_entryids[chunk_id * chunksize: (chunk_id + 1) * chunksize]
    small_df = df.loc[lambda df: df['entry_id'].isin(batch_ids)]
    #
    sdf = spark.createDataFrame(small_df)
    #
    annotations_df = sd_pipeline.transform(sdf)
    t_df = annotations_df.toPandas()
    output_dfs.append(t_df)

# Get Albert Embeddings

In [None]:
document_assembler = (
      sb.DocumentAssembler()
        .setInputCol("summary")
        .setOutputCol("document")
)

tokenizer = (
    sa.Tokenizer()
        .setInputCols(["document"])
        .setOutputCol("token")
)
 
word_embeddings = (
    sa.AlbertEmbeddings
        .load('s3://aspangher/spark-nlp/albert_xxlarge_uncased_en')
        .setInputCols(["document", "token"])
        .setOutputCol("embeddings")
)

embeddings_finisher = (
    sb.EmbeddingsFinisher()
            .setInputCols("embeddings")
            .setOutputCols("embeddings_vectors")
            .setOutputAsVector(True)
)

In [None]:
bert_pipeline = Pipeline(stages=
  [
    document_assembler,
    tokenizer,
    word_embeddings,
    embeddings_finisher
  ]
)

In [40]:
df_bert = bert_pipeline.fit(sdf).transform(sdf)
# df_bert = bert_pipeline_model.transform(sdf)

In [42]:
df_bert#.select('entry_id', 'version', 'embedding_vectors')

DataFrame[index: bigint, version: bigint, title: string, created: string, url: string, source: string, entry_id: bigint, archive_url: string, num_versions: bigint, summary: string, joint_key: string, id: string, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, embeddings: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, embeddings_vectors: array<vector>]

In [None]:
t2_df = (df_bert
         .select('entry_id', 'version', 'embeddings_vectors')
         .toPandas()
        )

# With Sentences

In [61]:
from pyspark.ml.feature import Normalizer, SQLTransformer
from pyspark.ml.feature import BucketedRandomProjectionLSH

In [62]:
documenter = (
    sb.DocumentAssembler()
        .setInputCol("summary")
        .setOutputCol("document")
)

sentencer = (
    sa.SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentences")            
)

explode_sentences = (
    SQLTransformer()
     .setStatement("SELECT entry_id, version, POSEXPLODE(sentences) AS (sent_idx, sentence), * FROM __THIS__")
)

documenter = (
    sb.DocumentAssembler()
        .setInputCol("summary")
        .setOutputCol("document")
)

tokenizer = (
    sa.Tokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("token")
)
 
word_embeddings = (
    sa.AlbertEmbeddings
        .load('s3://aspangher/spark-nlp/albert_large_uncased_en')
        .setInputCols(["sentence", "token"])
        .setOutputCol("embeddings")
)

embeddings_finisher = (
    sb.EmbeddingsFinisher()
            .setInputCols("embeddings")
            .setOutputCols("embeddings_vectors")
            .setOutputAsVector(True)
)

explode_word_embeddings = (
    SQLTransformer()
     .setStatement("SELECT entry_id, version, sent_idx, POSEXPLODE(embeddings_vectors) AS (word_idx, word_embedding), * FROM __THIS__")
)

vector_normalizer = (
    Normalizer()
    .setInputCol("word_embedding")
    .setOutputCol("norm_word_embedding")
    .setP(1.0)
)

In [63]:
pipeline = Pipeline(stages=
  [
    documenter,
    sentencer,
    explode_sentences,
    tokenizer,
    word_embeddings,
    embeddings_finisher,
    explode_word_embeddings,
    vector_normalizer
  ]
)

In [64]:
sent_sdf = pipeline.fit(sdf).transform(sdf)

IllegalArgumentException: "requirement failed: Wrong or missing inputCols annotators in REGEX_TOKENIZER_38db707132ad.\n\nCurrent inputCols: sentence. Dataset's columns:\n(column_name=entry_id,is_nlp_annotator=false)\n(column_name=version,is_nlp_annotator=false)\n(column_name=sent_idx,is_nlp_annotator=false)\n(column_name=sentence,is_nlp_annotator=false)\n(column_name=index,is_nlp_annotator=false)\n(column_name=version,is_nlp_annotator=false)\n(column_name=title,is_nlp_annotator=false)\n(column_name=created,is_nlp_annotator=false)\n(column_name=url,is_nlp_annotator=false)\n(column_name=source,is_nlp_annotator=false)\n(column_name=entry_id,is_nlp_annotator=false)\n(column_name=archive_url,is_nlp_annotator=false)\n(column_name=num_versions,is_nlp_annotator=false)\n(column_name=summary,is_nlp_annotator=false)\n(column_name=joint_key,is_nlp_annotator=false)\n(column_name=id,is_nlp_annotator=false)\n(column_name=document,is_nlp_annotator=true,type=document)\n(column_name=sentences,is_nlp_annotator=true,type=document).\nMake sure such annotators exist in your pipeline, with the right output names and that they have following annotator types: document"

In [55]:
emb_sdf = bert_pipeline.fit(sent_list_df).transform(sent_list_df)

In [60]:
snet_list_dfp = word_emb_sdf.limit(5).toPandas()

NameError: name 'word_emb_sdf' is not defined

In [None]:
exploded_word_df = (word_emb_sdf
 .alias("word_emb_df")
 .join(
     sent_list_df.alias("word_emb_df_2"),
     [F.col("word_emb_df.entry_id") == F.col("word_emb_df.entry_id"), 
      F.col("word_emb_df.version") == F.col("word_emb_df.version"), 
     ], 
     "inner"
 )
 .select(
     F.col("word_emb_df.entry_id"),
     F.col("word_emb_df.version"),
     # sent_idx 
     F.col("word_emb_df.sent_idx").alias("sent_idx_x"),
     F.col("word_emb_df_2.sent_idx").alias("sent_idx_y"),
     # word_idx
     F.col("word_emb_df.word_idx").alias("word_idx_x"),
     F.col("word_emb_df_2.word_idx").alias("word_idx_y"),
     # word_emb
     F.col("word_emb_df.word_embedding").alias("word_embedding_x"),
     F.col("word_emb_df_2.word_embedding").alias("word_embedding_y"),
#    .show(truncate=False)
    )
)

In [16]:
df_bert.show()

+-----+-------+--------------------+--------------------+--------------------+-------+--------+--------------------+------------+--------------------+---------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|index|version|               title|             created|                 url| source|entry_id|         archive_url|num_versions|             summary|joint_key|      id|            document|           sentences|               token|          embeddings|  embeddings_vectors|
+-----+-------+--------------------+--------------------+--------------------+-------+--------+--------------------+------------+--------------------+---------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|68763|      0|Activist Challeng...|2012-08-26 22:55:...|http://www.nytime...|nytimes|  547988|https://www.newss...|           2|In Silicon Valley...| 547988-0|547988-0|[[docu

In [17]:
dfp = df_bert.toPandas()

In [None]:
dfp['sentneces']

In [19]:
dfp[['token', 'embeddings_vectors']]

Unnamed: 0,token,embeddings_vectors
0,"[(token, 0, 1, In, {'sentence': '0'}, []), (to...","[[-0.8931788206100464, -0.3664441406726837, -0..."
1,"[(token, 0, 1, In, {'sentence': '0'}, []), (to...","[[-0.8931788206100464, -0.3664441406726837, -0..."
2,"[(token, 0, 9, WASHINGTON, {'sentence': '0'}, ...","[[0.6029991507530212, -0.002772439271211624, -..."
3,"[(token, 0, 9, WASHINGTON, {'sentence': '0'}, ...","[[0.6029991507530212, -0.002772439271211624, -..."
4,"[(token, 0, 9, WASHINGTON, {'sentence': '0'}, ...","[[0.6029991507530212, -0.002772439271211624, -..."


In [35]:
dfp['token'][0][0]

Row(annotatorType='token', begin=0, end=1, result='In', metadata={'sentence': '0'}, embeddings=[])

In [36]:
dfp['token'][0][100]

Row(annotatorType='token', begin=529, end=535, result='himself', metadata={'sentence': '2'}, embeddings=[])

In [26]:
len(dfp['embeddings_vectors'][0])

1570

In [32]:
t = dfp['embeddings_vectors'][0][0]

In [None]:
tokenizer = (
    sa.Tokenizer()
        .setInputCols(["sentences"])
        .setOutputCol("token")
)

word_embeddings = (
    sa.AlbertEmbeddings
        .load('s3://aspangher/spark-nlp/albert_large_uncased_en')
        .setInputCols(["sentences", "token"])
        .setOutputCol("embeddings")
        .setBatchSize(100)
)

embeddings_finisher = (
    sb.EmbeddingsFinisher()
            .setInputCols("embeddings")
            .setOutputCols("embeddings_vectors")
            .setOutputAsVector(True)
)

In [25]:
spark

In [None]:
(df_bert
 .select('entry_id', 'version', 'sentences', 'embeddings_vectors')
 .write.mode("overwrite").parquet("s3://aspangher/tmp/tmp_albert_embeddings"))

In [None]:
df_bert.select('entry_id', 'version', 'embeddings_vectors').show()

In [39]:
pipeline_model = bert_pipeline_from_sentences.fit(spark.createDataFrame([[""]]).toDF("summary"))
result = pipeline_model.transform(spark.createDataFrame(pd.DataFrame({"summary": ["I love NLP"]})))

In [None]:
result.show()

In [None]:
t2_df = (df_bert
         .select('entry_id', 'version', 'sent_idx', 'sentence', 'embeddings_vectors')
         .toPandas()
        )

# Test Data

In [None]:
# test_data = CoNLL().readDataset(spark, 's3://aspangher/spark-nlp/conll/eng.train')

In [None]:
from sparknlp.training import CoNLL
training_data = CoNLL().readDataset(spark, 's3://aspangher/spark-nlp/conll/eng.train')

get_embeddings = (sa.AlbertEmbeddings
        .load('s3://aspangher/spark-nlp/albert_large_uncased_en')
        .setInputCols("sentence", "token")
        .setOutputCol("embeddings")
        .setMaxSentenceLength(100)
        .setBatchSize(8)
)

embeddings_finisher = (
    sb.EmbeddingsFinisher()
        .setInputCols("embeddings")
        .setOutputCols("embeddings_vectors")
        .setOutputAsVector(True)
)

sentence_finisher = (
    Finisher()
       .setInputCols(["sentence"]) 
)

pipeline =  Pipeline(stages=[
    get_embeddings, 
    embeddings_finisher, 
    sentence_finisher
])

pipelineDF = pipeline.fit(training_data).transform(training_data)

(pipelineDF
 .select('finished_sentence', 'embeddings_vectors')
 .write
 .mode("overwrite").parquet('s3://aspangher/tmp/tmp_conll_albert_embeddings.pq')
)

In [29]:
sparknlp.version()

'2.7.5'

In [30]:
spark.version

'2.4.4'

In [None]:
pipelineDF.select('finished_sentence', 'embeddings_vectors').show()

In [None]:
ner_pipeline = RecursivePipeline(stages=[
    DocumentAssembler().setInputCol("text").setOutputCol("document"), 
    SentenceDetector().setInputCols(["document"]).setOutputCol("sentence"), 
    Tokenizer().setInputCols(["sentence"]).setOutputCol("token").setMaxLength(100).setSplitChars(["-", "\xa0", "—"]), 
    BertEmbeddings.pretrained(name = "bert_large_cased", lang='en').setInputCols(['sentence', 'token']).setOutputCol('embeddings'), 
#     NerDLModel.pretrained('onto_bert_large_cased', 'en').setInputCols(['sentence', 'token', 'embeddings']).setOutputCol('ner'), 
#     NerConverter().setInputCols(['sentence', 'token', 'ner']).setOutputCol('ner_chunk') 
])