<a href="https://colab.research.google.com/github/l-monninger/ahl/blob/main/embedding_pap2pat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Installation 

!pip install kaggle
!pip install pyspark
!pip install feedparser
!pip install torch
!pip install sentence-transformers
!pip install numpy
!pip install nltk
!apt install libkrb5-dev
!pip install sparkmagic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=f6550ddcc833ef8f841a8087c317d090fabc9b26b9d7ef2074caf55bf8477834
  Stored in directory: /root/.cache/pip/wheels/9f/34/a4/159aa12d0a510d5ff7c8f0220abbea42e5d81ecf588c4fd884
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-w

In [3]:
# libraries
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import * 
import glob

In [4]:
from google.colab import drive
drive.mount('/content/drive')

!mkdir ~/.kaggle

Mounted at /content/drive


In [6]:
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/

In [13]:
# Load the sparkmagic extension; the %spark / %%spark magics used in later
# cells come from it.
%load_ext sparkmagic.magics

In [28]:
# Create a SparkSession for application "MyApp" in this kernel, or reuse the
# one that is already running.
local_spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

In [14]:
# Open a sparkmagic session named "spark_session" (Python) against the endpoint
# given by -u; the recorded output reports it as "SparkSession available as 'spark'".
# NOTE(review): the endpoint URL, user name (-a) and password (-p) are written
# directly in this line and are saved with the notebook. Consider supplying
# them from outside the notebook and rotating the exposed password.
%spark add -s spark_session -l python -u http://ec2-34-205-157-250.compute-1.amazonaws.com -a cis545-livy -p lmonn3 -t Basic_Access

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1682522365862_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


In [7]:
!kaggle datasets download -d jackchungchiehyu/big-patent
!unzip -o /content/big-patent.zip 
!ls

Downloading big-patent.zip to /content
100% 2.46G/2.46G [02:18<00:00, 21.5MB/s]
100% 2.46G/2.46G [02:18<00:00, 19.1MB/s]
Archive:  /content/big-patent.zip
  inflating: train/train/a/data000000000000  
  inflating: train/train/a/data000000000001  
  inflating: train/train/a/data000000000002  
  inflating: train/train/a/data000000000003  
  inflating: train/train/a/data000000000004  
  inflating: train/train/a/data000000000005  
  inflating: train/train/a/data000000000006  
  inflating: train/train/a/data000000000007  
  inflating: train/train/a/data000000000008  
  inflating: train/train/a/data000000000009  
  inflating: train/train/a/data000000000010  
  inflating: train/train/a/data000000000011  
  inflating: train/train/a/data000000000012  
  inflating: train/train/a/data000000000013  
  inflating: train/train/a/data000000000014  
  inflating: train/train/a/data000000000015  
  inflating: train/train/a/data000000000016  
  inflating: train/train/a/data000000000017  
  inflating: trai

In [35]:
# List the working directory to confirm the archive was extracted
# (the recorded output shows the train/ and val/ directories).
!ls

big-patent.zip	drive  sample_data  train  val


In [8]:
# Read at most MAX_PATENT_FILES shards of the training split. The frames are
# collected first and concatenated once: calling pd.concat inside the loop
# re-copies every previously loaded row on each iteration.
MAX_PATENT_FILES = 20
patent_files = glob.glob("./train/train/*/data*")[:MAX_PATENT_FILES]
if not patent_files:
  raise FileNotFoundError("no patent shards match ./train/train/*/data*")
patents = pd.concat([pd.read_json(path, lines=True) for path in patent_files])

# vocab maps each distinct whitespace-separated token to a dense integer id,
# assigned in order of first appearance.
vocab = {}
CONTEXT_SIZE = 3
# Each entry is (context, target): context holds the CONTEXT_SIZE tokens that
# precede target in the sentence, nearest token first.
ngrams = []

# Rows without an abstract are skipped instead of failing on .split().
for abstract in patents["abstract"].dropna():
  for s in abstract.split("."):
    sentence = s.split()

    # compute ngrams from tokens; sentences with at most CONTEXT_SIZE tokens
    # contribute none
    ngrams.extend(
        ([sentence[i - j - 1] for j in range(CONTEXT_SIZE)], sentence[i])
        for i in range(CONTEXT_SIZE, len(sentence))
    )

    # add individual words to vocab; len(vocab) is the next unused id
    for word in sentence:
      vocab.setdefault(word, len(vocab))

len(vocab), len(ngrams)

(16089, 977867)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

EMBEDDING_DIM = 32

class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model over token ids.

    The context token ids are embedded, the embeddings are concatenated, and
    two linear layers (with a ReLU in between) produce log-probabilities over
    the vocabulary for the next token.

    Args:
        vocab_size: number of distinct token ids; also the output width.
        embedding_dim: size of each token embedding.
        context_size: number of context tokens per example.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: long tensor of token ids, either a single context of shape
                (context_size,) or a batch of shape (batch, context_size).

        Returns:
            Tensor of shape (1, vocab_size) for a single context, or
            (batch, vocab_size) for a batch.
        """
        embeds = self.embeddings(inputs)
        if inputs.dim() <= 1:
            # Single example: concatenate its context embeddings into one row.
            embeds = embeds.view((1, -1))
        else:
            # Batch: keep the leading batch axis and concatenate the context
            # embeddings of each example.
            embeds = embeds.flatten(start_dim=1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


# Training settings, named so they can be tuned in one place.
SEED = 0
NUM_EPOCHS = 10
LEARNING_RATE = 0.001

# Seed torch before the model is built so the weight initialisation (and with
# it the whole run) is repeatable.
torch.manual_seed(SEED)

losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
# into integer indices and wrap them in tensors). The lookups do not change
# between epochs, so they are done once here rather than inside the loop.
# context_tensor[k] has shape (CONTEXT_SIZE,), target_tensor[k] has shape (1,).
context_tensor = torch.tensor(
    [[vocab[w] for w in context] for context, _ in ngrams], dtype=torch.long
)
target_tensor = torch.tensor(
    [[vocab[target]] for _, target in ngrams], dtype=torch.long
)

for epoch in range(NUM_EPOCHS):
    print("EPOCH", epoch)
    total_loss = 0
    # One SGD update per n-gram, in the order the n-grams were collected.
    for k in range(len(ngrams)):

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_tensor[k])

        # Step 4. Compute your loss function against the target word index
        loss = loss_function(log_probs, target_tensor[k])

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
    print(total_loss)
print(losses)  # summed loss per epoch; check that it trends downward

# To get the embedding of a particular word, e.g. "tool"
print(model.embeddings.weight[vocab["tool"]])

EPOCH 0
5785190.293820312
EPOCH 1


KeyboardInterrupt: ignored

In [28]:
# Show the learned embedding vector for "tool".
tool_vector = model.embeddings.weight[vocab["tool"]]
print(tool_vector)

# Reverse lookup: embedding row index -> word.
inv_map = dict((index, word) for word, index in vocab.items())

# Cosine similarity between the "telecommunication" vector and every embedding
# row, sorted from most to least similar.
sim = nn.CosineSimilarity()
query_vector = model.embeddings.weight[vocab["telecommunication"]]
values, indices = sim(query_vector, model.embeddings.weight).sort(descending=True)

# Print the ten best-ranked words (the recorded output lists the query word
# itself first).
ranked_indices = indices.tolist()
for rank in range(0, 10):
  print(inv_map[ranked_indices[rank]])

tensor([ 2.0161,  1.2888, -0.2356, -0.3546,  2.1612, -2.5729,  0.8988, -0.7141,
         0.1753,  0.7366, -1.6688,  0.4734,  0.6641, -0.2487, -1.0231, -0.2732,
         0.7060,  0.4257,  0.4603,  0.2974,  1.2710,  0.3165, -0.3097,  0.3221,
         0.6625, -1.5410,  1.0366,  0.2899,  0.9294,  0.1526,  0.0120,  0.1418],
       grad_fn=<SelectBackward0>)
telecommunication
landed
disengaged
droplet
inflowing
sack
inductances
01
moveably
biocide


# Patents



In [20]:
%%spark
# Read the patent shards as JSON into a Spark DataFrame and preview it.
# NOTE(review): the recorded run of this cell failed with
# "Path does not exist: file:/train/train/*/data*". An earlier cell downloaded
# the archive to /content on the notebook machine; presumably the session this
# cell runs in cannot see those files. Confirm where the data should live and
# update the path.
patents_df = spark.read.json("/train/train/*/data*")
patents_df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
Path does not exist: file:/train/train/*/data*
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 372, in json
    return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))
  File "/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 117, in deco
    raise converted from None
pyspark.sql.utils.AnalysisException: Path does not exist: file:/train/train/*/data*



## Tokenizing

In [None]:
import nltk
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType, ArrayType, FloatType

# Download the NLTK resources this notebook relies on, in a fixed order.
for resource in (
    'stopwords',
    'punkt',
    'vader_lexicon',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
  nltk.download(resource)

# English stop words as a set, for fast membership tests while tokenizing.
english_stopwords = nltk.corpus.stopwords.words('english')
stopwords = set(english_stopwords)

def tokenize_content(content):
  """Tokenize text into lower-cased alphabetic words, dropping stop words.

  Args:
    content: the text to tokenize. None or an empty string yields an empty
      list; this matters because the function is wrapped as a Spark UDF and a
      null column value reaches a Python UDF as None.

  Returns:
    A list of lower-cased tokens that are purely alphabetic and not in the
    module-level `stopwords` set.
  """
  if not content:
    return []
  # Lower-case once per token, then filter on the lower-cased form.
  lowered = (
    token.lower()
    for token in nltk.word_tokenize(content)
      if token.isalpha()
  )
  return [word for word in lowered if word not in stopwords]

# Spark UDF wrapper around tokenize_content, declared to return an array of strings.
tokenize_content_udf = udf(tokenize_content, ArrayType(StringType()))

def get_sentence_ne(sentence):
  """Return the named-entity chunks of one sentence as space-joined strings.

  The sentence is word-tokenized, part-of-speech tagged and passed through
  nltk.ne_chunk; every subtree of the result becomes one string made of its
  words joined by single spaces.
  """
  tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
  entities = []
  for node in nltk.ne_chunk(tagged_tokens):
    # Only subtrees count as chunks; plain (word, tag) entries are skipped.
    if not isinstance(node, nltk.Tree):
      continue
    entities.append(" ".join(word for word, _tag in node))
  return entities

def get_ne(content):
  """Return the named-entity chunks found anywhere in a text.

  Args:
    content: the text to analyse. None or an empty string yields an empty
      list; this matters because the function is wrapped as a Spark UDF and a
      null column value reaches a Python UDF as None.

  Returns:
    A flat list of entity strings, in sentence order, as produced by
    get_sentence_ne for each sentence of the text.
  """
  if not content:
    return []
  sentences = nltk.sent_tokenize(content)
  # Flatten the per-sentence entity lists into a single list.
  return [ne for sentence in sentences for ne in get_sentence_ne(sentence)]

# Spark UDF wrapper around get_ne, declared to return an array of strings.
get_ne_udf = udf(get_ne, ArrayType(StringType()))

In [None]:
%%spark
# Add an "abstract_tokens" column by applying tokenize_content_udf to "abstract".
# NOTE(review): tokenize_content_udf is defined in a cell without the %%spark
# magic; confirm it is actually available in the session this cell runs in.
patents_df = patents_df.withColumn("abstract_tokens", tokenize_content_udf("abstract"))
patents_df.show()

In [None]:
%%spark
# Add an "abstract_ne" column by applying get_ne_udf to "abstract".
# NOTE(review): get_ne_udf is defined in a cell without the %%spark magic;
# confirm it is actually available in the session this cell runs in.
patents_df = patents_df.withColumn("abstract_ne", get_ne_udf("abstract"))
patents_df.show()

### Top Tokens

In [None]:
from collections import Counter

def get_top(tokens, *, num : int = 10):
  """Return the `num` most frequent tokens, most frequent first.

  Args:
    tokens: iterable of hashable tokens to count.
    num: keyword-only; how many distinct tokens to return (default 10).

  Returns:
    A list of tokens ordered by descending count, without the counts.
  """
  ranked = Counter(tokens).most_common(num)
  return [entry[0] for entry in ranked]

# Spark UDF wrapper around get_top, declared to return an array of strings.
# `num` is keyword-only in get_top and is not supplied here, so its default
# of 10 is what applies.
get_top_10_udf = udf(get_top, ArrayType(StringType()))