# Sentence Transformer Functionalities

## Semantic Similarity

In [1]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('paraphrase-distilroberta-base-v1') ##  SBERT paper: Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks

In [2]:
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
sentences = ['I am writing this notebook on Azure Machine Learning framework',
    'The first certification in AWS is the Cloud practioner']
sentence_embeddings = model.encode(sentences)

print("Sentence embeddings:")
print(sentence_embeddings) #Semantically meaningful sentence embedings
#Map each sentence to a vector space such that semantically similar sentences are close

Sentence embeddings:
[[-0.07435048 -0.3073893  -0.2914517  ... -0.14530991 -0.21303491
  -0.12939769]
 [ 0.04234526 -0.06872454 -0.16243754 ... -0.55217934  0.24117126
   0.23806   ]]


In [3]:
len(sentence_embeddings[0])

768

In [4]:
# Corpus with example sentences
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'A cheetah is running behind its prey.'
          ]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)




In [5]:
# Query sentences:
queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.']

In [6]:
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))





Query: A man is eating pasta.

Top 5 most similar sentences in corpus:
A man is eating food. (Score: 0.7096)
A man is eating a piece of bread. (Score: 0.6074)
A man is riding a horse. (Score: 0.3360)
A man is riding a white horse on an enclosed ground. (Score: 0.3069)
A woman is playing violin. (Score: 0.2378)




Query: Someone in a gorilla costume is playing a set of drums.

Top 5 most similar sentences in corpus:
A monkey is playing drums. (Score: 0.6842)
A woman is playing violin. (Score: 0.3762)
A man is riding a horse. (Score: 0.3079)
A cheetah is running behind its prey. (Score: 0.2760)
A man is eating a piece of bread. (Score: 0.2495)




Query: A cheetah chases prey on across a field.

Top 5 most similar sentences in corpus:
A cheetah is running behind its prey. (Score: 0.7814)
A monkey is playing drums. (Score: 0.2824)
A man is riding a white horse on an enclosed ground. (Score: 0.2208)
A man is riding a horse. (Score: 0.2017)
A man is eating food. (Score: 0.1886)


In [7]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Two lists of sentences
sentences1 = ['The cat sits outside',
             'A man is playing guitar',
             'The new movie is awesome']

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']

#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

#Compute cosine-similarits
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

#Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.4579
A man is playing guitar 		 A woman watches TV 		 Score: 0.1759
The new movie is awesome 		 The new movie is so great 		 Score: 0.9283


In [8]:
cosine_scores

tensor([[ 0.4579,  0.1059,  0.1447],
        [ 0.1239,  0.1759, -0.0344],
        [ 0.1696,  0.1313,  0.9283]])

## Name Entity Recognition

In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [10]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

In [11]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
# example1='Hugging Face Inc. is a company based in New York City. "\" Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window.'
example1='Coursera Inc is an American massive open online course provider founded in 2012 by Stanford University computer science professors Andrew Ng and Daphne Koller.'

In [12]:
ner_results = nlp(example1)
ner_results

[{'word': 'Course',
  'score': 0.9995938539505005,
  'entity': 'B-ORG',
  'index': 1},
 {'word': '##ra', 'score': 0.9989863038063049, 'entity': 'I-ORG', 'index': 2},
 {'word': 'Inc', 'score': 0.9994171857833862, 'entity': 'I-ORG', 'index': 3},
 {'word': 'American',
  'score': 0.9992867708206177,
  'entity': 'B-MISC',
  'index': 6},
 {'word': 'Stanford',
  'score': 0.9990172982215881,
  'entity': 'B-ORG',
  'index': 16},
 {'word': 'University',
  'score': 0.9960840344429016,
  'entity': 'I-ORG',
  'index': 17},
 {'word': 'Andrew',
  'score': 0.9997637867927551,
  'entity': 'B-PER',
  'index': 21},
 {'word': 'Ng', 'score': 0.999763548374176, 'entity': 'I-PER', 'index': 22},
 {'word': 'Daphne',
  'score': 0.9996672868728638,
  'entity': 'B-PER',
  'index': 24},
 {'word': 'Ko', 'score': 0.9997257590293884, 'entity': 'I-PER', 'index': 25},
 {'word': '##ller',
  'score': 0.9985135793685913,
  'entity': 'I-PER',
  'index': 26}]

## Sentiment Analysis

In [13]:
nlp2 = pipeline("sentiment-analysis")
sent_analysis=nlp2(example1)

In [14]:
sent_analysis

[{'label': 'POSITIVE', 'score': 0.9041023850440979}]

## Key word extraction

In [15]:
example1

'Coursera Inc is an American massive open online course provider founded in 2012 by Stanford University computer science professors Andrew Ng and Daphne Koller.'

In [16]:
from rake_nltk import Rake
r = Rake() # Uses stopwords for english from NLTK, and all puntuation characters.
r.extract_keywords_from_text(example1)
r.get_ranked_phrases() # To get keyword phrases ranked highest to lowest.

['stanford university computer science professors andrew ng',
 'american massive open online course provider founded',
 'daphne koller',
 'coursera inc',
 '2012']

## Coreference Resolution

In [1]:
# Load your usual SpaCy model (one of SpaCy English models)
import spacy
nlp = spacy.load('en_core_web_sm')


In [2]:
import neuralcoref


  return f(*args, **kwds)


<spacy.lang.en.English at 0x7f76d30a8f98>

In [3]:
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

# You're done. You can now use NeuralCoref the same way you usually manipulate a SpaCy document and it's annotations.
doc = nlp(u'My sister has a dog. She loves him.')


ValueError: [E007] 'neuralcoref' already exists in pipeline. Existing names: ['tagger', 'parser', 'ner', 'neuralcoref']

## References 
* https://pypi.org/project/rake-nltk/
* https://huggingface.co/transformers/usage.html

