# Privacy-aware Personalization
## Text Embeddings

In [1]:
import numpy as np


from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
from nltk.tokenize import sent_tokenize



### Use a Language Model as `Feature Extractor` to encode text as vector embeddings

In [2]:
# Sentence Transformer Model
# Paper: Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks by Nils Reimers and Iryna Gurevych
# https://arxiv.org/abs/1908.10084
# Identifier of the pretrained model handed to SentenceTransformer below.
LANG_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
# Instantiate the model once; later cells call `lang_model.encode(...)` on it.
lang_model = SentenceTransformer(LANG_MODEL)

In [3]:
# Three sample news headlines used for the similarity comparison below:
# t1 and t2 are both about climate / sustainability, t3 is about an unrelated topic.
t1 = """Countries’ climate promises still not enough to avoid catastrophic global warming"""
t2 = """‘Game changer’ ideas on water and sustainability, centre-stage ahead of major water conference"""
t3 = """Putin’s wild claims of a dirty bomb show just how badly his army is faring"""

Compute the Embeddings

In [4]:
# Encode each headline separately with the model loaded above; every call
# yields the embedding for the single string it is given.
t1_embedding = lang_model.encode(t1)
t2_embedding = lang_model.encode(t2)
t3_embedding = lang_model.encode(t3)

In [6]:
# Number of components in one headline embedding.
len(t1_embedding)

384

In [7]:
def cos_sim(a, b):
    """Return the cosine similarity of two vectors.

    Computes ``dot(a, b) / (||a|| * ||b||)`` using NumPy.

    Parameters
    ----------
    a, b : array_like
        Vectors of equal length.

    Returns
    -------
    float
        The cosine similarity. If either vector has zero norm the cosine is
        undefined; ``0.0`` is returned in that case instead of dividing by
        zero (which would yield ``nan`` and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # A zero vector has no direction, so treat it as "not similar".
        return 0.0
    return np.dot(a, b) / norm_product

In [11]:
# Sanity check: the two climate-related headlines (t1, t2) should score a higher
# cosine similarity than t1 paired with the unrelated headline t3.
cos_sim(t1_embedding, t2_embedding) > cos_sim(t1_embedding, t3_embedding)

True

# Document Embeddings from Sentence Embeddings

In [14]:
def get_sentences_from_txt(txt):
    """Split a block of text into sentences.

    Thin wrapper around NLTK's ``sent_tokenize``; the text is passed through
    unchanged and the tokenizer's result is returned as-is.
    """
    sentence_list = sent_tokenize(txt)
    return sentence_list
    
def encode(sentences, model=None):
    """Encode sentences into vector embeddings.

    Parameters
    ----------
    sentences : str or list of str
        Text to encode; forwarded unchanged to ``model.encode``.
    model : optional
        Any object exposing ``encode(sentences)``. Defaults to the
        module-level ``lang_model`` created in the model-setup cell, so the
        model is not re-instantiated on every call.

    Returns
    -------
    Whatever ``model.encode(sentences)`` returns.
    """
    if model is None:
        # Reuse the already-loaded model instead of constructing a new
        # SentenceTransformer for each call.
        model = lang_model
    return model.encode(sentences)

def embedding(sentences):
    """Return a single embedding for a collection of sentences.

    The sentences are encoded with ``encode`` and the resulting embeddings
    are averaged along axis 0.
    """
    return np.mean(encode(sentences), axis=0)

In [15]:
# Sample multi-paragraph article text used to demonstrate a document embedding.
# The literal starts with a newline, which ends up in the first tokenized sentence.
txt = """
A couple of years into the pandemic, Shirley Neville had finally had enough of her crappy internet service.

“It was just a headache,” said Neville, who lives in a middle-class neighborhood in New Orleans whose residents are almost all Black or Latino. “When I was getting ready to use my tablet for a meeting, it was cutting off and not coming on.”

Neville said she was willing to pay more to be able to Zoom without interruption, so she called AT&T to upgrade her connection. She said she was told there was nothing the company could do.

In her area, AT&T only offers download speeds of 1 megabit per second or less, trapping her in a digital Stone Age. Her internet is so slow that it doesn’t meet Zoom’s recommended minimum for group video calls, doesn’t come close to the FCC’s definition of broadband, currently 25 Mbps, and is worlds below median home internet speeds in the U.S., which average 167 Mbps.
"""

In [16]:
# Split the sample article into sentences.
sentences = get_sentences_from_txt(txt)

In [17]:
# Inspect the first sentence (it keeps the leading newline from `txt`).
sentences[0]

'\nA couple of years into the pandemic, Shirley Neville had finally had enough of her crappy internet service.'

In [18]:
# Document embedding: the mean of the per-sentence embeddings.
e = embedding(sentences)

In [19]:
# Display the document embedding vector.
e

array([ 1.66409955e-04, -2.46578753e-02,  1.27157653e-02, -1.39318025e-02,
       -1.95183177e-02, -8.11083429e-03,  2.91513633e-02, -2.32566590e-03,
       -1.82475466e-02, -3.23802163e-03,  3.82250622e-02,  3.92706655e-02,
       -2.80668736e-02, -3.62194292e-02, -2.22149473e-02, -1.47188818e-02,
        5.83654679e-02, -8.96345451e-02, -3.11663654e-02,  3.18242610e-03,
       -1.19665021e-03, -2.53055096e-02,  5.83956987e-02,  5.64749427e-02,
        4.75096703e-02,  3.08116619e-02, -4.12140824e-02, -2.56042685e-02,
       -1.03905173e-02,  1.29802199e-02, -3.84920835e-02,  5.07800020e-02,
        9.66909435e-03,  1.57850515e-02,  2.14994457e-02, -3.52065265e-03,
        3.54105122e-02,  7.28204325e-02,  9.65165813e-03, -1.24073550e-02,
       -7.32069975e-03, -4.41718996e-02, -1.35788731e-02,  2.49458533e-02,
       -8.54263920e-03, -4.47138883e-02,  4.47396003e-02, -2.02657189e-03,
       -1.82979386e-02, -6.51293397e-02, -5.16681513e-03,  1.86813250e-02,
       -2.60702595e-02, -