# Media embeddings extraction using Bert
---

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing


In [2]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-2.5.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
media = pd.read_pickle('/kaggle/input/news-data/final-news-data.pkl')

In [4]:
media['all_text'] = media['Body'].fillna('') + media['title'].fillna('')

In [5]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [6]:
import warnings
warnings.filterwarnings("ignore")

In [7]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def chunk_text(text, chunk_size=250):
    """
    Split text into chunks of approximately `chunk_size` words, respecting word boundaries.

    Parameters:
    - text: Input text to be chunked.
    - chunk_size: Approximate number of words per chunk (default is 250).

    Returns:
    - Generator object yielding chunks of text.
    """
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield ' '.join(words[i:i+chunk_size])

def encode_texts(texts, model, batch_size=200, chunk_size=250):
    """
    Encode a list of texts into sentence embeddings using a SentenceTransformer model.

    Parameters:
    - texts: List of texts to be encoded.
    - model: Pre-trained SentenceTransformer model.
    - batch_size: Batch size for processing texts (default is 200).
    - chunk_size: Approximate number of words per chunk (default is 250).

    Returns:
    - Numpy array containing the encoded embeddings for each text.
    """
    embeddings = []
    for text in texts:
        # For very long texts, chunk them and aggregate embeddings
        if len(text.split()) > chunk_size:
            chunk_embeddings = []
            for chunk in chunk_text(text, chunk_size):
                # Note: The model.encode should be called outside the loop for batch processing.
                # For simplicity and to correct the logic, encoding each chunk separately here,
                # but consider revising for batch processing efficiency.
                chunk_embedding = model.encode(chunk, show_progress_bar=False)
                chunk_embeddings.append(chunk_embedding)
            # Aggregate chunk embeddings by mean
            embeddings.append(np.mean(chunk_embeddings, axis=0))
        else:
            # Directly encode texts that are within the limit
            embedding = model.encode(text, show_progress_bar=False)
            embeddings.append(embedding)
    return np.array(embeddings)

# Assuming `reddit.combined_clean_text` is a list of texts
embeddings = encode_texts(media.all_text, model, batch_size=200, chunk_size=250)

# # Saving and printing embeddings as before
import pickle
with open('embeddings-news-10m.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

print(embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[-2.8520476e-02 -7.2712928e-02  7.9627614e-03 ... -1.4573094e-01
  -1.9283975e-02  2.4514228e-02]
 [ 3.5336073e-03 -3.9134413e-02 -2.6694302e-02 ... -1.8696475e-01
   2.0142529e-02  4.7578689e-02]
 [-1.1761022e-03 -2.7922533e-02  2.6832113e-02 ... -4.6743739e-02
   5.5845478e-03  1.7912003e-04]
 ...
 [-6.8731509e-02  3.9192419e-02  5.6753784e-02 ... -2.8611509e-02
   1.2235467e-04  4.1421957e-02]
 [-4.7707088e-02 -1.0991676e-02 -1.6721513e-02 ... -1.3713768e-01
   3.3668809e-02  4.0773340e-02]
 [-1.0100313e-02 -1.7578686e-02 -1.1585538e-02 ... -4.1612070e-02
   6.9273531e-02  5.1032245e-02]]


---
End of script