## This notebook uses Google's BERT (`bert-base-uncased`) to compute embeddings for the `content` and `url` columns.

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd
from bertopic.backend import BaseEmbedder
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

In [8]:
# Read only the id / url / content columns from the first 1000 rows of http.csv.
df = pd.read_csv(
    "../r4.2/http.csv",
    usecols=['url', 'content', 'id'],
    nrows=1000,
)
df

Unnamed: 0,id,url,content
0,{V1Y4-S2IR20QU-6154HFXJ},http://msn.com/The_Human_Centipede_First_Seque...,remain representatives consensus concert altho...
1,{Q5R1-T3EF87UE-2395RWZS},http://urbanspoon.com/Plunketts_Creek_Loyalsoc...,festival off northwards than congestion partne...
2,{X9O1-O0XW52VO-5806RPHG},http://aa.com/Rhodocene/rhodocenium/fhaavatqrf...,long away reorganized baldwin seth business 18...
3,{G5S8-U5OG04TE-5299CCTU},http://groupon.com/Leonhard_Euler/leonhard/tne...,among german schwein experimental becomes prev...
4,{L0R4-A9DH29VP-4553AUWM},http://flickr.com/Inauguration_of_Barack_Obama...,kate criteria j 2008 highest 12 include books ...
...,...,...,...
995,{N7T0-L8DL49UV-2320NKFL},http://hulu.com/Gray_mouse_lemur/lemur/qvfpbha...,illegal neither purchased average live intrasp...
996,{Z6N0-R9AN55GC-9305HGQD},http://networksolutions.com/No_Line_on_the_Hor...,heard expected de never return out for detaile...
997,{H5M4-Q9RY97SC-6665MZVC},http://hulu.com/Gray_mouse_lemur/lemur/qvfpbha...,whom this weeks sometimes showed favored clutc...
998,{Z6E1-P5XP02OQ-6516ESQA},http://wordpress.org/The_Green_Dartmouth_Colle...,connect able your evident dropped insignifican...


In [9]:
# Fetch the pretrained tokenizer and encoder for "bert-base-uncased",
# then place the encoder on the GPU.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
topic_model = AutoModel.from_pretrained("bert-base-uncased")
topic_model = topic_model.to('cuda')

In [10]:
# Enable multi-GPU usage
# Start from the plain encoder so `model` is always defined; previously it was
# only assigned inside the multi-GPU branch, so a single-GPU machine hit a
# NameError on the `.to('cuda')` line below.
model = topic_model
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    # Replicate the encoder across all visible GPUs and split each batch between them.
    model = torch.nn.DataParallel(topic_model)

model = model.to('cuda')  # Move model to GPU(s)

Using 2 GPUs


In [11]:
# Function to get embeddings
def get_bert_embeddings(texts):
    """Embed a batch of texts with the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    texts : str or iterable of str
        A single string or a batch of strings. Any non-string iterable
        (tuple, Series, ...) is converted to a list before tokenizing; a
        single string is passed through unchanged.

    Returns
    -------
    torch.Tensor
        CPU tensor holding, for every input text, the hidden state at
        sequence position 0 (the ``[CLS]`` token for BERT tokenizers).
    """
    # Normalise batch containers to a plain list; a bare str must NOT be
    # passed through list(), which would split it into characters.
    if not isinstance(texts, str):
        texts = list(texts)

    # Inputs are padded to the longest text in the batch and truncated at 512 tokens.
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to('cuda')

    # Inference only: no_grad() stops autograd from recording the forward pass,
    # so intermediate activations are not kept alive for a backward pass.
    with torch.no_grad(), torch.cuda.amp.autocast():  # Mixed precision for memory efficiency
        outputs = model(**inputs)

    # Keep only position 0 of every sequence and move the result to host memory.
    cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu()
    return cls_embeddings

## First Get Embeddings for Content 

In [12]:
# Iterate over the content column in fixed-size batches
batch_size = 312  # Adjust based on memory usage
dataloader = DataLoader(dataset=df['content'], batch_size=batch_size)

In [13]:
# Embed every batch and flatten the results into one list with a single
# 1-D numpy vector per row.
all_embeddings = [
    vector
    for batch in dataloader
    for vector in get_bert_embeddings(batch).numpy()
]

# Replace the content column with its embedding vectors
df['content'] = all_embeddings

## Get Embeddings for URL 

In [14]:
# Iterate over the url column in fixed-size batches
batch_size = 312  # Adjust based on memory usage
dataloader = DataLoader(dataset=df['url'], batch_size=batch_size)

In [15]:
# Embed every batch and flatten the results into one list with a single
# 1-D numpy vector per row.
all_embeddings = [
    vector
    for batch in dataloader
    for vector in get_bert_embeddings(batch).numpy()
]

# Replace the url column with its embedding vectors
df['url'] = all_embeddings

In [17]:
# Sanity check: dimensionality of the embedding stored at row label 0
len(df.loc[0, 'url'])

768