In [1]:
!pip install qdrant-client>=1.1.1
!pip install -U sentence-transformers==2.2.2
!pip install -U datasets==2.16.1

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 4.25.3 which is incompatible.[0m[31m
[0mCollecting sentence-transformers==2.2.2
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=9c724abf0149b79e65c57e15438b72ad69e3a8a13465e2b9037c4a983968cdd5
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbb

In [2]:
import time
import math
import torch
from itertools import islice
from tqdm import tqdm
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, concatenate_datasets

In [6]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Fetch the senator tweets dataset by its Hub identifier
DATASET_NAME = "m-newhauser/senator-tweets"
dataset = load_dataset(DATASET_NAME)

Downloading readme:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/79754 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/19939 [00:00<?, ? examples/s]

In [4]:
# Show the loaded splits with their column names and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'id', 'username', 'text', 'party', 'labels', 'embeddings'],
        num_rows: 79754
    })
    test: Dataset({
        features: ['date', 'id', 'username', 'text', 'party', 'labels', 'embeddings'],
        num_rows: 19939
    })
})

In [5]:
# Drop any pre-computed 'embeddings' column so we can practice generating it ourselves
for split_name, split_data in dataset.items():
    if 'embeddings' in split_data.column_names:
        dataset[split_name] = split_data.remove_columns('embeddings')

# Take a look at the dataset: first the split summary, then the first train rows
print(dataset)
train_preview = dataset["train"].to_pandas().head()
train_preview

DatasetDict({
    train: Dataset({
        features: ['date', 'id', 'username', 'text', 'party', 'labels'],
        num_rows: 79754
    })
    test: Dataset({
        features: ['date', 'id', 'username', 'text', 'party', 'labels'],
        num_rows: 19939
    })
})


Unnamed: 0,date,id,username,text,party,labels
0,2021-10-13 19:47:44,1448374915636383745,SenatorHassan,Happy th birthday to the @USNavy! The strength...,Democrat,1
1,2021-06-30 14:53:13,1410250073003462656,SenatorMenendez,The greatest generation's investment in infras...,Democrat,1
2,2021-08-08 01:11:29,1424176405881966599,SenBillCassidy,"Thanks to @SenTedCruz and @SenatorWarnock, th...",Republican,0
3,2021-04-14 14:02:49,1382333523567185921,SenBlumenthal,/ To get lasting change we cant just lock up t...,Democrat,1
4,2021-12-11 16:06:38,1469700160934621188,SenatorBraun,Today were celebrating years of the Hoosier st...,Republican,0


## Load the embedding model from the Hugging Face Hub

In [7]:
# Instantiate the sentence-embedding model on the device selected earlier
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME, device=device)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


## Generate the embeddings

In [8]:
# Create function to generate embeddings (in batches) for a given dataset split
def generate_embeddings(split, batch_size=32):
    """Encode the 'text' column of a dataset split with the global `model`, batch by batch.

    Args:
        split: The dataset split to embed. It must support `len()` and row
            slicing (`split[a:b]`) returning a mapping with a 'text' entry.
        batch_size: Number of rows handed to `model.encode` per call.

    Returns:
        A list holding one embedding per row of `split`, in row order.
    """
    embeddings = []

    # The split's name is only used to label the progress bar. It is found by
    # identity in the global `dataset`; fall back to a generic label instead of
    # raising IndexError when the split is not one of that dict's values.
    split_name = next(
        (name for name, data_split in dataset.items() if data_split is split),
        "unnamed",
    )

    with tqdm(total=len(split), desc=f"Generating embeddings for {split_name} split") as pbar:
        for i in range(0, len(split), batch_size):
            # Slice the rows first, then pick the column: `split['text'][i:j]`
            # would materialise the entire text column on every iteration.
            batch_sentences = split[i:i + batch_size]['text']
            batch_embeddings = model.encode(batch_sentences)
            embeddings.extend(batch_embeddings)
            pbar.update(len(batch_sentences))

    return embeddings

In [9]:
# Embed the train split, then attach the vectors as a new 'embeddings' column
train_split = dataset["train"]
train_embeddings = generate_embeddings(train_split)
dataset["train"] = train_split.add_column("embeddings", train_embeddings)

# Same for the test split
test_split = dataset["test"]
test_embeddings = generate_embeddings(test_split)
dataset["test"] = test_split.add_column("embeddings", test_embeddings)

Generating embeddings for train split: 100%|██████████| 79754/79754 [53:57<00:00, 24.63it/s]
Generating embeddings for test split: 100%|██████████| 19939/19939 [13:09<00:00, 25.24it/s]


## Create a local Qdrant vector database

In [10]:
# Merge the train and test splits so every tweet ends up in one dataset
combined_dataset = concatenate_datasets(
    [dataset[name] for name in ("train", "test")]
)

# Spin up a Qdrant instance that lives in memory
client = QdrantClient(":memory:")

# Describe the vectors to store: sized to the model's output, compared by cosine distance
embedding_dim = model.get_sentence_embedding_dimension()
vector_params = models.VectorParams(
    size=embedding_dim,
    distance=models.Distance.COSINE,
)

# Create the collection that will hold the tweet embeddings
client.create_collection(
    collection_name="senator-tweets",
    vectors_config=vector_params,
)

True

In [11]:
# Helper that splits any iterable into consecutive lists of at most n items
def batched(iterable, n):
    """Yield successive lists of up to `n` items taken from `iterable`.

    The final list may be shorter than `n`; nothing is yielded once the
    iterable is exhausted (or when a slice comes back empty).
    """
    source = iter(iterable)
    while True:
        chunk = list(islice(source, n))
        if not chunk:
            return
        yield chunk

batch_size = 100

# Push the points to Qdrant one batch at a time
for batch in batched(combined_dataset, batch_size):
    ids = []
    vectors = []
    for point in batch:
        # Pull the id and vector out of each row; whatever remains is the payload
        ids.append(point.pop("id"))
        vectors.append(point.pop("embeddings"))

    points = models.Batch(ids=ids, vectors=vectors, payloads=batch)
    client.upsert(collection_name="senator-tweets", points=points)

In [12]:
# Let's see what senators are saying about immigration policy
query_vector = model.encode("Immigration policy").tolist()
hits = client.search(
    collection_name="senator-tweets",
    query_vector=query_vector,
    limit=5,
)

# Print each hit's payload next to its similarity score
for hit in hits:
    print(hit.payload, "score:", hit.score)

{'date': '2021-06-08 20:50:41', 'username': 'SenatorRomney', 'text': 'Some policies that can realistically stem the illegal immigration crisis: - Completion of the barrier at our southern border - Enact mandatory E-Verify - Require asylum seekers to apply in their home country or the nearest safe location', 'party': 'Republican', 'labels': 0} score: 0.6172379504115852
{'date': '2021-11-03 17:56:55', 'username': 'JohnCornyn', 'text': 'Making crisis worse: Biden administration rescinds Trump-era policy limiting migrants at legal ports of entry - CNNPolitics https://t.co/LpSYwdKGER', 'party': 'Republican', 'labels': 0} score: 0.6011508173981881
{'date': '2021-03-04 17:16:42', 'username': 'SenTedCruz', 'text': 'President Bidens immigration policies are dangerous.', 'party': 'Republican', 'labels': 0} score: 0.5960108010346633
{'date': '2021-01-21 01:10:11', 'username': 'SenatorDurbin', 'text': 'With his Executive Orders, President Biden is turning the page on four years of immigration poli

In [13]:
# Most of those tweets are by Republicans... let's see what the Democrats are saying
query_vector = model.encode("Immigration policy").tolist()

# Only keep points whose 'party' payload field equals "Democrat"
party_condition = models.FieldCondition(
    key="party",
    match=models.MatchValue(value="Democrat"),
)
party_filter = models.Filter(must=[party_condition])

hits = client.search(
    collection_name="senator-tweets",
    query_vector=query_vector,
    query_filter=party_filter,
    limit=5,
)

# Print each hit's payload next to its similarity score
for hit in hits:
    print(hit.payload, "score:", hit.score)

{'date': '2021-01-21 01:10:11', 'username': 'SenatorDurbin', 'text': 'With his Executive Orders, President Biden is turning the page on four years of immigration policies that dragged our country backwards. Today, we move forward with a vision that reflects our proud heritage as a nation of immigrants. My full statement: https://t.co/Rwmu2esKnm', 'party': 'Democrat', 'labels': 1} score: 0.5954160068493908
{'date': '2021-05-09 23:00:00', 'username': 'SenatorSinema', 'text': 'Our Bipartisan Border Solutions Act will: - Create regional processing centers along the border - Provide more resources to improve the asylum process - Require @DHSgov to tell communities before releasing migrants https://t.co/e2yAL4TfH4', 'party': 'Democrat', 'labels': 1} score: 0.5691307755465603
{'date': '2021-07-10 16:00:02', 'username': 'SenatorSinema', 'text': 'Our Bipartisan Border Solutions Act with @JohnCornyn will help address the migrant crisis by creating regional processing centers to more quickly proc