In [1]:
# Install Qdrant client library
!pip install qdrant-client



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!docker run -p 6333:6333 -d qdrant/qdrant


b573fb78e2501563369071c81b546708188a59ef88c77dfa9b1fca2a7093511b
docker: Error response from daemon: driver failed programming external connectivity on endpoint agitated_pasteur (f4093b053818b6deea0482eb7931b90378d087568e01bfd3970f104f51f6b3d9): Bind for 0.0.0.0:6333 failed: port is already allocated.


In [34]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import pandas as pd
from qdrant_client.models import PointStruct


## Connect to Qdrant

In [22]:
# Open a client against the Qdrant instance listening on localhost:6333
client = QdrantClient(host="127.0.0.1", port=6333)

# Probe the server by listing its collections and report the outcome either way
try:
    response = client.get_collections()
except Exception as exc:
    print(f"Error connecting to Qdrant: {exc}")
else:
    print("Qdrant is running. Collections:", response)


Qdrant is running. Collections: collections=[CollectionDescription(name='contractors')]


## Process Data Using Hugging Face Transformers

In [10]:
from transformers import AutoTokenizer, AutoModel

# Hugging Face checkpoint identifier; the same name is passed to both loaders
# below so the tokenizer and the model come from the same checkpoint.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Tokenizer that turns raw text into model inputs (used by generate_embedding).
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Encoder whose last_hidden_state is pooled into the embedding (used by generate_embedding).
model = AutoModel.from_pretrained(model_name)

def generate_embedding(text):
    """Return a mean-pooled embedding of ``text`` as a plain Python list.

    The text is tokenized with truncation to at most 512 tokens (the same
    limit the later definition in this notebook uses), so over-long inputs are
    cut rather than fed to the model at full length. The token vectors in
    ``last_hidden_state`` are then averaged over the sequence axis.

    Args:
        text: The string to embed.

    Returns:
        The pooled vector converted with ``Tensor.tolist()``.
    """
    # Local import keeps this cell self-contained; torch is already required by
    # the "pt" tensors requested from the tokenizer.
    import torch

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over tokens (dim=1), drop the size-1 batch axis, convert to a list.
    return outputs.last_hidden_state.mean(dim=1).squeeze().tolist()


Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [None]:
# Load the filtered contractor records from disk
df = pd.read_csv(
    "/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/filtered_contractors_data.csv"
)

# Build one text field per row: the descriptive columns joined by single spaces,
# with missing values replaced by empty strings first
df['Combined_Text'] = (
    df[['Company Name', 'Company Size', 'Interests', 'City', 'Address']]
    .fillna('')
    .agg(' '.join, axis=1)
)

def generate_embedding(text):
    """Return a mean-pooled embedding of ``text`` as a plain Python list.

    This definition replaces any ``generate_embedding`` defined earlier in the
    notebook once this cell has run. The text is tokenized with truncation to
    at most 512 tokens, passed through ``model``, and the token vectors in
    ``last_hidden_state`` are averaged over the sequence axis.

    Args:
        text: The string to embed.

    Returns:
        The pooled vector converted with ``Tensor.tolist()``.
    """
    # Local import keeps this cell self-contained; torch is already required by
    # the "pt" tensors requested from the tokenizer.
    import torch

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: without no_grad every row embedded through .apply() would
    # build and then discard an autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over tokens (dim=1), drop the size-1 batch axis, convert to a list.
    return outputs.last_hidden_state.mean(dim=1).squeeze().tolist()

# Embed every row's combined text (one call to generate_embedding per row)
df['embedding'] = df['Combined_Text'].map(generate_embedding)

# Write the enriched DataFrame next to the input file, without the index column.
# NOTE(review): list-valued cells are presumably written as their text
# representation in CSV — confirm before re-loading this file for vectors.
df.to_csv(
    "/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/contractors_with_embeddings.csv",
    index=False,
)


In [None]:
# Preview the first five rows. The frame includes an "Organization Email"
# column, so clear this output before sharing the notebook.
df.head(n=5)

Unnamed: 0,Company Name,Membership Number,Company Size,Organization Email,Address,City,Region,Interests,Combined_Text,embedding
0,Al-Wessam Contracting Company,106110611,Small Company Size,Alwessam7@Gmail.Com,Al Wessam Contracting Company,BILJURASHI,Bahah,"Mining support services, Oil and natural gas e...",Al-Wessam Contracting Company Small Company Si...,"[-0.02744017168879509, -0.03267991170287132, 0..."
1,Inmayoun Contracting Company,215421544,Medium Company Size,Inmayoun@Gmail.Com,abdelaziz,RIYADH,Riyadh,"Construction of buildings, Construction of bui...",Inmayoun Contracting Company Medium Company Si...,"[0.027565114200115204, -0.07196860015392303, 0..."
2,Awared General Contracting Company,160916095,Small Company Size,Fared@Fared-Est.Com,Riyadh - alezdehar District -,RIYADH,Riyadh,"Construction of buildings, Construction of bui...",Awared General Contracting Company Small Compa...,"[0.06581185013055801, -0.11605323851108551, 0...."
3,acn solutions for contracting,446844684,Very Small Company Size,Alwa7ed@Hotmail.Com,jeddah al shiraa dis.,JEDDAH,Makkah,"Waste collection, treatment & disposal activit...",acn solutions for contracting Very Small Compa...,"[-0.02718176506459713, -0.07887285202741623, 0..."
4,Ratel Al Sharq Contracting Company,101010104,Small Company Size,Adel_77@Hotmail.Com,Riyadh - Al Narjis District - Anas Bin Malik Road,RIYADH,Riyadh,"Construction of buildings, Construction of bui...",Ratel Al Sharq Contracting Company Small Compa...,"[0.0501224510371685, 0.037430986762046814, -0...."


## Store Data in Qdrant

In [None]:
# Create a new collection in Qdrant.
# NOTE: recreate_collection is documented to delete an existing collection of
# the same name first, so re-running this cell starts from an empty
# "contractors" collection.
client.recreate_collection(
    collection_name="contractors",
    vectors_config=VectorParams(
        size=384,  # expected to equal the length of each vector in df['embedding']
        distance=Distance.COSINE
    )
)

# Columns copied from each DataFrame row into the point payload
payload_columns = ["Company Name", "Company Size", "Interests", "City", "Address"]

# Preparing points to upload to Qdrant.
# Missing values (NaN) are stored as None: NaN is not valid JSON, so leaving it
# in the payload can make the upload request fail.
points = [
    PointStruct(
        id=i,
        vector=row['embedding'],
        payload={
            column: (None if pd.isna(row[column]) else row[column])
            for column in payload_columns
        }
    )
    for i, row in df.iterrows()
]

# Upload points to Qdrant in fixed-size batches so a large DataFrame does not
# turn into a single oversized request.
batch_size = 256
for start in range(0, len(points), batch_size):
    client.upsert(
        collection_name="contractors",
        points=points[start:start + batch_size],
    )

print("Embeddings uploaded successfully!")


Embeddings uploaded successfully!


In [33]:
# Sanity check: show the Python type of the first stored embedding
print(type(df['embedding'].iat[0]))


<class 'list'>
