###  Retrieval-Augmented Generation (RAG) for Inventory Scoring

#### Import Packages

In [21]:
import os
import sys
import pandas as pd
from sentence_transformers import SentenceTransformer
import weaviate
import json
from weaviate.embedded import EmbeddedOptions

# Make the parent directory importable so project modules one level up can be imported.
parent_dir = os.path.abspath('..')
sys.path.append(parent_dir)

#### Populate Merged Data into vector database

In [2]:
# Load the merged dataset from a CSV path relative to the notebook's working directory.
merged_df = pd.read_csv("../data/merged_data.csv")

In [9]:
# Preview the first five rows of merged_df.
merged_df.head(5)

Unnamed: 0.1,Unnamed: 0,campaign_id,budget_amount,budget_currencycode,KPI,pricing_model,geo_targeting,vertical,targeting,game_key,...,devicetype,devicemake,site_name,matchedfoldposition,browser,impression,engagement,click,CTR,ER
0,0,f4p5a01,277.12,USD,Engagement,CPE,All Country,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,b4f31c796c130fd594d253ba9336a01d/7ae9931edb2ed...,...,Tablet,Samsung,www.yahoo.com,Unknown,Chrome,57.0,0.0,0.0,0.0,0.0
1,1,f4p5a01,277.12,USD,Engagement,CPE,All Country,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,b4f31c796c130fd594d253ba9336a01d/7ae9931edb2ed...,...,Mobile,Apple,www.yahoo.com,Unknown,Mobile Safari,983.0,265.0,71.0,7.222787,26.958291
2,2,f4p5a01,277.12,USD,Engagement,CPE,All Country,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,b4f31c796c130fd594d253ba9336a01d/7ae9931edb2ed...,...,Mobile,Samsung,www.yahoo.com,Unknown,Chrome,9.0,0.0,0.0,0.0,0.0
3,3,f4p5a01,277.12,USD,Engagement,CPE,All Country,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,b4f31c796c130fd594d253ba9336a01d/7ae9931edb2ed...,...,Mobile,Apple,www.the-sun.com,Unknown,Mobile Safari UI/WKWebView,3.0,0.0,0.0,0.0,0.0
4,4,f4p5a01,277.12,USD,Engagement,CPE,All Country,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,b4f31c796c130fd594d253ba9336a01d/7ae9931edb2ed...,...,Mobile,Samsung,currently.att.yahoo.com,Unknown,Chrome Mobile,27.0,3.0,0.0,0.0,11.111111


#### Drop unnecessary columns

In [3]:
# Columns that are not needed for the retrieval step. 'Unnamed: 0.1' and 'Unnamed: 0'
# are leftover index columns (both show up in the merged_df.head() preview).
columns_to_drop = [
    'Unnamed: 0.1', 'Unnamed: 0', 'impression', 'engagement', 'click',
    'budget_currencycode', 'budget_amount', 'pricing_model',
    'devicemake', 'osfamily', 'devicetype',
]

# errors="ignore" keeps this cell re-runnable: merged_df is reassigned here, so a second
# execution would otherwise raise KeyError for columns that were already removed.
merged_df = merged_df.drop(columns=columns_to_drop, errors="ignore")

In [4]:
# Show the columns that remain after the drop.
merged_df.columns

Index(['campaign_id', 'KPI', 'geo_targeting', 'vertical', 'targeting',
       'game_key', 'adformat', 'creative', 'renderingcontext', 'site_name',
       'matchedfoldposition', 'browser', 'CTR', 'ER'],
      dtype='object')

#### Sampling data

In [5]:
# Number of rows per campaign_id, as returned by value_counts().
data_group_by_campaign = merged_df['campaign_id'].value_counts()
data_group_by_campaign

campaign_id
p19pznm    497402
lj8gesi    404475
nzup6yr    302474
noaiquo     83504
woln7nu     78826
            ...  
6rlgajs      2283
vsi8o9t      1734
np5elle      1673
quhw0xz      1220
cvpj79c       806
Name: count, Length: 78, dtype: int64

In [6]:
# Keep only rows with a specific geo target and non-zero CTR and ER.
has_specific_geo = merged_df['geo_targeting'] != 'All Country'
has_nonzero_ctr = merged_df['CTR'] != 0
has_nonzero_er = merged_df['ER'] != 0
filtered_df = merged_df[has_specific_geo & has_nonzero_ctr & has_nonzero_er]

def sample_rows(group, n=100, random_state=1):
    """Return a reproducible random sample of at most ``n`` rows from ``group``.

    Args:
        group: DataFrame to sample from (a single group's rows when used with
            ``groupby(...).apply``).
        n: Maximum number of rows to keep. A group with fewer than ``n`` rows is
            returned in full, in sampled (shuffled) order.
        random_state: Seed forwarded to ``DataFrame.sample`` so repeated runs
            select the same rows.

    Returns:
        DataFrame containing ``min(len(group), n)`` rows of ``group``.
    """
    # Cap the request at the group size; sample() without replacement raises
    # if asked for more rows than exist.
    sample_size = min(len(group), n)
    return group.sample(n=sample_size, random_state=random_state)

# Sample each campaign separately, then flatten the result back to a plain 0..n-1 index.
rows_by_campaign = filtered_df.groupby('campaign_id')
sampled_per_campaign = rows_by_campaign.apply(sample_rows)
sampled_df = sampled_per_campaign.reset_index(drop=True)

In [7]:
# (rows, columns) of the sampled frame.
sampled_df.shape

(1100, 14)

#### Check Null values

In [8]:
# Percentage of missing values in each column of the sampled data. The denominator is
# the row count of the same frame whose nulls are counted (sampled_df, not merged_df);
# dividing by a different frame's length would scale every percentage incorrectly.
missing_percentage = sampled_df.isnull().sum() / len(sampled_df) * 100

# Print the percentage of missing values for each column
print(missing_percentage)

campaign_id            0.0
KPI                    0.0
geo_targeting          0.0
vertical               0.0
targeting              0.0
game_key               0.0
adformat               0.0
creative               0.0
renderingcontext       0.0
site_name              0.0
matchedfoldposition    0.0
browser                0.0
CTR                    0.0
ER                     0.0
dtype: float64


#### Generating Embeddings

In [20]:
# Load the pretrained 'all-MiniLM-L6-v2' SentenceTransformer model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

  return torch._C._cuda_getDeviceCount() > 0


In [19]:
# Columns of sampled_df whose text values are replaced by sentence embeddings.
# Every other column is copied over unchanged.
textual_columns = ['KPI', 'geo_targeting', 'vertical', 'targeting', 'adformat', 'creative', 'renderingcontext', 'matchedfoldposition', 'site_name', 'browser']

# Filled column by column below, in sampled_df's column order.
embeddings_df = pd.DataFrame()

for column in sampled_df.columns:
    if column in textual_columns:
        # Values in these columns repeat across rows, so encode each distinct value once
        # and look the vector up per row instead of running the model for every row.
        # Rows with the same value share one list object, which is fine because the
        # frame is only printed and written to disk here.
        vector_by_value = {
            value: model.encode(value).tolist()
            for value in sampled_df[column].unique()
        }
        embeddings_df[f'{column}_embeddings'] = sampled_df[column].map(vector_by_value.get)
    else:
        embeddings_df[column] = sampled_df[column]


# Demonstrating the structure of the new DataFrame with embeddings
print(embeddings_df.head())

# index=False keeps the row index out of the file, so reading it back does not
# introduce an extra 'Unnamed: 0' column.
embeddings_df.to_csv("../data/sampled_embedded_data.csv", index=False)

2024-03-23 16:11:47,693 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-03-23 16:11:48,176 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 200 0
2024-03-23 16:11:48,370 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/1.1" 200 0
2024-03-23 16:11:48,559 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/1.1" 200 0
2024-03-23 16:11:48,809 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 200 0
2024-03-23 16:11:49,118 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/sentence_bert_config.json HTTP/1.1" 200 0
2024-03-23 16:11:49,441 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json HTTP/

  campaign_id_embeddings                                     KPI_embeddings  \
0                66bpf2h  [0.024438880383968353, 0.07737276703119278, 0....   
1                66bpf2h  [0.024438880383968353, 0.07737276703119278, 0....   
2                66bpf2h  [0.024438880383968353, 0.07737276703119278, 0....   
3                66bpf2h  [0.024438880383968353, 0.07737276703119278, 0....   
4                66bpf2h  [0.024438880383968353, 0.07737276703119278, 0....   

                            geo_targeting_embeddings  \
0  [-0.009008824825286865, 0.013777172192931175, ...   
1  [-0.009008824825286865, 0.013777172192931175, ...   
2  [-0.009008824825286865, 0.013777172192931175, ...   
3  [-0.009008824825286865, 0.013777172192931175, ...   
4  [-0.009008824825286865, 0.013777172192931175, ...   

                                 vertical_embeddings  \
0  [0.036567866802215576, 0.006975037977099419, -...   
1  [0.036567866802215576, 0.006975037977099419, -...   
2  [0.03656786680221

#### Load the embedded sample dataset

In [9]:
# Read back the CSV written by the embedding cell.
# NOTE(review): the *_embeddings columns were written as Python lists, so read_csv
# presumably returns them as strings — parse them (e.g. json.loads) before numeric use; confirm.
embedded_data = pd.read_csv("../data/sampled_embedded_data.csv")

In [10]:
# List the columns of the reloaded embedded dataset.
embedded_data.columns

Index(['campaign_id', 'KPI_embeddings', 'geo_targeting_embeddings',
       'vertical_embeddings', 'targeting_embeddings', 'game_key',
       'adformat_embeddings', 'creative_embeddings',
       'renderingcontext_embeddings', 'site_name_embeddings',
       'matchedfoldposition_embeddings', 'browser_embeddings', 'CTR', 'ER'],
      dtype='object')

#### Create sampled Data dictionary

In [31]:
# Turn the first two rows of merged_df into a list of {column: value} dicts.
# NOTE(review): despite the name, this reads merged_df rather than sampled_df and keeps
# only two rows — confirm this is intentional.
first_two_rows = merged_df.head(2)
sampled_dic = first_two_rows.to_dict(orient='records')

#### Create Weaviate Connection

In [32]:
# Connect to an embedded Weaviate instance. The OpenAI key is read from the
# OPENAI_API_KEY environment variable and passed along as a request header.
openai_headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")}
client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers=openai_headers,
)

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


embedded weaviate is already listening on port 8079


In [34]:
# client.schema.delete_class("inventory_data")

#### Create Weaviate Database Schema

In [35]:
# Create the schema from the JSON definition at ../db/schema.json.
# NOTE(review): presumably fails on re-run if the class already exists — the
# commented-out delete_class call in the previous cell suggests this; confirm.
client.schema.create("../db/schema.json")

#### Storing Data in a Vector Database (Weaviate)

In [36]:
# for value in sampled_dic:
#     client.data_object.create(value, "Inventory_data")
    
# Import the records through the client's batch interface, 100 objects per batch.
client.batch.configure(batch_size=100)
with client.batch as importer:
    for record in sampled_dic:
        importer.add_data_object(data_object=record, class_name="Inventory_data")

In [42]:
# Fetch the stored objects of class "Inventory_data"; the result is kept in
# inventory_data and not displayed in this cell.
inventory_data = client.data_object.get(class_name="Inventory_data")

#### Semantic search - Retrieving Relevant Inventories


In [41]:
# Near-text search for the concept "Sports", returning four properties of the
# two closest Inventory_data objects.
returned_fields = ["campaign_id", "browser", "creative", "targeting"]
sports_query = client.query.get("Inventory_data", returned_fields)
sports_query = sports_query.with_near_text({"concepts": ["Sports"]})
sports_query = sports_query.with_limit(2)
response = sports_query.do()

print(json.dumps(response, indent=4))

{
    "data": {
        "Get": {
            "Inventory_data": [
                {
                    "browser": "Chrome",
                    "campaign_id": "f4p5a01",
                    "creative": "Sensory Video",
                    "targeting": "Opinion Leaders| Investors| Policy Makers| C&I| Golf Enthusiasts"
                },
                {
                    "browser": "Mobile Safari",
                    "campaign_id": "f4p5a01",
                    "creative": "Sensory Video",
                    "targeting": "Opinion Leaders| Investors| Policy Makers| C&I| Golf Enthusiasts"
                }
            ]
        }
    }
}
