In [None]:
%pip install langchain accelerate transformers tensorflow  -q
# NOTE(review): none of these packages is version-pinned. The resolver output
# captured for this cell shows numpy ending up at 1.26.4, which conflicts with
# the installed pymc3 and scipy -- consider pinning versions here.


Python interpreter will be restarted.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pymc3 3.11.5 requires numpy<1.22.2,>=1.15.0, but you have numpy 1.26.4 which is incompatible.
scipy 1.7.3 requires numpy<1.23.0,>=1.16.5, but you have numpy 1.26.4 which is incompatible.
Python interpreter will be restarted.


In [None]:
# IMPORTS
import os
from io import BytesIO

import boto3
import pandas as pd
import torch
from langchain import PromptTemplate, LLMChain
from langchain import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModelForQuestionAnswering



In [None]:
def list_folders_in_folder(bucket_name, folder_path, s3_client=None):
    """
    List the folders directly under a particular folder (prefix) in an S3 bucket.

    All result pages are followed, so prefixes holding more entries than a
    single ``list_objects_v2`` response can carry are listed completely.

    Args:
        bucket_name (str): Name of the S3 bucket.
        folder_path (str): Path to the folder (prefix) in the bucket.
        s3_client: Optional S3 client to use. When omitted, a new one is
            created with ``boto3.client('s3')``.

    Returns:
        tuple: ``(folders_path, folders_in_folder)`` where ``folders_path`` is
        the list of full sub-folder prefixes (each ending in '/') and
        ``folders_in_folder`` is the list of the corresponding folder names,
        in the same order.
    """
    s3 = s3_client if s3_client is not None else boto3.client('s3')

    # Delimiter='/' makes S3 roll everything below the next '/' up into
    # 'CommonPrefixes', i.e. one entry per immediate sub-folder.
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=folder_path, Delimiter='/')

    folders_path = [
        common_prefix['Prefix']
        for page in pages
        for common_prefix in page.get('CommonPrefixes', [])
    ]

    # Every common prefix ends with the delimiter, so the folder name is the
    # second-to-last piece of the split.
    folders_in_folder = [path.split('/')[-2] for path in folders_path]

    return folders_path, folders_in_folder

# Bucket and root prefix that hold one sub-folder per insight category.
bucket_name = "analyst-adhoc"
folder_path = "Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/"

# Full prefixes and bare names of the category folders, in matching order.
folders_path, folders_in_folder = list_folders_in_folder(bucket_name, folder_path)

for listing in (folders_path, folders_in_folder):
    print(listing)

['Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/City/', 'Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/Creative/', 'Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/Demostats/', 'Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/Eshopper/', 'Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/Impressions_count/', 'Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/Numeris/', 'Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/Prizm/', 'Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/Region/', 'Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/Strategy/', 'Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/Tactic/', 'Campaign_Snapshot_Canada/Test_Canada_Campaign_Snapshot_MiQ_LLM/Temporal/']
['City', 'Creative', 'Demostats', 'Eshopper', 'Impressions_count', 'Numeris', 'Prizm', 'Region', 'Strategy', 'Tactic', 'Temporal']


In [None]:
# Hugging Face model id loaded for narrating the campaign rows.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
# Download cache for the model weights. This is passed as `cache_dir`, which is
# handled through ordinary local-file access, so it must be the `/dbfs/...`
# mount path (the same form this notebook uses for pandas) rather than a
# `dbfs:/...` URI, which would be treated as a relative directory literally
# named "dbfs:".
CACHE_DIR = "/dbfs/tmp/hf_mistral_cache"
# Sub-folder appended to every category folder prefix when looking for
# Parquet files.
ADVERTISER_FLAG = "Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8rou;6ndxcjp-2023-05-01-2023-05-30/"

In [None]:
def list_parquet_files_in_folder(bucket_name, folder_path, s3_client=None):
    """
    List the Parquet files in a folder (prefix) in an S3 bucket.

    All result pages are followed, so a prefix holding more objects than a
    single ``list_objects_v2`` response can carry is listed completely.

    Args:
        bucket_name (str): Name of the S3 bucket.
        folder_path (str): Path to the folder (prefix) in the bucket.
        s3_client: Optional S3 client to use. When omitted, a new one is
            created with ``boto3.client('s3')``.

    Returns:
        list: Keys (paths) of the objects under the prefix whose key ends
        with '.parquet', in the order S3 returns them. Empty when there are
        none.
    """
    s3 = s3_client if s3_client is not None else boto3.client('s3')

    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=folder_path)

    # A page without 'Contents' (nothing matched) simply contributes nothing.
    return [
        obj['Key']
        for page in pages
        for obj in page.get('Contents', [])
        if obj['Key'].endswith('.parquet')
    ]


In [None]:
# Tokenizer for the configured model id.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Causal-LM weights for the same model id, cached under CACHE_DIR and placed
# on devices via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    cache_dir=CACHE_DIR,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Text-generation pipeline around the already-loaded model and tokenizer.
# Generation settings are given here because this is the object that actually
# runs generation:
#   - max_new_tokens caps the length of each completion;
#   - temperature only has an effect when sampling is enabled, hence
#     do_sample=True (note: this makes outputs non-deterministic);
#   - pad_token_id is set explicitly to the EOS token, the value the
#     per-call "Setting `pad_token_id` to `eos_token_id`" message reports
#     falling back to anyway.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.2,
    pad_token_id=tokenizer.eos_token_id,
)

# Wrap the pipeline so it can be used as a LangChain LLM. `model_kwargs` is
# intentionally not passed: for a pipeline that is already constructed those
# values are not forwarded to generation, so the temperature / max_length
# previously given there had no effect.
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Prompt text sent to the LLM. `{row}` is the single placeholder; it is filled
# with one row sampled from a Parquet file when the chain is invoked.
template = """
You are an analyst reviewing data insights in the ad-tech domain. You will be provided with data columns and their corresponding values, representing various metrics and attributes from an advertising campaign. Your task is to narrate the insights derived from this data. You will be provided one row of the dataframe.

Row : {row}

Please narrate the insights for each relevant data point, providing analysis and interpretations based on the provided values.
"""

In [None]:
# Bind the template's single `row` variable, then pair the prompt with the LLM.
prompt = PromptTemplate(
    input_variables=["row"],
    template=template,
)
chain = LLMChain(
    llm=llm,
    prompt=prompt,
)

In [None]:
import pandas as pd

# Empty results table; one row is added per category folder that is processed.
log_columns = [
    "Advertisement_Flag",
    "Folder_name",
    "URI",
    "Parquet_contents",
    "LLM_Response",
]
log = pd.DataFrame(columns=log_columns)



In [None]:
# For every category folder: find the Parquet files under the advertiser
# sub-folder, take the first row of the first file, ask the LLM to narrate it,
# and record the outcome. Records are collected in a list and `log` is rebuilt
# from them in one go, so re-running this cell replaces the results instead of
# appending duplicate rows.
log_records = []
for folder_prefix, folder_name in zip(folders_path, folders_in_folder):
    PATH = f"{folder_prefix}{ADVERTISER_FLAG}"
    parquet_files_in_folder = list_parquet_files_in_folder(bucket_name, PATH)

    # Start from the "nothing found" record and overwrite fields as data
    # becomes available.
    record = {
        'Advertisement_Flag': ADVERTISER_FLAG,
        'Folder_name': folder_name,
        'URI': PATH,
        'Parquet_contents': 'NULL',
        'LLM_Response': 'parquet file not found!',
    }

    if parquet_files_in_folder:
        # Only the first Parquet file of the folder is sampled.
        s3uri = f"s3://{bucket_name}/{parquet_files_in_folder[0]}"
        record['URI'] = s3uri

        row = spark.read.parquet(s3uri).first()
        if row is None:
            # first() yields None for a file with no rows; don't send that
            # to the LLM as if it were data.
            record['LLM_Response'] = 'parquet file is empty!'
        else:
            response = chain.invoke({'row': row})
            record['Parquet_contents'] = str(row)
            record['LLM_Response'] = str(response['text'])

    log_records.append(record)

log = pd.DataFrame(
    log_records,
    columns=['Advertisement_Flag', 'Folder_name', 'URI', 'Parquet_contents', 'LLM_Response'],
)
  

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
# Preview the first rows of the results table.
log.head()

Unnamed: 0,Advertisement_Flag,Folder_name,URI,Parquet_contents,LLM_Response
0,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,City,Campaign_Snapshot_Canada/Test_Canada_Campaign_...,,parquet file not found!
1,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Creative,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,Row(creative_name='CA_TENA_DISPLAY_INCO_CA_APE...,\nInsights:\n\n1. The creative name 'CA_TENA_D...
2,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Demostats,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(catdesc='Households by Maintainer Age', de...",\nInsights:\n\n1. The given data represents th...
3,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Eshopper,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(Category='Online Product Purchase', Descri...",\nInsights:\n1. The given row represents an ad...
4,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Impressions_count,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(advertiser='CA - Tena c.o Starcom Canada',...",\nInsights:\n\n1. Impression count display: Th...


In [None]:
# Preview the last rows of the results table.
log.tail()

Unnamed: 0,Advertisement_Flag,Folder_name,URI,Parquet_contents,LLM_Response
6,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Prizm,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(name='Indieville', prizmdescriptor='Younge...",\nInsights:\n\n1. The name of the segment is '...
7,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Region,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(region='Newfoundland and Labrador', impres...",\nInsights:\n\n1. Region: The advertising camp...
8,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Strategy,Campaign_Snapshot_Canada/Test_Canada_Campaign_...,,parquet file not found!
9,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Tactic,Campaign_Snapshot_Canada/Test_Canada_Campaign_...,,parquet file not found!
10,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Temporal,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(date1='2023-05-28', time_of_day='09', day_...",\nInsights:\n1. The date of this data point is...


In [None]:
import pandas as pd

# Write the results collected in `log` to the CSV before reading it back.
# Without the write, this cell depends on a file left behind by some earlier
# session and can show stale results that do not match `log`.
RESULTS_CSV = '/dbfs/tmp/s3_insights.csv'
log.to_csv(RESULTS_CSV)

# The index written by to_csv comes back as the "Unnamed: 0" column.
d = pd.read_csv(RESULTS_CSV)



In [None]:
# Display the results table that was loaded from the CSV.
d

Unnamed: 0,Advertisement_Flag,Folder_name,URI,Parquet_contents,LLM_Response
0,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,City,Campaign_Snapshot_Canada/Test_Canada_Campaign_...,,parquet file not found!
1,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Creative,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,Row(creative_name='CA_TENA_DISPLAY_INCO_CA_APE...,\nInsights:\n\n1. The creative name 'CA_TENA_D...
2,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Demostats,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(catdesc='Households by Maintainer Age', de...",\nInsights:\n\n1. The given data represents th...
3,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Eshopper,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(Category='Online Product Purchase', Descri...",\nInsights:\n1. The given row represents an ad...
4,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Impressions_count,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(advertiser='CA - Tena c.o Starcom Canada',...",\nInsights:\n\n1. Impression count display: Th...
5,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Numeris,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(Category='Personal Care - Cosmetics', Desc...",\nInsights:\n1. The given row represents data ...
6,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Prizm,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(name='Indieville', prizmdescriptor='Younge...",\nInsights:\n\n1. The name of the segment is '...
7,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Region,s3://analyst-adhoc/Campaign_Snapshot_Canada/Te...,"Row(region='Newfoundland and Labrador', impres...",\nInsights:\n\n1. Region: The advertising camp...
8,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Strategy,Campaign_Snapshot_Canada/Test_Canada_Campaign_...,,parquet file not found!
9,Advertiser_flag=dzx5gjn-72lfq2o;jyptqpl;byl8ro...,Tactic,Campaign_Snapshot_Canada/Test_Canada_Campaign_...,,parquet file not found!


In [None]:
# Show the folder name and the LLM narration stored under index label 1.
for column in ("Folder_name", "LLM_Response"):
    print(d.loc[1, column])

Creative

Insights:

1. The creative name 'CA_TENA_DISPLAY_INCO_CA_APEX Programmatic Display Video-CA-TENA-DD-INCO-Men-M45+-PRO-PROOX-PD-DCPM-TENA-Men-Acceptors-StandardBanner-Prop-EN-160x600-New-GLD000KJGC_230228_DCM_360691291' suggests that this is a programmatic display video ad campaign for TENA, targeting men aged 45 and above, with a creative size of 160x600 pixels. The creative name also indicates that this is a new campaign, as denoted by the 'New' keyword.

2. The 'impressions_display' metric indicates that this creative has been displayed 7 times. This means that the ad has been shown to potential viewers 7 times, but it does not necessarily mean that 7 unique individuals have seen the ad.

3. The 'clicks' metric is 0, indicating that no clicks have been recorded for this creative yet. This could mean that the ad is not performing well or that it has not been shown to enough people yet to generate any clicks.

4. The 'total_conversions' metric is also 0, suggesting that no co

In [None]:
# Show the folder name and the LLM narration stored under index label 2.
for column in ("Folder_name", "LLM_Response"):
    print(d.loc[2, column])

Demostats

Insights:

1. The given data represents the campaign metric for 'Households by Maintainer Age' with a specific description 'Maintainers 25 To 34'. This indicates that the campaign is targeting households where the age of the maintainer falls between 25 and 34.

2. The 'impressions_display' value is 9. This metric represents the number of times the ad was displayed to users. In this case, the ad was displayed 9 times in total for this specific target group.

3. The 'clicks' value is 0. This metric indicates the number of clicks the ad received from users in this target group. In this case, no clicks were recorded for the 'Maintainers 25 To 34' age group.

4. The 'total_conversions' value is also 0. This metric represents the number of conversions, which is a desired action taken by the user after clicking on the ad. In this case, no conversions were recorded for the 'Maintainers 25 To 34' age group.

5. Based on the provided data, it appears that the ad campaign is not perfor