In [1]:
import os
import re
import yaml
import json
import ast
import praw
from crewai import Agent, Task, Crew, Process
from langchain_openai import ChatOpenAI

from textwrap import dedent
from reddit_helper import *    

from langchain.llms import OpenAI, Ollama
from langchain_openai import ChatOpenAI

from pydantic import BaseModel

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


In [2]:
api_file_path = 'api_keys.json'
with open(api_file_path, 'r') as file:
    api_keys = json.load(file)        
openai_gpt35 = ChatOpenAI(model_name="gpt-3.5-turbo", 
                                temperature=0.7,
                                openai_api_key=api_keys['openai'])
openai_gpt4 = ChatOpenAI(model_name="gpt-4", 
                                temperature=0.7,
                                openai_api_key=api_keys['openai'])        
llama31_8b = Ollama(model='llama3.1:8b',)
gemma2_9b = Ollama(model='gemma2:9b',)

In [48]:
cfg_file_path = 'casaai_config.yaml'
with open(cfg_file_path, 'r') as yaml_file:
    cfg = yaml.safe_load(yaml_file)
product_long = cfg.get('product_long_description', '')      
product_short = cfg.get('product_short_description', '')
domain = cfg.get('domain')
output_format_1 = cfg.get('output_format_1')

In [49]:
print(output_format_1)

^^comment_id^^decision^^justification


In [4]:
def re_string(input_string):

    pattern = r"'comment_id':\s*'([^']*)',\s*'rank':\s*(\d+)"

    # Use re.findall to extract the values
    matches = re.findall(pattern, input_string)

    if matches:
        comment_id, rank = matches[0]
    else:
        comment_id = "na"
        rank = 0

    justification = input_string.split("justification")[-1]
    
    return comment_id, rank, justification

In [40]:
backstory = "You are a content analyst with expertise in analyzing web content and \
             extracting relevant information. \
             - You should analyze the comment and see if it is relevant to {domain} \
             - You should identify whether comment is a question or seeking suggestion/opinion/guidance or \
               asking for advise that comes under scope of {product_short}. \
            "
goal = "Analyze the content and identify whether it is relevant or not"
content_analysis_agent = Agent(
                            role="Content Analyst",
                            goal=goal,
                            backstory=backstory,
                            allow_delegation=False,
                            verbose=False,
                            llm=gemma2_9b,
                            )

In [41]:
class DecisionOutput(BaseModel):
    comment_id: str
    decision: str
    justification: str

In [42]:
tip_text = "If you do your BEST WORK, I'll give you a $100 commission!"

In [58]:
descr = '''{comment_id} is the id of the comment that you need to provide decision and justification. You will get the data in {input_data}.
          You should: 
            - Analyze this and determine relevance of {comment_id} with respect to {domain} 
            - You should check whether comment is seeking information/guidance/advise or sharing an opinion or asking a question that 
             comes under the scope of {product_short}. Such comments should be identified as relevant.
              Example : "In some situations, i agree with you. Emptiness is better than extra things in the room."
              But i think in this case, it would be better to have something on the wall.
            - Bot messages that has URLs are irrelevant
              Example : "Looks like the Crate and Barrel Cortez Natural Floating Dresser.
              https://www.crateandbarrel.com/cortez-natural-floating-dresser/s501378"
            - Short messages that are not questions or requests or opinions(less than 15 words) are irrelevant
            - Merely expressing positive sentiment, interest or appreciations alone without any follow-up questions or requests are irrelevant
              Example : "THATS WILD!!!!!"
              Example : "Pairing the electric blue and maroon together is an absolute power move, and you have my respect."
              Example : "Nice tannoys! I've wish they were more available in the states."
              Example : "Your style is DELICIOUS!!!!!!!!"
            - Mere statements without any follow-up questions or requests are irrelevant
              Example-1 : "The overhead/pendulum lighting looks perfect."
              Example-2 : "this is the dream"
            - Offensive messages or messages with foul language are irrelevant
            - If relevant populate 'relevant' against 'decision' key of json output, else pass 'discard'. \
              {tip_text}
         '''

expected_out = "comment id, decision and a brief justification (less than 15 words) \
                explaining the rationale behind the decision" 
        
content_analysis_task = Task(
                            description=descr,
                            expected_output=expected_out,
                            output_json=DecisionOutput,
                            agent=content_analysis_agent,
                            )

In [59]:
response_creation_crew = Crew(
    agents=[content_analysis_agent,],
    tasks=[content_analysis_task,],
    verbose=False,
)



In [12]:
reddit_posts, reddit_post_ids = fetch_reddit_test()
condensed_reddit_data, unique_post_ids, unique_comment_ids = condense_data(reddit_posts, reddit_post_ids)

post_cnt: 0 + 10 = 10
comm_cnt: 0 + 560 = 560
cond_cnt: 570 = 10 + 560
Cross_ck: 10 = 10


In [31]:
comment_dict = {}
tot_data = 0
for idx1, item in enumerate(condensed_reddit_data):
    for idx2, data in enumerate(item):
        tot_data += 1
        c_id = data['comment_id']
        p_id = data['parent_id']
        text = data['text']
        comment_dict[c_id] = {'comment_id': c_id, 'parent_id': p_id, 'text': text}
    print(idx1, ':', idx2)
comment_dict_list = list(comment_dict.keys())
print(len(comment_dict), '=', tot_data, len(comment_dict_list) )

0 : 169
1 : 108
2 : 31
3 : 57
4 : 0
5 : 33
6 : 67
7 : 60
8 : 1
9 : 34
570 = 570 570


In [14]:
def get_data_details(comm_id):
    done = 0
    data_details = []
    while done==0:
        if comm_id in comment_dict:
            details = comment_dict[comm_id]
            data_details.append(details)
            comm_id = details['parent_id']
        else:
            done = 1
    return data_details

In [None]:
decision_lst = []
for idx, reddit_data_item in enumerate(condensed_reddit_data):
    comment_lst = []
    for idx2, comment_data in enumerate(reddit_data_item):        
        comm_id = comment_data['comment_id']  
        comment_text = comment_data['text']
        print(f'STARTING {idx} - {idx2} - {comm_id}')   
        input_dict = {"comment_id": comm_id,
                      "input_data": comment_text,
                      "product_long": product_long,
                      "product_short": product_short,
                      "domain": domain,
                      "output_format":output_format_1, 
                      "tip_text":tip_text}
        decision_result = response_creation_crew.kickoff(inputs=input_dict)
        try:
            json_out = ast.literal_eval(decision_result.json)
        except:
            print('Exception in json - trying re')
            _, d, j = re_string(decision_result.raw)            
            json_out = {'comm_id': comm_id, 'decision': str(d), 'justification': str(j)}
        comment_lst.append(json_out)
    decision_lst.append(comment_lst)

STARTING 0 - 0 - kgesxn
[93m Pydantic validation error: 1 validation error for DecisionOutput
comment_id
  Field required [type=missing, input_value={'comment id': 'kgesxn', ...not design discussion.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing. Attempting to handle partial JSON.[00m
[93m Pydantic validation error: 1 validation error for DecisionOutput
comment_id
  Field required [type=missing, input_value={'comment id': 'kgesxn', ...not design discussion.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing. The JSON structure doesn't match the expected model. Attempting alternative conversion method.[00m
STARTING 0 - 1 - ggfdsvg
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handle partial JSON.[00m
STARTING 0 - 2 - ggfh2zn
STARTING 0 - 3 - ggepn0p
STARTING 0 - 4 - ggfd5lx
[93m Pydantic validation error: 1 validation error for DecisionOutput
comment_

In [46]:
len(decision_lst), len(decision_lst[0]), decision_lst[0][0]

(10,
 170,
 {'comment_id': 'kgesxn',
  'decision': 'discard',
  'justification': 'Sharing an accomplishment, not seeking advice.'})

In [47]:
analysis_lst = []
bad_list = []
for item in decision_lst:
    tot = 0
    rel = 0
    disc = 0   
    bad_cnt = 0
    sep = "^"
    for comment in item:
        tot += 1
        try:
            if comment['decision'] == 'relevant':
                rel += 1
            else:
                disc += 1
            comm_key = list(comment.keys())[0]
            comm_id  = comment[comm_key].strip()
            if comm_id in comment_dict_list:
                parent_id = comment_dict[comm_id]["parent_id"]
                if parent_id is None:
                    parent_id = 'NA'
                decision = comment['decision']
                justify  = comment["justification"]
                comment_text = comment_dict[comm_id]["text"]
                text_str = sep + comm_id + sep + parent_id + sep + decision + sep + justify + sep + comment_text + sep
                analysis_lst.append(text_str)           
            else: 
                print(f'Comment ID not found : {comm_id}')
                bad_cnt += 1
                bad_list.append(comm_id)
        except:
            print(f'decision key error : {comment}')
            bad_cnt += 1
    print(f'tot : {tot}, rel : {rel}, discarded : {disc}, bad_cnt : {bad_cnt} analysis : {len(analysis_lst)}')

Comment ID not found : str
Comment ID not found : str
Comment ID not found : ggggb0gv
Comment ID not found : ggg8fu
Comment ID not found : ggggp581
Comment ID not found : ggggbtyv
Comment ID not found : str
Comment ID not found : ggy9lj
Comment ID not found : gggglvit
Comment ID not found : ggggx92
Comment ID not found : gggmm3zh
tot : 170, rel : 72, discarded : 98, bad_cnt : 11 analysis : 159
tot : 109, rel : 60, discarded : 49, bad_cnt : 0 analysis : 268
tot : 32, rel : 14, discarded : 18, bad_cnt : 0 analysis : 300
decision key error : {'fxw7yxc': {'decision': 'relevant', 'justification': 'Asks for AI-generated interior design suggestions.'}}
tot : 58, rel : 29, discarded : 28, bad_cnt : 1 analysis : 357
tot : 1, rel : 0, discarded : 1, bad_cnt : 0 analysis : 358
Comment ID not found : krct1qh
Comment ID not found : krcylty
tot : 34, rel : 20, discarded : 14, bad_cnt : 2 analysis : 390
tot : 68, rel : 37, discarded : 31, bad_cnt : 0 analysis : 458
tot : 61, rel : 29, discarded : 32,

In [29]:
comment_dict['gggg8fu']

KeyError: 'gggg8fu'

In [25]:
comment_dict['ggfqz4a']['parent_id']

'kgesxn'

In [97]:
with open('decision_result_v4_0820.txt', "w") as file:
    # Iterate through the list and write each string to the file
    for item in analysis_lst:
        file.write(item + "\n")  # Add a newline character after each string

print(f"List has been written")

List has been written


In [98]:
with open('decision_result_v4_0820.txt', "r") as file:
    # Read all lines and strip the newline character from each line
    analysis_lst_loaded_back = [line.strip() for line in file.readlines()]
print(len(analysis_lst_loaded_back))

739


In [34]:
# For each post-comment combo select the post-comments whose score >= 7

In [None]:
# Then select a threshold like 20% or top 3 whichever is bigger of comments to craft response

In [None]:
# Build response for these comments With Manager LLM + only 1 response writer + meta reviewer