## Import Modules and Setup Environment

In [1]:
import os
from tqdm.notebook import tqdm

import polars as pl
from argmap.dataModel import Summary, Comments, Topics, HierarchicalTopics, Arguments, ArgumentCommentMap, DataModel

from dotenv import load_dotenv
# Load variables from a .env file (if any) before reading the environment below.
load_dotenv()

# A shared string cache lets categorical data from different sources be combined
# and handled gracefully; the performance cost is acceptable for this notebook.
pl.enable_string_cache()

# Identifier of the embedding model; None when the variable is not set.
EMBED_MODEL_ID = os.environ.get("EMBED_MODEL_ID")

In [2]:
import guidance
from guidance import select, instruction, user, assistant

from argmap.guidance import generate_line, generate_phrase

## Load Dataset

In [3]:
from IPython.display import display_markdown

# Dataset to analyse. Alternatives that have been used with this notebook:
DATASET = "american-assembly.bowling-green"
# "scoop-hivemind.biodiversity"
# "scoop-hivemind.freshwater"
# "scoop-hivemind.taxes"
# "scoop-hivemind.ubi"
# "scoop-hivemind.affordable-housing"
# "london.youth.policing"
# "canadian-electoral-reform"
# "brexit-consensus"
# "ssis.land-bank-farmland.2rumnecbeh.2021-08-01"

# Summary is constructed directly; the other tables are read back from parquet.
summary = Summary(DATASET)
comments = Comments(DATASET).load_from_parquet()
topics = Topics(DATASET).load_from_parquet()
hierarchicalTopics = HierarchicalTopics(DATASET).load_from_parquet()

# Render a short markdown header describing the selected conversation.
display_markdown(
    "\n".join([
        "",
        f"### Dataset: {DATASET}",
        f"### {summary.topic}",
        f"### {summary.get('conversation-description')}",
        f"### Full Report: [{summary.url}]({summary.url})",
        "",
    ]),
    raw=True,
)


### Dataset: american-assembly.bowling-green
### Improving Bowling Green / Warren County
### What do you believe should change in Bowling Green/Warren County in order to make it a better place to live, work and spend time?
### Full Report: [https://pol.is/9wtchdmmun](https://pol.is/9wtchdmmun)


## Initialize Language Model

In [4]:
import os
from argmap.helpers import getTorchDeviceVersion, loadLanguageModel

# Model settings come from the environment; only MODEL_ID is mandatory here,
# the other two are passed through as read (None when unset).
MODEL_ID = os.environ.get("MODEL_ID")
MODEL_REVISION = os.environ.get("MODEL_REVISION")
CUDA_MINIMUM_MEMORY_GB = os.environ.get("CUDA_MINIMUM_MEMORY_GB")

# Fail early with a clear message when no model has been configured.
if MODEL_ID is None:
    raise Exception("MODEL_ID environment variable is required.")

# Report the runtime environment before loading the model.
print(getTorchDeviceVersion())

languageModel = loadLanguageModel(MODEL_ID, MODEL_REVISION, CUDA_MINIMUM_MEMORY_GB)

Device: NVIDIA H100 PCIe
Python: 3.11.7 | packaged by conda-forge | (main, Dec 23 2023, 14:43:09) [GCC 12.3.0]
PyTorch: 2.2.1
CUDA: 12.1
CUDNN: 8902

Initializing language model: TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ gptq-4bit-32g-actorder_True...
Language model initialized.
CUDA Memory: 52.0 GB free, 26.2 GB allocated, 79.1 GB total


## Generate Arguments

### Prompt Guidance for Argument Generation

Here, we only feed "agreeable comments" to the language model context to aid generation. Agreeable comments are based on an `agreeability` factor defined as:

$$
agreeability = \frac{likes}{likes + dislikes}
$$

We only consider comments above an agreeability threshold of 0.5 as agreeable comments. These comments have more agree votes than disagree votes.

In [5]:
@guidance
def generate_topic_arguments(lm, summary, topic, agreeableComments, argumentCount, arguments, temperature=0, progress_bar=None):
    """Generate arguments for a single topic and record them in `arguments`.

    The model is first asked to list `argumentCount` areas of improvement found
    in the statements. For each area it then generates the problems, actionable
    solutions, a one-sentence argument and a short label.

    Parameters:
        lm: guidance model state the prompt is appended to.
        summary: object whose `get('conversation-description')` provides the discussion question.
        topic: mapping with the keys 'Title', 'Representation' (iterable of keywords) and 'Topic' (the topic id).
        agreeableComments: dataframe with 'commentId', 'commentText', 'agrees' and 'disagrees' columns.
        argumentCount: number of areas of improvement (and therefore arguments) to generate.
        arguments: collector whose `addRow` receives one row per generated argument.
        temperature: sampling temperature forwarded to every generation call.
        progress_bar: optional progress bar; `update()` is called once per argument.
            When one is supplied, model echo is switched off.

    Returns:
        The model state after all generations.
    """

    # Compare against None rather than relying on truthiness: a tqdm bar is falsy
    # when its total is 0 and raises TypeError when it has neither total nor iterable.
    if progress_bar is not None:
        lm.echo = False

    with instruction():
        lm += f"""\
        First, identify unique areas of improvement from the statements.
        Then for each area, list the associated problems and the requested action from the statements. If no ACTIONABLE SOLUTIONS are present, output None.
        Using the problem, proposed action, and the statements, create unique, short, and convincing one-sentence arguments that urges the leaders for change or improvement.
        Avoid repetitive phrases.

        DISCUSSION QUESTION: {summary.get('conversation-description')}

        """

        lm += f"""\
        TOPIC: {topic['Title']}
        KEYWORDS: {', '.join(topic['Representation'])}

        STATEMENTS:
        """

        for commentId, commentText, agrees, disagrees in agreeableComments.select('commentId', 'commentText', 'agrees', 'disagrees').iter_rows():
            totalVotes = agrees + disagrees
            if totalVotes > 0:
                lm += f"{commentId}. {commentText} ({int(agrees * 100 / totalVotes)}% voters agree)\n"
            else:
                # A statement without votes has no meaningful percentage; list it
                # without one instead of failing with ZeroDivisionError.
                lm += f"{commentId}. {commentText}\n"

        lm += f"""
        ---
        PROBLEMS IDENTIFIED: <comma-separated list of problems from the statements>
        ACTIONABLE SOLUTIONS: <comma-separated list of actionable solution from the statements, if any>
        ARGUMENT: <make a compelling argument in one sentence that urges the need for action>
        ARGUMENT LABEL: <short three word label that describes the argument>
        """

    with user():
        lm += f"List the {argumentCount} most important areas of improvements from these statements, each on a new line.\n"

    # first pass: collect the areas of improvement, one generated line each
    areasOfImprovement = []
    with assistant():
        for i in range(argumentCount):
            lm += "- " + generate_line('areaOfImprovement', temperature) + "\n"
            areasOfImprovement.append(lm['areaOfImprovement'])

    # second pass: for each area of improvement, generate one argument
    for argumentId, area in enumerate(areasOfImprovement):
        with user():
            lm += f"""\
            AREA OF IMPROVEMENT: {area}
            """

        with assistant():
            lm += f"""\
            PROBLEMS IDENTIFIED: {generate_phrase('problem', temperature, 100)}
            ACTIONABLE SOLUTIONS: {generate_phrase('solution', temperature, 100)}
            ARGUMENT: {generate_line('argument', temperature, 100)}
            ARGUMENT LABEL: {generate_phrase('argumentTitle', temperature, 20)}
            """

            # keep the intermediate reasoning alongside the final argument
            arguments.addRow({
                'topicId': topic['Topic'],
                'argumentId': argumentId,
                'argumentTitle': lm['argumentTitle'],
                'argumentContent': lm['argument'],
                'thoughts': [[
                    f"AREA: {area}",
                    f"PROBLEMS: {lm['problem']}",
                    f"SOLUTIONS: {lm['solution']}"
                ]]
            })

            if progress_bar is not None:
                progress_bar.update()

    return lm

In [6]:
import math

# NOTE(review): the markdown above describes an agreeability threshold of 0.5,
# but this run keeps every comment with agreeability above 0 — confirm which
# value is intended.
agreeabilityThreshold = 0

topicId = 7

arguments = Arguments(DATASET).initialize()

# comments of the chosen topic that pass the threshold, most agreeable first
agreeableComments = comments.df.filter(pl.col('topicId') == topicId, pl.col('agreeability') > agreeabilityThreshold).sort('agreeability', descending=True)

# math.log(0) would fail with an opaque "math domain error"; report the real cause instead.
if len(agreeableComments) == 0:
    raise ValueError(f"No comments in topic {topicId} have agreeability above {agreeabilityThreshold}.")

args = {
    'summary': summary,
    'topic': topics.get(topicId),
    'agreeableComments': agreeableComments,
    # The number of arguments grows logarithmically with the number of comments.
    # A single comment gives log(1) == 0, so clamp to at least one argument.
    'argumentCount': max(1, math.ceil(math.log(len(agreeableComments)) * 2)),
    'arguments': arguments,
}

lm = languageModel + generate_topic_arguments(**args)

## Determine precise relationships between statements and arguments

In [7]:
# TODO: use different prompts to experimentally calculate the usefulness of these relationships
# embeddings provide a baseline for similarity
# how to determine what relationships are useful? diversity?
# most relationships indicate support. Is that because the comments are generally supportive?
# what would change if the options are listed in a different order?

# The support / refute / undercut relationships in the previous run do not appear to be reflected accurately in some cases. In the following attempt, we ask the language model to visit each argument and explain which comment it supports and why. We run this individually in a fresh context to limit any bias from unrelated text.

# In the following generation, the initial content and arguments are placed in the overall context. Then each comment is evaluated in its own context that includes a copy of the initial text. This context is reset after evaluating each comment and not carried over to the next comment.

### Prompt Guidance to determine relationship

In [8]:
# @guidance
# def generate_argument_relation(lm, discussionTitle, discussionQuestion, topicId, topic, comments, topicArguments, argumentCommentMap, context_reset=False, temperature=0, echo=True, progress_bar=None):
#     lm.echo = echo

#     with instruction():
#         lm += f"""\
#         The following set of statements is from an online discussion about {discussionTitle} conducted on a discussion platform Polis.
#         There are total of {len(topicArguments)} arguments and {len(comments)} statements in this dataset.
#         For each of the following statements, select an argument that is most relevant to the comment.
#         Then express your thoughts about which relationships might apply: SUPPORT, REFUTE, UNRELATED. If a relationship does not apply, simply state that.
#         Then finally pick the most likely relationship between the statement and the argument: "SUPPORT" or "REFUTE".
#         If the statement and argument are not related, state "UNRELATED".
#         """
#         # For each given statement, identify the most relevant argument. Then identify the relationship between the argument and the statement.

#     with user():
#         lm += f"""\
#         Discussion Subject: {discussionQuestion}
#         Topic: {topic['CustomName']}
#         Topic Keywords: {', '.join(topic['Representation'])}

#         ARGUMENTS:
#         """

#         for argument, title, content in topicArguments.select('argumentId', 'argumentTitle', 'argumentContent').iter_rows():
#             lm += f"""
#             Argument {argument}: {title}
#             {content}
#             """

#     for commentRow in comments.select('commentId', 'commentText', 'agrees', 'disagrees').iter_rows(named=True):
#         if context_reset:
#             lm + generate_argument_relation_reasoning(commentRow, topicId, topicArguments, argumentCommentMap, temperature=temperature, echo=echo)
#         else:
#             lm = lm + generate_argument_relation_reasoning(commentRow, topicId, topicArguments, argumentCommentMap, temperature=temperature, echo=echo)
#         if progress_bar:
#             progress_bar.update(1)

#     return lm

In [9]:
# Unused exploration, kept for reference: per-topic comment and argument counts
# combined with a dot product. NOTE(review): `np` is not imported in the cells
# shown here, so these lines would need `import numpy as np` to run.
# commentCount = comments.df.filter(pl.col('moderated') == 1).get_column('topicId').value_counts().sort('topicId').get_column('count').to_numpy()
# argCount = arguments.df.get_column('topicId').value_counts().sort('topicId').get_column('count').to_numpy()
# np.dot(commentCount, argCount)

# Rebind `arguments` to the set loaded from parquet (this replaces the object
# created by the earlier generation cell) and print a preview of it.
arguments = Arguments(DATASET).load_from_parquet()
arguments.glimpse()

Rows: 102
Columns: 5
$ topicId               <u16> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ argumentId            <u16> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
$ argumentTitle         <str> 'Occupational Tax for Homeless Support', 'Affordable Housing and Property Ownership', 'Strong Landlord-Tenant Laws', 'Term Limits for Fair Representation', 'Police Presence in County Areas', 'Grocery Store Competition', 'Tenant Support in Downtown District', 'Diversity in City/County Government', 'Poverty Reduction and Elderly Support', 'Rental Assistance and Affordability'
$ argumentContent       <str> 'Implement a small occupational tax on businesses to fund temporary housing and support services for the homeless, creating a safer and more prosperous community for all.', 'Increase affordable housing options and enforce stronger landlord/tenant laws to preserve neighborhoods and promote responsible property ownership.', 'Establish stronger tenant and renter rights, enforce leash laws, and encourage property maintenance

In [15]:
@guidance
def guidance_topic_correlate(lm, topicId, topicComments, topicArguments, argumentCommentMap, thought=False, reason=False, context_reset=True, temperature=0, progress_bar=None):
    """Classify how every statement of a topic relates to every argument of that topic.

    Adds the task instructions to the model, then hands each row of
    `topicArguments` to `guidance_argument_correlate`, which evaluates it against
    all of `topicComments`.

    Parameters:
        lm: guidance model state the prompt is appended to.
        topicId: id of the topic, forwarded to the per-argument step.
        topicComments: dataframe of statements, forwarded unchanged.
        topicArguments: dataframe with 'argumentId', 'argumentTitle' and 'argumentContent' columns.
        argumentCommentMap: collector forwarded to the per-argument step.
        thought: forwarded; when True a THOUGHT line is generated before the relationship.
        reason: forwarded; when True a REASON line is generated after the relationship.
        context_reset: when True each argument is evaluated on top of the instructions
            only and the resulting state is discarded; when False the evaluations
            accumulate in the returned model state.
        temperature: sampling temperature, forwarded.
        progress_bar: optional progress bar, forwarded; when supplied, model echo is switched off.

    Returns:
        The model state: instructions only when `context_reset` is True,
        instructions plus every evaluation otherwise.
    """

    if progress_bar is not None:
        lm.echo = False

    with instruction():
        lm += f"""\
        You will be presented a statement and an argument. Statement is a user-generated comment from a discussion. Argument is an actionable solution.

        TASK: Determine whether the statement supports, refutes, or is unrelated to the argument.
        SUPPORT: The argument is consistent with the statement. A person who agrees with the statement will definitely support the argument.
        REFUTE: The argument goes against the statement. A person who agrees with the statement will definitely oppose the argument.
        UNRELATED: The statement and argument are not directly related. Implementing the argument will not directly address the underlying issue.

        ---
        OUTPUT FORMAT
        THOUGHT: Deliberate on how strongly a person who agrees with the statement will support the argument.
        RELATIONSHIP: One of the following: SUPPORT, REFUTE, UNRELATED
        REASON: Provide a reason for your choice.
        """

    # settings that are identical for every argument
    sharedArgs = {
        'topicId': topicId,
        'topicComments': topicComments,
        'argumentCommentMap': argumentCommentMap,
        'thought': thought,
        'reason': reason,
        'temperature': temperature,
        'context_reset': context_reset,
        'progress_bar': progress_bar,
    }

    # iterate over each argument
    for argumentId, argumentTitle, argumentContent in topicArguments.select('argumentId', 'argumentTitle', 'argumentContent').iter_rows():
        args = {
            **sharedArgs,
            'argumentId': argumentId,
            'argumentTitle': argumentTitle,
            'argumentContent': argumentContent,
        }
        if context_reset:
            # evaluate on a throwaway copy so the next argument starts from the instructions alone
            lm + guidance_argument_correlate(**args)
        else:
            lm += guidance_argument_correlate(**args)

    return lm


@guidance
def guidance_argument_correlate(lm, topicId, argumentId, argumentTitle, argumentContent, topicComments, argumentCommentMap, thought=False, reason=False, temperature=0, context_reset=True, progress_bar=None):
    """Present one argument, then evaluate every statement in `topicComments` against it.

    Each statement is handed to `guidance_comment_correlate`. With `context_reset`
    set, the evaluation runs on top of the argument prompt and its resulting state
    is dropped; otherwise it is appended to `lm`. `progress_bar`, when given, is
    advanced once per statement. Returns the resulting model state.
    """
    with user():
        lm += f"""\
        ARGUMENT {argumentId}: {argumentTitle}
        {argumentContent}
        """

    # keyword arguments that do not change from one statement to the next
    fixedKwargs = dict(
        topicId=topicId,
        argumentId=argumentId,
        argumentCommentMap=argumentCommentMap,
        thought=thought,
        reason=reason,
        temperature=temperature,
    )

    # walk through the statements one by one
    statementRows = topicComments.select('commentId', 'commentText').iter_rows()
    for commentId, commentText in statementRows:
        evaluation = guidance_comment_correlate(commentId=commentId, commentText=commentText, **fixedKwargs)

        if not context_reset:
            lm += evaluation
        else:
            # run the evaluation but discard the resulting state
            lm + evaluation

        if progress_bar is not None:
            progress_bar.update()

    return lm


@guidance
def guidance_comment_correlate(lm, topicId, argumentId, commentId, commentText, argumentCommentMap, thought=False, reason=False, temperature=0):
    """Classify one statement against the argument already present in the context.

    Optionally generates a THOUGHT line, then selects SUPPORT, REFUTE or UNRELATED,
    then optionally generates a REASON line (skipped for UNRELATED). The outcome is
    stored as one row in `argumentCommentMap`. Returns the resulting model state.
    """
    with user():
        lm += f"""\
        STATEMENT {commentId}: {commentText}
        """

    with assistant():
        # free-text lines produced around the relationship, in generation order
        notes = []

        if thought:
            lm += f"THOUGHT: {generate_line('thought', temperature, 100)}\n"
            notes.append(f"THOUGHT: {lm['thought']}")

        # constrain the answer to exactly one of the three labels
        lm += f"RELATIONSHIP: {select(['SUPPORT', 'REFUTE', 'UNRELATED'], name='relationship')}\n"
        verdict = lm['relationship']

        # a reason is only requested when a relationship was found
        if reason and verdict != 'UNRELATED':
            lm += f"REASON: {generate_line('reasoning', temperature, 100)}"
            notes.append(f"REASON: {lm['reasoning']}")

        row = {
            'commentId': commentId,
            'topicId': topicId,
            'argumentId': argumentId,
            'relationship': verdict,
            'reasoning': [notes],
        }
        argumentCommentMap.addRow(row)

    return lm

In [16]:
argumentCommentMap = ArgumentCommentMap(DATASET).initialize()

topicId = 7

# limit this run to one topic and, within it, to argument 6
topicComments = comments.df.filter(pl.col('topicId') == topicId)
topicArguments = arguments.df.filter(pl.col('topicId') == topicId, pl.col('argumentId') == 6)

# keep the context across statements and ask for both a thought and a reason
args = dict(
    topicId=topicId,
    topicComments=topicComments,
    topicArguments=topicArguments,
    argumentCommentMap=argumentCommentMap,
    context_reset=False,
    thought=True,
    reason=True,
)

languageModel + guidance_topic_correlate(**args)