## Import Packages and Setup Environment

In [None]:
from polis.data_model import Comments, Topics, HierarchicalTopics, Arguments, ArgumentCommentMap, Summary
import math
import sys
import os
from pprint import pprint
from tqdm.notebook import tqdm

import polars as pl
# NOTE(review): `Summary` and `Comments` are already imported from `polis.data_model` at the top
# of this cell; this later import rebinds both names, so the `argmap.dataModel` versions are the
# ones in effect for the rest of the notebook — confirm which package is intended.
from argmap.dataModel import DataModel, Summary, Comments

from dotenv import load_dotenv
# read variables from a .env file into the process environment
load_dotenv()

# this allows categorical data from various sources to be combined and handled gracefully; performance cost is acceptable
pl.enable_string_cache()

In [2]:
import guidance
from guidance import models, gen, select, instruction, one_or_more, zero_or_more

## Load Dataset

In [7]:
from IPython.display import display_markdown

# Dataset under analysis. Only one source is processed at a time; to switch,
# replace the value with one of the alternative identifiers listed below.
DATASET = "american-assembly.bowling-green"
# Alternatives:
#   "scoop-hivemind.biodiversity"
#   "scoop-hivemind.freshwater"
#   "scoop-hivemind.taxes"
#   "scoop-hivemind.ubi"
#   "scoop-hivemind.affordable-housing"
#   "london.youth.policing"
#   "canadian-electoral-reform"
#   "brexit-consensus"
#   "ssis.land-bank-farmland.2rumnecbeh.2021-08-01"

# Conversation summary plus the three tables loaded from parquet for this dataset.
summary = Summary(DATASET)
comments = Comments(DATASET).load_from_parquet()
topics = Topics(DATASET).load_from_parquet()
hierarchicalTopics = HierarchicalTopics(DATASET).load_from_parquet()

# Render a short markdown header describing the loaded conversation.
# The leading and trailing empty entries reproduce the blank first/last line of the header.
display_markdown(
    "\n".join([
        "",
        f"### Dataset: {DATASET}",
        f"#### {summary.topic}",
        f"#### {summary.get('conversation-description')}",
        f"#### Full Report: [{summary.url}]({summary.url})",
        f"#### Topic Count: {topics.df.height}",
        "",
    ]),
    raw=True,
)

print("Topics Dataset Overview:")
topics.glimpse()

## Grammar Specification for Text Generation

In [4]:
@guidance(stateless=True)
def generate_line(lm, name: str, max_tokens=50, temperature=0):
    """Generate text up to the end of the current line.

    Generation stops at the first newline and is captured under `name`
    with `list_append=True`.
    """
    rest_of_line = gen(
        name=name,
        stop=['\n'],
        list_append=True,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return lm + rest_of_line

@guidance(stateless=True)
def generate_phrase(lm, name: str, max_tokens=50, temperature=0):
    """Generate a short phrase.

    Generation stops at the first newline or period and is captured under
    `name` with `list_append=True`.
    """
    phrase = gen(
        name=name,
        stop=['\n', '.'],
        list_append=True,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return lm + phrase

@guidance(stateless=True)
def generate_number(lm, name: str, min: int, max: int):
    """Constrain the model to pick one integer from `min` to `max`, both inclusive.

    The choice is captured under `name` with `list_append=True`.
    """
    candidates = list(range(min, max + 1))
    return lm + select(candidates, name=name, list_append=True)

In [5]:
@guidance
def generate_topic_titles(lm, main_title, question, df, temperature=1, max_tokens=20):
    """Ask the model, completion-style, for one terse title per topic.

    A single prompt is built up: a preamble naming the discussion and its question,
    followed by one section per topic whose "Topic Title:" line the model completes.

    Args:
        lm: guidance model state to extend.
        main_title: discussion title inserted into the preamble.
        question: discussion question inserted into the preamble.
        df: frame whose rows unpack as (topic, keywords, docs); `keywords` and `docs`
            are iterables of strings that are joined into the prompt.
        temperature: sampling temperature for each title (default 1, as before).
        max_tokens: token budget for each title (default 20, as before).

    Returns:
        The model state after all topics; titles are captured as a list under
        'topic_title'.
    """
    lm += f"""\
    The following is a dataset of comments from an online discussion about {main_title}. Assign a terse title to each topic based on the given keywords. Avoid repetitive phrases.

    Question: {question}
    """

    # iter_rows() matches the later definitions in this notebook and avoids
    # materialising every row up front.
    for topic, keywords, docs in df.iter_rows():
        lm += f"""
        Topic {topic}:
        Keywords: {', '.join(keywords)}
        Example Comments: {'; '.join(docs)}
        Topic Title: """ + gen(name='topic_title', list_append=True, stop=['\n', '.'], max_tokens=max_tokens, temperature=temperature) + "\n"

    return lm

## Initialize Language Model

In [9]:
import torch

# Stop here on machines without a CUDA device, before any model is loaded.
if not torch.cuda.is_available():
    raise RuntimeError("No CUDA device found")

mistral = models.TransformersChat("mistralai/Mistral-7B-Instruct-v0.2", device_map="auto") # ~31GB RAM

# Alternative models; uncomment a line to load that model (memory figures as noted by the author).
# mistral1 = models.TransformersChat("mistralai/Mistral-7B-v0.1", device_map="auto") # ~26GB RAM
# llama = models.TransformersChat("meta-llama/Llama-2-13b-chat-hf", device_map="auto") # ~50GB RAM
# olmo = models.TransformersChat("allenai/OLMo-7B", device_map="auto") # unable to run without breaking environment
# mixtral = models.TransformersChat("TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ", device_map="auto") # ~26GB RAM
# gpt2 = models.TransformersChat("openai-community/gpt2-large", device_map="auto") # ~4GB RAM

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Generate Topic Titles

### TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ

In [13]:
# NOTE(review): `df_topics` is not assigned in any cell visible in this notebook — presumably a
# polars frame with one row per topic; confirm where it is built.
# grab topic keywords
df_topic_keywords_docs = df_topics.select('Topic', 'Representation', 'Representative_Docs')

# NOTE(review): `mixtral` is only created by a commented-out line in the "Initialize Language
# Model" cell; that line must be enabled before this cell can run.
# The three-argument call matches the completion-style `generate_topic_titles` defined above.
# instruct LLM to generate topic titles
lm = mixtral + generate_topic_titles(summary['topic'], summary['conversation-description'], df_topic_keywords_docs)

### mistralai/Mistral-7B-Instruct-v0.2

In [12]:
# NOTE(review): `df_topics` is not assigned in any cell visible in this notebook — presumably a
# polars frame with one row per topic; confirm where it is built.
# grab topic keywords
df_topic_keywords_docs = df_topics.select('Topic', 'Representation', 'Representative_Docs')

# The three-argument call matches the completion-style `generate_topic_titles` defined above.
# NOTE(review): `summary` is subscripted here, while the "Load Dataset" cell reads
# `summary.topic` and `summary.get('conversation-description')` — confirm both styles work.
# instruct LLM to generate topic titles
lm = mistral + generate_topic_titles(summary['topic'], summary['conversation-description'], df_topic_keywords_docs)

### meta-llama/Llama-2-13b-chat-hf

In [15]:
# NOTE(review): `df_topics` is not assigned in any cell visible in this notebook — presumably a
# polars frame with one row per topic; confirm where it is built.
df_topic_keywords_docs = df_topics.select('Topic', 'Representation', 'Representative_Docs')

# NOTE(review): `llama` is only created by a commented-out line in the "Initialize Language
# Model" cell; that line must be enabled before this cell can run.
lm = llama + generate_topic_titles(summary['topic'], summary['conversation-description'], df_topic_keywords_docs)

### openai-community/gpt2

In [12]:
# NOTE(review): `df_topics` is not assigned in any cell visible in this notebook — presumably a
# polars frame with one row per topic; confirm where it is built.
df_topic_keywords_docs = df_topics.select('Topic', 'Representation', 'Representative_Docs')

# NOTE(review): `gpt2` is only created by a commented-out line in the "Initialize Language
# Model" cell. The recorded output of this cell is a CUDA out-of-memory error.
lm = gpt2 + generate_topic_titles(summary['topic'], summary['conversation-description'], df_topic_keywords_docs)

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 12.00 MiB is free. Including non-PyTorch memory, this process has 3.79 GiB memory in use. Of the allocated memory 3.55 GiB is allocated by PyTorch, and 187.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Chat Instruction Format

So far, we used the completion technique: present a portion of the text to the model and have it generate the text that best completes the content. In the following tests, we will use the chat instruction format instead. This is an interactive technique that starts with an instructional prompt and then allows a conversation between user and assistant. Using the guidance framework, we still control the content that is placed in the assistant's context, but the chat format allows for a more natural conversation flow and enables the model to better understand the task.

### Mistral without context reset

In [None]:
# Result holders; generate_topic_titles() rebinds both to lists when it runs.
topic_labels = topic_titles = None

from guidance import user, assistant, instruction
import re

@guidance
def generate_topic_titles(lm, name_title, name_label, main_title, question, df, temperature=0):
    """Generate a title and a short label for every topic within one continuous chat.

    Every topic is appended to the same conversation, so when the model handles a topic
    its context already contains all earlier topics and its own earlier answers.

    Args:
        lm: guidance model state to extend.
        name_title: capture name used for each generated title.
        name_label: capture name used for each generated label.
        main_title: discussion title; its words are listed in the instruction as words
            the model should not output.
        question: discussion question inserted into the first user message.
        df: frame whose rows unpack as (topic, keywords, docs); `keywords` and `docs`
            are iterables of strings that are joined into the prompt.
        temperature: sampling temperature passed to both gen() calls.

    Returns:
        The model state after all topics have been processed.

    Side effects:
        Rebinds the module-level `topic_titles` and `topic_labels` to fresh lists and
        appends one entry per topic, in row order.
    """
    global topic_labels, topic_titles

    # Words of the main title are listed as banned output. re.split() yields empty strings
    # when the title starts or ends with a non-word character (e.g. a trailing "?"), so
    # those are dropped to keep the list clean.
    main_title_stop_words = [word for word in re.split(r'\W+', main_title.lower()) if word]

    with instruction():
        lm += f"""\
        Respond with a detailed title and a short label to best represent each given topic.
        Avoid repetitive words such as "Enhancing" or "Improving".
        Start title with a noun.
        Label should be terse and up to 4 words.
        Do not output any of the following words: "{', '.join(main_title_stop_words)}"
        """

    with user():
        lm += f"""\
        The following is a dataset of comments from an online discussion.
        Discussion Question: {question}

        KEYWORDS: [a set of keywords that describe the topic]
        SAMPLE STATEMENTS: [a set of statements that best represent the topic]
        """

    with assistant():
        lm += f"""\
        TITLE: [a descriptive sentence that represents the topic]
        LABEL: [terse phrase]
        """

    topic_labels = []
    topic_titles = []

    for topic, keywords, docs in df.iter_rows():
        with user():
            lm = lm + f"""
            # Topic {topic}
            KEYWORDS: {', '.join(keywords)}
            SAMPLE STATEMENTS: {'; '.join(docs)}
            """
        with assistant():
            lm += "TITLE: " + gen(name=name_title, stop=['\n', '.'], max_tokens=20, temperature=temperature) + "\n"
            lm += "LABEL: " + gen(name=name_label, stop=['\n', '.'], max_tokens=12, temperature=temperature) + "\n"

        # Read the captures back through the names the caller supplied; hard-coded keys
        # would fail for any name other than 'topic_title' / 'topic_label'.
        topic_titles.append(lm[name_title])
        topic_labels.append(lm[name_label])

    return lm

# NOTE(review): `topic_model` is not created in any cell visible in this notebook — presumably a
# fitted topic model left over from an earlier session; confirm where it comes from.
# grab topic keywords
df_topic_keywords_docs = pl.from_pandas(topic_model.get_topic_info()).select('Topic', 'Representation', 'Representative_Docs')

# NOTE(review): `summary` is subscripted here, while the "Load Dataset" cell reads
# `summary.topic` and `summary.get('conversation-description')` — confirm both styles work.
# instruct LLM to generate topic titles
lm = mistral + generate_topic_titles('topic_title', 'topic_label', summary['topic'], summary['conversation-description'], df_topic_keywords_docs, temperature=0)

In [None]:
# Print the generated titles, a blank line, then the generated labels, one item per line.
print('\n'.join(["Titles:", '\n'.join(topic_titles), "", "Labels:", '\n'.join(topic_labels)]))

Titles:
Enhancing Educational Opportunities
Managing Rental Property Development
Improving Road Infrastructure
Regulating Community Ordinances
Urban Planning and Transportation
Community Economic Development
Government Transparency and Accountability
Attracting Tourists and Young Professionals
Urban Planning and Parking
Traffic Law Enforcement
Combating Substance Abuse and Teen Pregnancy
Urban Green Spaces and Recycling
Temporary Worker Benefits and Wages

Labels:
Education Funding
Rental Taxes
Road Safety
Community Regulations
Urban Development
Business Growth
Government Reform
Marketing Campaign
Parking Solutions
Traffic Safety
Health Education
Green Initiatives
Temp Worker Rights


### Mistral with context reset

In [None]:
# Result holders; generate_topic_titles() rebinds both to lists when it runs.
topic_labels = topic_titles = None

from guidance import user, assistant, instruction
import re

# TODO: the earlier definition of this function (shared context across topics) can be replaced
# by calling this one with reset_context=False.

@guidance
def generate_topic_titles(lm, name_title, name_label, main_title, question, df, temperature=0, reset_context=True):
    """Generate a title and a short label for every topic, optionally resetting context per topic.

    The instruction and the format example are added once. With `reset_context=True`
    (the default) every topic is then answered from that shared preamble only, so earlier
    topics and answers are not in the model's context. With `reset_context=False` each
    topic's exchange is carried forward into the next one.

    Args:
        lm: guidance model state to extend.
        name_title: capture name used for each generated title.
        name_label: capture name used for each generated label.
        main_title: discussion title; its words are listed in the instruction as words
            the model should not output.
        question: discussion question inserted into the first user message.
        df: frame whose rows unpack as (topic, keywords, docs); `keywords` and `docs`
            are iterables of strings that are joined into the prompt.
        temperature: sampling temperature passed to both gen() calls.
        reset_context: when True, start every topic from the preamble; when False,
            keep one growing conversation.

    Returns:
        With `reset_context=True`, the model state holding only the preamble; otherwise
        the state after all topics have been processed.

    Side effects:
        Rebinds the module-level `topic_titles` and `topic_labels` to fresh lists and
        appends one entry per topic, in row order.
    """
    global topic_labels, topic_titles

    # Words of the main title are listed as banned output. re.split() yields empty strings
    # when the title starts or ends with a non-word character (e.g. a trailing "?"), so
    # those are dropped to keep the list clean.
    main_title_stop_words = [word for word in re.split(r'\W+', main_title.lower()) if word]

    with instruction():
        lm += f"""\
        Respond with a detailed title and a short label to best represent each given topic.
        Avoid repetitive words such as "Enhancing" or "Improving".
        Start title with a noun.
        Label should be terse and up to 4 words.
        Do not output any of the following words: "{', '.join(main_title_stop_words)}"
        """

    with user():
        lm += f"""\
        The following is a dataset of comments from an online discussion.
        Discussion Question: {question}

        KEYWORDS: [a set of keywords that describe the topic]
        SAMPLE STATEMENTS: [a set of statements that best represent the topic]
        """

    with assistant():
        lm += f"""\
        TITLE: [a descriptive sentence that represents the topic]
        LABEL: [terse phrase]
        """

    topic_labels = []
    topic_titles = []

    for topic, keywords, docs in df.iter_rows():
        # Branch off the current state; `lm` itself is left untouched by this topic.
        with user():
            lm_topic = lm + f"""
            # Topic {topic}
            KEYWORDS: {', '.join(keywords)}
            SAMPLE STATEMENTS: {'; '.join(docs)}
            """
        with assistant():
            lm_topic += "TITLE: " + gen(name=name_title, stop=['\n', '.'], max_tokens=20, temperature=temperature) + "\n"
            lm_topic += "LABEL: " + gen(name=name_label, stop=['\n', '.'], max_tokens=12, temperature=temperature) + "\n"

        # Read the captures back through the names the caller supplied; hard-coded keys
        # would fail for any name other than 'topic_title' / 'topic_label'.
        topic_titles.append(lm_topic[name_title])
        topic_labels.append(lm_topic[name_label])

        if not reset_context:
            # Carry this topic's exchange forward into the next iteration.
            lm = lm_topic

    return lm

# NOTE(review): `topic_model` is not created in any cell visible in this notebook — presumably a
# fitted topic model left over from an earlier session; confirm where it comes from.
# grab topic keywords
df_topic_keywords_docs = pl.from_pandas(topic_model.get_topic_info()).select('Topic', 'Representation', 'Representative_Docs')

# NOTE(review): `summary` is subscripted here, while the "Load Dataset" cell reads
# `summary.topic` and `summary.get('conversation-description')` — confirm both styles work.
# instruct LLM to generate topic titles
lm = mistral + generate_topic_titles('topic_title', 'topic_label', summary['topic'], summary['conversation-description'], df_topic_keywords_docs, temperature=0)

In [None]:
# Print the generated titles, a blank line, then the generated labels, one item per line.
print('\n'.join(["Titles:", '\n'.join(topic_titles), "", "Labels:", '\n'.join(topic_labels)]))

Titles:
Enhancing Educational Opportunities
Rental Property Taxation and Development in Cities
Infrastructure Improvements for Safer Roadways
Proposed Ordinances and Fairness Debates in Bowling Green
Urban Infrastructure Improvements
Community Development and Business Growth
Government Reforms for Transparency and Fairness
Enhancing Downtown Attractions for Young Professionals and Tourists
Urban Development and Parking Solutions in Downtown Bowling Green
Police and Traffic Enforcement
Addressing the Opioid Crisis and Sex Education
Urban Green Spaces Expansion
Wage and Tax Policies for Temporary Workers

Labels:
Education Funding
City Rental Taxes
Roadway Upgrades
Ordinance Debates
Infrastructure Upgrades
Business, Growth
Government Transparency
Downtown Revitalization
Urban Development, Parking
Police Traffic
Opioid Crisis, Sex Education
Green Spaces Expansion
Temp Wage, Tax Policy
