In [1]:
!pip install transformers[torch] pymongo

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import torch
import transformers
from transformers import AutoTokenizer

model = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
def analyze_survey_response(target="Jim", survey_response="He seems really sad, and he's been drinking a lot lately."):
    prompt = f"""\
You are provided with an anonymous survey response about a construction worker {target}. According to the response, might {target} be at risk for mental health issues? If so, are they lonely, depressed, anxious, or any combination of those issues? If so, list what you think they might be at risk for as a list of conditions separated by commas, or a single condition. Do not provide further text after your answer. If {target} does not seem at risk for any mental health issues, just answer "no" without any further text.

Survey response: "{survey_response}" 

Answer: """
    sequences = pipeline(
        prompt,
        max_new_tokens=100,
        do_sample=True,
        # top_k=1,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    response = sequences[0]['generated_text']
    # return last line
    response = response.splitlines()[-1]
    print(response)
    # extract from answer token
    if "Answer: " in response:
        response = response.split("Answer: ")[1].strip()
    # remove quotes
    response = response.replace('"', "")
    # split on commas and trim whitespace
    response = [r.strip() for r in response.split(",")]
    return response




In [4]:
def analyze_report(survey_responses: list[str], voice_response=None):
    nl = '\n'
    prompt = f"""\
Your task is to provide a construction manager advice on how to improve the mental health of their workers. For example, if a coworker is feeling isolated, you might suggest that the manager organize a team-building event. If a worker is feeling depressed, you might suggest the manager to provide the team with a mental health day. If a worker is feeling anxious, you might suggest the manager to provide the team with a day off. If a worker is feeling stressed, you might suggest the manager to provide the team with stress relieving activities. If a worker is feeling burnt out, you might suggest the manager to provide the team with a day off. If a worker is feeling exhausted, you might suggest the manager to provide the team with additional rest breaks.
You must not reveal the identity of anyone in your advice, because the employer might use this information to persecute the worker. Do not use anyone's name or any personal information in the surveys in your response.

You are provided with the results of various surveys about a construction worker. The goal of these surveys is to address the mental health of the worker based on the opinions and perspectives of their coworkers. The surveys are as follows:

###
{nl.join(survey_responses)}
###

"""
    voice_prompt = f"""\
When you recieved these survey responses, you called the worker to ask how they are doing. The worker responded with the following:
###
{voice_response}
###
""" if voice_response else ""

    # add voice prompt to prompt
    prompt += voice_prompt

    # add question
    prompt += f"""\
Based on the feedback of the worker in the phone call and the survey responses of the coworkers, construct a text message to be sent to the manager that will suggest an action to improve the mental health of the team. The message should be no longer than 160 characters.


This is how you should format your responses:
###
Your team is feeling isolated. Consider organizing a team-building event.
Your team is feeling depressed. Consider providing the team with a mental health day.
Your team is feeling anxious. Consider providing the team with a day off.
Your team is feeling stressed. Consider providing the team with stress relieving activities.
###

Message to the manager:\
"""

    sequences = pipeline(
        prompt,
        max_new_tokens=100,
        do_sample=True,
        # top_k=1,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    response = sequences[0]['generated_text']
    # extract from answer token
    response = response.split("Message to the manager:")[1].strip()
    return response


In [None]:
import time
from pymongo import MongoClient

uri = "mongodb+srv://hacker:ffzK61HllJvM19St@cluster0.acqccxh.mongodb.net/?retryWrites=true&w=majority"
client = MongoClient(uri)

# ping server
print(client.server_info())

db = client['db']

survey_analyses = db['survey_analyses']
reports = db['reports']

# watch for changes
print("Watching for changes...")
cursor = db.watch(full_document="updateLookup")

for change in cursor:
    try:
        print(change)
        # check if survey response or report
        if change['ns']['coll'] == 'voice_transcripts':
            print("getting data")
            # get data
            report_id = change['fullDocument']['_id']
            target = change['fullDocument']['target']
            voice_response = change['fullDocument'].get('voice_response', None)
            # get associated survey responses
            survey_responses = [x['response'] for x in
                                db['surveys'].find({'target': target}, {'response': 1, '_id': 0})]
            # call pipeline
            print("calling pipeline")
            start = time.time()
            response = analyze_report(survey_responses, voice_response)
            end = time.time()
            print('Inference time:', end - start)
            print('Response:', response)
            # put analysis in report_analyses collection
            report = {
                "report_id": report_id,
                "target": target,
                "survey_responses": survey_responses,
                "voice_response": voice_response,
                "body": response,
                "inference_time": end - start,
            }
            reports.insert_one(report)
        elif change['ns']['coll'] == 'surveys':
            # get data
            survey_id = change['fullDocument']['_id']
            target = change['fullDocument']['target']
            survey_response = change['fullDocument']['response']
            # call pipeline
            start = time.time()
            response = analyze_survey_response(target, survey_response)
            end = time.time()
            print('Inference time:', end - start)
            print('Response:', response)
            # put analysis in survey_analyses collection
            survey_analysis = {
                "survey_id": survey_id,
                "target": target,
                "survey_response": survey_response,
                "analysis": response,
                "inference_time": end - start,
            }
            survey_analyses.insert_one(survey_analysis)

    except KeyboardInterrupt:
        break


{'version': '6.0.11', 'gitVersion': 'f797f841eaf1759c770271ae00c88b92b2766eed', 'modules': ['enterprise'], 'allocator': 'tcmalloc', 'javascriptEngine': 'mozjs', 'sysInfo': 'deprecated', 'versionArray': [6, 0, 11, 0], 'bits': 64, 'debug': False, 'maxBsonObjectSize': 16777216, 'storageEngines': ['devnull', 'ephemeralForTest', 'inMemory', 'queryable_wt', 'wiredTiger'], 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1697968251, 143), 'signature': {'hash': b'\xc0}mR\xd8\xd2A\xf2\x82\x0fh\xad[s\x1e\x19\x048\x8e\xc2', 'keyId': 7228200195597533185}}, 'operationTime': Timestamp(1697968251, 143)}
Watching for changes...


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


{'_id': {'_data': '826534F0C6000000212B022C0100296E5A1004067B53CDD06545958AA83AC7AFE3B47A46645F696400646534F0C68D7EA8F79EC793B00004'}, 'operationType': 'insert', 'clusterTime': Timestamp(1697968326, 33), 'wallTime': datetime.datetime(2023, 10, 22, 9, 52, 6, 625000), 'fullDocument': {'_id': ObjectId('6534f0c68d7ea8f79ec793b0'), 'target': 'Tim', 'response': 'He is struggling with his mental health 5'}, 'ns': {'db': 'db', 'coll': 'surveys'}, 'documentKey': {'_id': ObjectId('6534f0c68d7ea8f79ec793b0')}}


The current implementation of Falcon calls `torch.scaled_dot_product_attention` directly, this will be deprecated in the future in favor of the `BetterTransformer` API. Please install the latest optimum library with `pip install -U optimum` and call `model.to_bettertransformer()` to benefit from `torch.scaled_dot_product_attention` and future performance optimizations.


Answer: "Lonely, depressed, and anxious"
Inference time: 3.3475558757781982
Response: ['Lonely', 'depressed', 'and anxious']
{'_id': {'_data': '826534F0CA000000022B022C0100296E5A1004087E1F32086D418F9739A80E48B524C946645F696400646534F0CA56AA64A66DC597D30004'}, 'operationType': 'insert', 'clusterTime': Timestamp(1697968330, 2), 'wallTime': datetime.datetime(2023, 10, 22, 9, 52, 10, 80000), 'fullDocument': {'_id': ObjectId('6534f0ca56aa64a66dc597d3'), 'survey_id': ObjectId('6534f0c68d7ea8f79ec793b0'), 'target': 'Tim', 'survey_response': 'He is struggling with his mental health 5', 'analysis': ['Lonely', 'depressed', 'and anxious'], 'inference_time': 3.3475558757781982}, 'ns': {'db': 'db', 'coll': 'survey_analyses'}, 'documentKey': {'_id': ObjectId('6534f0ca56aa64a66dc597d3')}}


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


{'_id': {'_data': '826534F0F3000000192B022C0100296E5A1004A91AB4954746487BADC0A657D87C72DC46645F696400646534F0F28D7EA8F79EC793B10004'}, 'operationType': 'insert', 'clusterTime': Timestamp(1697968371, 25), 'wallTime': datetime.datetime(2023, 10, 22, 9, 52, 51, 279000), 'fullDocument': {'_id': ObjectId('6534f0f28d7ea8f79ec793b1'), 'target': 'Farhan', 'voice_response': 'I have been really stressed out lately and it is affecting my mental health. I am not sure what to do about it.'}, 'ns': {'db': 'db', 'coll': 'voice_transcripts'}, 'documentKey': {'_id': ObjectId('6534f0f28d7ea8f79ec793b1')}}
getting data
calling pipeline
Inference time: 1.2551789283752441
Response: There is an issue with the mental health of some workers. Consider providing the team with extra rest breaks.
{'_id': {'_data': '826534F0F40000002D2B022C0100296E5A10043F1150212BF04FE1BE571EC24443EC5746645F696400646534F0F456AA64A66DC597D40004'}, 'operationType': 'insert', 'clusterTime': Timestamp(1697968372, 45), 'wallTime': date