# Predict tags from interview scripts

## Initialize libraries

In [None]:
# !pip install langchain_community
# !pip install langchain_aws

In [3]:
# !pip install faiss-gpu

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# !pip install python-docx

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
# !pip install tqdm

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import pandas as pd
from botocore.config import Config
from langchain_community.vectorstores import FAISS
from langchain_aws import ChatBedrock, BedrockEmbeddings

In [7]:
from transcript_loader import load_transcripts
from tag_definitions import tags

In [8]:
from pprint import pprint

In [9]:
import transcript_tagger

In [10]:
import importlib

# Pick up edits made to transcript_tagger.py without restarting the kernel.
# A failed reload is only reported, so later cells keep using whatever version
# of the module was already loaded.
try:
    importlib.reload(transcript_tagger)
except Exception as reload_error:
    print(f"Error reloading module: {reload_error}")
else:
    print("Module successfully reloaded")

Module successfully reloaded


# Preprocess the input data

## Loading the transcripts

In [11]:
# Load the training transcripts; the keys of the result are participant labels.
folder_path = "data/transcripts_train"
train_transcripts = load_transcripts(folder_path)

# Participant labels of the training set, used further down to split the manual tags.
train_participants = [participant for participant in train_transcripts.keys()]
train_participants

['P18 Jenny W',
 'P16 Sari N',
 'P2 Tegan K',
 'P17 Ruby V',
 'P14 Sandy R',
 'P12 Lindsay S',
 'P15 Ben W',
 'P13 Tina B',
 'P10 Ashantae O',
 'P11 Jamie M',
 'P1 Olga M']

In [25]:
# Load the held-out transcript(s) that the tagger will be run on.
# Note: this reuses (overwrites) the folder_path variable set for the training data.
folder_path = "data/transcripts_test_single"
test_transcripts = load_transcripts(folder_path)

# Participant labels found in the test folder.
test_participants = [participant for participant in test_transcripts.keys()]
test_participants

['P5 Joana J']

## Loading the manual tags

In [13]:
# Manually tagged highlights exported from Condens. The columns used later in
# this notebook are 'Quote', 'Participants' and 'Tag'.
df = pd.read_csv("data/Raw Condens Data - highlights_export.csv")

In [14]:
# Peek at two rows to check the export's columns.
# NOTE(review): the rendered output includes participant names and verbatim
# quotes - clear or redact it before sharing the notebook.
df.head(2)

Unnamed: 0,Quote,Participants,Tag Group,Tag,Segment,Copy to Miro,Link
0,"[P3 Lia O] 'He's, you know, old school. So I u...",P3 Lia O,Caregiver,grocery,"Kids 0 - 10, Caregiver","[P3 Lia O] 'He's, you know, old school. So I u...",https://app.condens.io/session/XK6q3c/A6cYxXoK...
1,[P3 Lia O] 'He if he wants like a pizza or som...,P3 Lia O,Caregiver,food delivery,"Kids 0 - 10, Caregiver",[P3 Lia O] 'He if he wants like a pizza or som...,https://app.condens.io/session/XK6q3c/A6cYxXoK...


## Splitting Train and Test Transcripts

In [15]:
def contains_match(participant, all_names):
    """Return True when ``participant`` shares its ID with a label in ``all_names``.

    The ID is the first whitespace-separated token of a label, e.g. ``"P1"`` in
    ``"P1 Olga M"``. IDs are compared for equality, not by substring, so
    ``"P1"`` does not match ``"P18 Jenny W"`` or ``"P10 Ashantae O"``.

    Args:
        participant: Participant label such as ``"P3 Lia O"``.
        all_names: Iterable of participant labels to compare against.

    Returns:
        bool: True if some label in ``all_names`` has the same ID token;
        False otherwise, including when ``participant`` is blank.
    """
    tokens = participant.split()
    if not tokens:
        # A blank label carries no ID, so it cannot belong to any participant.
        return False
    participant_id = tokens[0]
    # name.split()[:1] is [] for a blank name, which never equals [participant_id].
    return any(name.split()[:1] == [participant_id] for name in all_names)

# Flag every manual-tag row whose participant belongs to the training set.
matching_mask = df['Participants'].apply(contains_match, all_names=train_participants)

# Rows of training participants become df_train; every remaining row becomes
# df_test. df_test therefore covers all non-training participants in the
# export, which can be more than the transcripts loaded into test_transcripts.
df_train = df.loc[matching_mask]
df_test = df.loc[~matching_mask]

## Checking tag definitions

In [17]:
# # Iterate over all tags
# for tag in tags:
#     print(f"{tag.tag_group} - {tag.tag}")

## Creating Vector Store and Starting LLM Instance

In [26]:
# botocore client settings shared by the embedding and chat clients:
# up to 100 retry attempts and a read timeout of 1000.
bedrock_config = Config(
    retries=dict(max_attempts=100),
    read_timeout=1000,
)

# Embedding model used to build the vector stores.
# Alternative model id: "cohere.embed-english-v3"
# NOTE(review): no region_name is passed here, while the chat model below pins
# 'us-east-1' - confirm both clients resolve to the intended region.
embeddings = BedrockEmbeddings(
    config=bedrock_config,
    model_id="amazon.titan-embed-text-v2:0",
)

# Chat model handed to the tagger. Alternative model ids:
#   "anthropic.claude-3-haiku-20240307-v1:0"
#   "anthropic.claude-3-sonnet-20240229-v1:0"
#   "anthropic.claude-3-5-sonnet-20241022-v2:0"
#   "us.meta.llama3-2-90b-instruct-v1:0"
#   "meta.llama3-1-405b-instruct-v1:0"
#   "meta.llama3-1-70b-instruct-v1:0"
# If results aren't consistent between runs, try passing
#   model_kwargs={'temperature': 0.01}
# with_retry() is applied on top of the botocore retries configured above.
llm_instance = ChatBedrock(
    config=bedrock_config,
    region_name='us-east-1',
    model_id="anthropic.claude-3-5-sonnet-20240620-v1:0",
).with_retry()

Only the quote text is embedded. The tag for a quote is looked up through a quote → tag mapping rather than being embedded together with the quote.

In [27]:
# Lookup from each training quote to its manual tag. Quotes are the dict keys,
# so if the same quote appears in several rows only the last row's tag is kept.
# NOTE(review): confirm a quote never carries more than one tag in the export.
quote_tag_map = df_train.set_index('Quote')['Tag'].to_dict()

In [28]:
# Show one (quote, tag) pair as a sanity check of the mapping.
next(iter(quote_tag_map.items()))

("[P11 Jamie] 'For one of your your grandparents. Yeah, Grandfather.'",
 'who i give care to')

In [29]:
# Every distinct training quote, in the mapping's insertion order.
all_quotes = list(quote_tag_map)
# Lower-case to merge tags that differ only by case, then sort. Iterating a set
# of strings has no stable order across kernel restarts (string hashing is
# randomised per process), so without sorting the tag list would be built in a
# different order on every run.
all_tags = sorted({tag.lower() for tag in quote_tag_map.values()})

In [30]:
%%time
# Embed the training quotes and index them in FAISS.
# normalize_L2=True is forwarded to the store - presumably it L2-normalises the
# vectors; confirm against the LangChain FAISS documentation.
quote_vector_store = FAISS.from_texts(
    texts=all_quotes,
    embedding=embeddings,
    normalize_L2=True,
)

CPU times: user 2.63 s, sys: 104 ms, total: 2.73 s
Wall time: 1min 5s


In [31]:
%%time
# Embed the distinct tag names and index them in a second FAISS store,
# built with the same embedding model and settings as the quote store.
tag_vector_store = FAISS.from_texts(
    texts=all_tags,
    embedding=embeddings,
    normalize_L2=True,
)

CPU times: user 165 ms, sys: 7.14 ms, total: 172 ms
Wall time: 4.06 s


# Running the Tagger

In [32]:
import importlib

# Reload transcript_tagger again so the tagger built next uses the latest code.
# A failed reload is only reported; the previously loaded module stays in use.
try:
    importlib.reload(transcript_tagger)
except Exception as reload_error:
    print(f"Error reloading module: {reload_error}")
else:
    print("Module successfully reloaded")

Module successfully reloaded


In [33]:
# Wire the tagger together: tag definitions, the chat model, the quote -> tag
# lookup and the two FAISS stores. Arguments are passed positionally, so their
# order must match the TranscriptTagger constructor in transcript_tagger.py.
tagger = transcript_tagger.TranscriptTagger(
    tags,
    llm_instance,
    quote_tag_map,
    quote_vector_store,
    tag_vector_store,
)

In [34]:
%%time
# Tag every loaded test transcript. This is slow: the recorded run took about
# 5 minutes of wall time for a single transcript of 99 sections.
df_out = tagger.tag_transcripts(test_transcripts)

Tagging transcripts for each participant:   0%|          | 0/1 [00:00<?, ?it/s]
Tagging sections in transcript:   0%|          | 0/99 [00:00<?, ?it/s][A
Tagging sections in transcript:   1%|          | 1/99 [00:01<03:04,  1.88s/it][A
Tagging sections in transcript:   2%|▏         | 2/99 [00:05<05:01,  3.10s/it][A
Tagging sections in transcript:   3%|▎         | 3/99 [00:07<04:09,  2.60s/it][A
Tagging sections in transcript:   4%|▍         | 4/99 [00:10<04:12,  2.66s/it][A
Tagging sections in transcript:   5%|▌         | 5/99 [00:12<03:52,  2.48s/it][A
Tagging sections in transcript:   6%|▌         | 6/99 [00:16<04:45,  3.07s/it][A
Tagging sections in transcript:   7%|▋         | 7/99 [00:18<03:48,  2.48s/it][A
Tagging sections in transcript:   8%|▊         | 8/99 [00:23<05:08,  3.39s/it][A
Tagging sections in transcript:   9%|▉         | 9/99 [00:26<04:55,  3.28s/it][A
Tagging sections in transcript:  10%|█         | 10/99 [00:28<04:10,  2.81s/it][A
Tagging sections in trans

CPU times: user 12.3 s, sys: 177 ms, total: 12.4 s
Wall time: 5min 8s





In [102]:
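# %%time
# df_out = tagger.tag_transcripts(test_transcripts)
# NOTE: commented-out earlier run. The output shown below comes from that run
# over 7 transcripts (about 41 min wall time), not from the current
# single-transcript test set.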

Tagging transcripts for each participant:   0%|          | 0/7 [00:00<?, ?it/s]
Tagging sections in transcript:   0%|          | 0/132 [00:00<?, ?it/s][A
Tagging sections in transcript:   1%|          | 1/132 [00:02<06:22,  2.92s/it][A
Tagging sections in transcript:   2%|▏         | 2/132 [00:05<05:21,  2.47s/it][A
Tagging sections in transcript:   2%|▏         | 3/132 [00:06<04:22,  2.04s/it][A
Tagging sections in transcript:   3%|▎         | 4/132 [00:09<05:01,  2.36s/it][A
Tagging sections in transcript:   4%|▍         | 5/132 [00:10<04:19,  2.05s/it][A
Tagging sections in transcript:   5%|▍         | 6/132 [00:12<03:45,  1.79s/it][A
Tagging sections in transcript:   5%|▌         | 7/132 [00:14<04:14,  2.04s/it][A
Tagging sections in transcript:   6%|▌         | 8/132 [00:17<04:24,  2.13s/it][A
Tagging sections in transcript:   7%|▋         | 9/132 [00:20<04:58,  2.42s/it][A
Tagging sections in transcript:   8%|▊         | 10/132 [00:21<04:16,  2.10s/it][A
Tagging sectio

CPU times: user 1min 21s, sys: 1.2 s, total: 1min 23s
Wall time: 41min 24s





In [36]:
# Preview the predicted tags.
# NOTE(review): the rendered output contains verbatim participant quotes -
# clear or redact it before sharing the notebook.
df_out.head()

Unnamed: 0,Quote,Tag,Confidence,Tag Group,Participant
0,So I work as like a chief customer officer. So...,what i am responsible for,0.9,No Tag Group,P5 Joana J
1,I have 4 kids and one step kid.,kids,1.0,Family makeup,P5 Joana J
2,"Yeah, they all live in the home with me, so th...",kids,1.0,Family makeup,P5 Joana J
3,"OK, three 5, I think 7 and then 10.",kids,0.9,Family makeup,P5 Joana J
4,But most of the times traditionally I grew up ...,childcare,0.9,No Tag Group,P5 Joana J


In [35]:
# Write the predictions to a CSV in the working directory; index=False leaves
# the DataFrame's row index out of the file. Re-running overwrites the file.
df_out.to_csv('output_one_script.csv', index=False)