In [1]:
import os
import sys
import json
import time
import logging
import random
import re
import nltk
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
)

`fused_weight_gradient_mlp_cuda` module not found. gradient accumulation fusion with weight gradient computation disabled.


In [2]:
# --- Input locations -------------------------------------------------------
# `nb_path` is not referenced by any cell visible in this notebook export.
nb_path = "/storage"
csvs_path = "/datasets/transcript-csvs"
data_csv = f"{csvs_path}/data.csv"

# --- Output locations (everything is rooted at `outputs`) -------------------
outputs = "outputs"

# Trimmed dataset produced by this notebook.
result_path = f"{outputs}/trimmed"
result_file_path = f"{result_path}/trimmed.csv"

# Intermediate CSV splits.
tmp_csvs_path = f"{outputs}/csvs"
train_csv = f"{tmp_csvs_path}/train.csv"
test_csv = f"{tmp_csvs_path}/test.csv"

# Model artefact directories.
checkpoint_path = f"{outputs}/checkpoint"
model_path = f"{outputs}/model"

In [3]:
!rm -rf $result_path && mkdir -p $result_path

In [4]:
# Load the transcript table; later cells read its `therapist_txt` and
# `client_txt` columns and export `filename` / `case_number` alongside them.
df = pd.read_csv(data_csv)

In [5]:
# The 40 most frequent raw therapist utterances (most common first).
df['therapist_txt'].value_counts().head(40)

THERAPIST: Yeah.                683
THERAPIST: Okay.                592
THERAPIST: Mmm hmm.             404
THERAPIST: Right.               353
THERAPIST: Sure.                158
THERAPIST:                       97
THERAPIST: Uh huh.               95
THERAPIST: Um hmm.               77
THERAPIST: Yes.                  74
THERAPIST: Yea.                  68
THERAPIST: Mm-hmm.               59
THERAPIST: Um hm.                58
THERAPIST: Um hum.               58
THERAPIST: Oh.                   57
THERAPIST: Mm.                   35
THERAPIST: I see.                34
THERAPIST: Oh, okay.             27
THERAPIST: Yeah.                 25
THERAPIST: Yeah?                 25
THERAPIST: Uh-huh.               22
THERAPIST:                       20
THERAPIST: Um-hum.               20
THERAPIST: No.                   20
THERAPIST: Uh hmm.               19
THERAPIST: Yeah. Yeah.           18
THERAPIST: OK.                   18
THERAPIST:  .                    18
THERAPIST: Wow.             

In [6]:
# Start the normalized therapist column as a lower-cased copy of the raw text;
# the raw `therapist_txt` column is left untouched for the final export.
df['therapist_normalized'] = df['therapist_txt'].str.lower()

In [7]:
# Trim leading/trailing whitespace so the end-of-string anchor used by the
# next cleanup step sees the real last character.
df['therapist_normalized'] = df['therapist_normalized'].str.strip()

In [8]:
# Remove one trailing period. The pattern is a raw string so that `\.` reaches
# the regex engine as written; in a normal string literal `\.` is an invalid
# escape sequence that Python warns about.
df['therapist_normalized'] = df['therapist_normalized'].str.replace(r"\.$", "", regex=True)

In [9]:
# Strip again: removing the trailing period can expose whitespace that was
# sitting in front of it.
df['therapist_normalized'] = df['therapist_normalized'].str.strip()

In [10]:
# Collapse any extra whitespace that follows the speaker prefix down to the
# single space in "therapist: ". Raw string: `\s` is an invalid escape sequence
# in a normal string literal and triggers a Python warning.
df['therapist_normalized'] = df['therapist_normalized'].str.replace(r"^therapist: \s+", "therapist: ", regex=True)

In [11]:
def replace_patterns(df, patterns, value, prompt="therapist: "):
    """Rewrite utterances that consist solely of one of `patterns` to a canonical form.

    An entry is rewritten only when the whole string is `prompt` immediately
    followed by exactly one of the alternatives; anything else is returned
    unchanged.

    Args:
        df: pandas Series of strings (despite the name, not a DataFrame).
        patterns: iterable of regular-expression fragments; they are joined
            with ``|`` and are NOT escaped, so callers may use regex syntax
            (e.g. ``r"uh huh\\. uh huh"``).
        value: canonical text that replaces the matched utterance. It is
            inserted literally.
        prompt: literal speaker prefix that every candidate must start with.

    Returns:
        A new Series with matching entries replaced by ``prompt + value``.
    """
    alternatives = "|".join(patterns)
    if not alternatives:
        # With no alternatives the regex would match a bare prompt and rewrite
        # it to `prompt + value`; treat "nothing to replace" as a no-op.
        return df.copy()

    # The prompt is literal text, so escape it; the alternatives are regex
    # fragments supplied by the caller and are kept as-is. A non-capturing
    # group is enough because nothing refers back to the match.
    anchored = f"^{re.escape(prompt)}(?:{alternatives})$"
    replacement = f"{prompt}{value}"

    # A callable replacement is used so that backslashes or group references
    # inside `prompt` / `value` are inserted verbatim rather than being
    # interpreted as a regex replacement template.
    return df.str.replace(anchored, lambda _match: replacement, regex=True)

In [12]:
# Filler / back-channel utterances that are all normalized to "hmmm".
# Each entry is a regular-expression fragment (they are OR-ed together by
# `replace_patterns`), so a literal period must be written as `\.` inside a
# raw string; a normal string literal would make `\.` an invalid escape.
hmm_patterns = [
    "hm",
    "hmm",
    "hmmm",
    "huh",
    "hum",
    "mm hm",
    "mm hmm",
    "mm",
    "mm-hmm mm-hmm",
    "mm-hmm",
    "mmhmm",
    "mmm hmm",
    "mmm",
    "uh hm",
    "uh hmm",
    "uh huh uh huh",
    "uh huh",
    r"uh huh\. uh huh",
    "uh-huh",
    "um hm",
    "um hmm",
    "um hum um hum",
    "um hum",
    "um",
    "um-hmm",
    "um-hum",
    "umm-hmm",
]

In [13]:
# Affirmations that are all normalized to "yeah".
# Entries are regular-expression fragments (OR-ed together by
# `replace_patterns`); the one containing a literal period uses a raw string
# because `\.` is an invalid escape sequence in a normal string literal.
yes_patterns = [
    "oh yeah",
    "yea",
    "yea, yea",
    "yea, yea, yea",
    "yeah yeah",
    "yeah",
    "yeah, sure",
    "yeah, yeah",
    r"yeah\. yeah",
    "yep",
    "yes",
    "yes, yes",
]

In [14]:
# Fold every filler-only therapist utterance into the canonical "hmmm".
df["therapist_normalized"] = replace_patterns(
    df["therapist_normalized"],
    patterns=hmm_patterns,
    value="hmmm",
)

In [15]:
# Fold every affirmation-only therapist utterance into the canonical "yeah".
df["therapist_normalized"] = replace_patterns(
    df["therapist_normalized"],
    patterns=yes_patterns,
    value="yeah",
)

In [16]:
# The 40 most frequent therapist utterances after normalization.
df['therapist_normalized'].value_counts().head(40)

therapist: hmmm                 963
therapist: yeah                 936
therapist: okay                 607
therapist: right                373
therapist: sure                 162
therapist:                      146
therapist: oh                    57
therapist: i see                 34
therapist: ok                    33
therapist: oh, okay              29
therapist: yeah?                 25
therapist: no                    22
therapist: wow                   20
therapist: ..                    17
therapist: what?                 16
therapist: what do you mean?     16
therapist: no?                   13
therapist: okay. okay            13
therapist: why?                  12
therapist: really?               11
therapist: good                  11
therapist: absolutely            10
therapist: is that right?         9
therapist: what's that?           9
therapist: interesting            9
therapist: okay?                  9
therapist: i know                 8
therapist: huh?             

In [17]:
# Every row starts out selected; the down-sampling loop below flips the flag to
# False for surplus rows of over-represented therapist utterances.
df['therapist_selected'] = True

In [18]:
# Frequency table of normalized therapist utterances:
# one row per distinct text (`txts`) with its number of occurrences (`counts`).
df_therapist = (
    df['therapist_normalized']
    .value_counts()
    .rename_axis('txts')
    .reset_index(name='counts')
)

In [19]:
# Keep only utterances that occur more than 10 times; these are the ones the
# down-sampling loop caps.
df_therapist = df_therapist.loc[df_therapist['counts'] > 10]

In [20]:
# Show the over-represented therapist utterances that will be down-sampled.
df_therapist

Unnamed: 0,txts,counts
0,therapist: hmmm,963
1,therapist: yeah,936
2,therapist: okay,607
3,therapist: right,373
4,therapist: sure,162
5,therapist:,146
6,therapist: oh,57
7,therapist: i see,34
8,therapist: ok,33
9,"therapist: oh, okay",29


In [21]:
# Cap each over-represented therapist utterance at 10 rows: keep a random
# sample of 10 and mark every other row with the same text as not selected.
for txt in df_therapist['txts']:
    # Plain equality is what fullmatch(re.escape(txt)) expressed, without
    # building a regex and without producing NaN in the mask for missing text.
    matches = df[df['therapist_normalized'] == txt]
    # Fixed seed so a Restart & Run All keeps the same rows every time.
    matches_selected = matches.sample(n=10, random_state=42)
    matches_dropped = matches.drop(matches_selected.index)
    # `.at` is the scalar accessor; `.loc` is the label-based indexer meant
    # for assigning to many row labels at once.
    df.loc[matches_dropped.index, 'therapist_selected'] = False

In [22]:
# Frequencies among rows still selected: capped utterances should now show 10.
df.loc[df['therapist_selected'], 'therapist_normalized'].value_counts().head(40)

therapist: no?                  10
therapist: wow                  10
therapist: hmmm                 10
therapist: right                10
therapist:                      10
therapist: ..                   10
therapist: oh, okay             10
therapist: yeah?                10
therapist: i see                10
therapist: why?                 10
therapist: what do you mean?    10
therapist: ok                   10
therapist: oh                   10
therapist: really?              10
therapist: sure                 10
therapist: yeah                 10
therapist: absolutely           10
therapist: okay                 10
therapist: good                 10
therapist: no                   10
therapist: what?                10
therapist: okay. okay           10
therapist: interesting           9
therapist: what's that?          9
therapist: okay?                 9
therapist: is that right?        9
therapist: how so?               8
therapist: i know                8
therapist: huh?     

In [23]:
# Start the normalized client column as a lower-cased copy of the raw text;
# the raw `client_txt` column is left untouched for the final export.
df['client_normalized'] = df['client_txt'].str.lower()

In [24]:
# Trim leading/trailing whitespace so the end-of-string anchor used by the
# next cleanup step sees the real last character.
df['client_normalized'] = df['client_normalized'].str.strip()

In [25]:
# Remove one trailing period. The pattern is a raw string so that `\.` reaches
# the regex engine as written; in a normal string literal `\.` is an invalid
# escape sequence that Python warns about.
df['client_normalized'] = df['client_normalized'].str.replace(r"\.$", "", regex=True)

In [26]:
# Strip again: removing the trailing period can expose whitespace that was
# sitting in front of it.
df['client_normalized'] = df['client_normalized'].str.strip()

In [27]:
# Collapse any extra whitespace that follows the speaker prefix down to the
# single space in "client: ". Raw string: `\s` is an invalid escape sequence in
# a normal string literal and triggers a Python warning.
df['client_normalized'] = df['client_normalized'].str.replace(r"^client: \s+", "client: ", regex=True)

In [28]:
# Canonicalize client fillers to "hmmm" first, then affirmations to "yeah",
# using the same pattern lists as for the therapist column.
df["client_normalized"] = replace_patterns(
    replace_patterns(
        df["client_normalized"], hmm_patterns, "hmmm", prompt="client: "
    ),
    yes_patterns,
    "yeah",
    prompt="client: ",
)

In [29]:
# Every row starts out selected; the client down-sampling loop below flips the
# flag to False for surplus rows of over-represented client utterances.
df['client_selected'] = True

In [30]:
# Frequency table of normalized client utterances, counted only over rows that
# survived the therapist down-sampling: one row per text (`txts`) with its
# number of occurrences (`counts`).
df_client = (
    df.loc[df['therapist_selected'], 'client_normalized']
    .value_counts()
    .rename_axis('txts')
    .reset_index(name='counts')
)

In [31]:
# Keep only client utterances that occur more than 10 times; these are the
# ones the down-sampling loop caps.
df_client = df_client.loc[df_client['counts'] > 10]

In [32]:
# Cap each over-represented client utterance at 10 rows: keep a random sample
# of 10 and mark every other row with the same text as not selected.
# NOTE(review): the counts in `df_client` were computed on therapist-selected
# rows only, but the sample below is drawn from ALL rows with that text, so
# fewer than 10 may remain once both flags are combined — confirm this is
# intended.
for txt in df_client["txts"]:
    # Plain equality is what fullmatch(re.escape(txt)) expressed, without
    # building a regex and without producing NaN in the mask for missing text.
    matches = df[df["client_normalized"] == txt]
    # Fixed seed so a Restart & Run All keeps the same rows every time.
    matches_selected = matches.sample(n=10, random_state=42)
    matches_dropped = matches.drop(matches_selected.index)
    # `.at` is the scalar accessor; `.loc` is the label-based indexer meant
    # for assigning to many row labels at once.
    df.loc[matches_dropped.index, "client_selected"] = False

In [33]:
# Final selection: rows kept by both the therapist and the client down-sampling.
# Both flag columns hold booleans, so they can be combined directly as masks.
df_selected = df.loc[df['therapist_selected'] & df['client_selected']]

In [34]:
# Therapist utterance frequencies in the final selection (top 40).
df_selected['therapist_normalized'].value_counts().head(40)

therapist: okay                         10
therapist: sure                         10
therapist: ok                           10
therapist: hmmm                         10
therapist: oh                           10
therapist: oh, okay                     10
therapist: what do you mean?            10
therapist: i see                        10
therapist: wow                          10
therapist:                               9
therapist: absolutely                    9
therapist: why?                          9
therapist: right                         9
therapist: interesting                   9
therapist: really?                       9
therapist: good                          9
therapist: no                            9
therapist: okay. okay                    9
therapist: is that right?                8
therapist: yeah?                         8
therapist: all right                     8
therapist: ..                            8
therapist: what?                         8
therapist: 

In [35]:
# Client utterance frequencies in the final selection (top 40).
df_selected['client_normalized'].value_counts().head(40)

client: all right                10
client: really?                  10
client: what?                    10
client: thank you                10
client: uh-hmm                   10
client: no                       10
client: so                       10
client: ok                       10
client: okay                     10
client: oh, yeah                 10
client: oh                       10
client: hmmm                     10
client: right                    10
client: i know                   10
client: sure                      9
client: oh, okay                  9
client: that's right              8
client: right, right              8
client: i don't know              8
client: yeah                      8
client:                           8
client: exactly                   8
client: so..                      8
client: you know?                 8
client: right. right              7
client: well                      6
client: oh!                       6
client: alright             

In [None]:
# Export the trimmed dataset: only the identifying columns and the original
# (un-normalized) texts are written, without the DataFrame index.
df_selected[["filename", "case_number", "client_txt", "therapist_txt"]].to_csv(
    result_file_path, index=False
)