In [1]:
import numpy as np
import pandas as pd
from google.colab import drive

In [2]:
# Mount Google Drive at /content/drive; the file paths used later in this
# notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


#### MESC Dataset

In [3]:
# Read the MESC CSV from Drive and keep only the three columns needed for
# speaker-turn segmentation.
mesc_csv_path = '/content/drive/MyDrive/Dataset_Segmentation/MESC.csv'
segmentation_columns = ['Utterance', 'Speaker', 'StartTime']

df = pd.read_csv(mesc_csv_path)
print(df.head(10))  # plain-text preview of all columns of the raw file

segmentation_df = df[segmentation_columns]
segmentation_df.head(10)

                                           Utterance    Speaker  Emotion  \
0                                        I told you.     Client  sadness   
1                                      Told me what?  Therapist  neutral   
2  That you'd be sorry you ever encouraged me to ...     Client  sadness   
3                              I'm not sorry at all.  Therapist  neutral   
4       You didn't expect it to be like this, I bet.     Client  sadness   
5                                         Like what?  Therapist  neutral   
6                                     You know what?     Client  sadness   
7                                It's disappointing.     Client  sadness   
8                         I thought I'd feel better,     Client  sadness   
9                                          relieved.     Client  sadness   

               Strategy  Dialogue_ID  Utterance_ID  Season  Episode  \
0             undefined            0             0       1        1   
1         Open questi

Unnamed: 0,Utterance,Speaker,StartTime
0,I told you.,Client,"00:00:54,097"
1,Told me what?,Therapist,"00:00:55,260"
2,That you'd be sorry you ever encouraged me to ...,Client,"00:00:56,651"
3,I'm not sorry at all.,Therapist,"00:00:59,951"
4,"You didn't expect it to be like this, I bet.",Client,"00:01:03,404"
5,Like what?,Therapist,"00:01:05,880"
6,You know what?,Client,"00:01:09,711"
7,It's disappointing.,Client,"00:01:11,563"
8,"I thought I'd feel better,",Client,"00:01:14,360"
9,relieved.,Client,"00:01:16,106"


In [4]:
# Collapse consecutive rows spoken by the same speaker into a single turn.
# Utterances inside a turn are joined with one space, in their original order.
#
# NOTE(review): turns are detected from the 'Speaker' column alone. If the CSV
# holds several dialogues, a turn can span a dialogue boundary when the same
# speaker ends one dialogue and starts the next -- confirm this is intended.
merged_turns = []  # one {'utterance', 'speaker'} record per speaker turn

# Iterate over the two columns directly: `.iloc[i][...]` materialises a whole
# row Series on every access, which is needlessly slow inside a Python loop.
for speaker, utterance in zip(segmentation_df['Speaker'], segmentation_df['Utterance']):
  if merged_turns and merged_turns[-1]['speaker'] == speaker:
    # Same speaker as the previous row: extend the turn that is still open.
    merged_turns[-1]['utterance'] += ' ' + utterance
  else:
    # Speaker changed (or this is the first row): open a new turn.
    merged_turns.append({'utterance': utterance, 'speaker': speaker})

# Passing `columns` keeps the schema stable even when there are no rows at all.
final_segmentation_df = pd.DataFrame(merged_turns, columns=['utterance', 'speaker'])
final_segmentation_df.head(10)

Unnamed: 0,utterance,speaker
0,I told you.,Client
1,Told me what?,Therapist
2,That you'd be sorry you ever encouraged me to ...,Client
3,I'm not sorry at all.,Therapist
4,"You didn't expect it to be like this, I bet.",Client
5,Like what?,Therapist
6,You know what? It's disappointing. I thought I...,Client
7,And it isn't?,Therapist
8,"No, it's horrible. I don't know if I'm able to...",Client
9,Are you all right?,Therapist


#### DAIC-WOZ

In [5]:
def merging_data(df):
  """Collapse a transcript into one row per speaker turn.

  Consecutive rows spoken by the same speaker are joined with single spaces.
  The speaker label 'Ellie' is replaced by 'Therapist' and 'Participant' by
  'Client'; any other label is kept as is.

  Args:
    df: Transcript frame with at least the columns 'value' (utterance text)
      and 'speaker'. The frame passed in is not modified.

  Returns:
    A DataFrame with the columns 'utterance' and 'speaker', one row per turn,
    in the original order. When `df` has no usable rows the result is empty
    but still carries both columns.
  """
  # Build a fresh frame instead of assigning into a column slice of `df`,
  # which is what triggers pandas' SettingWithCopyWarning.
  turns = (
      df[['value', 'speaker']]
      .rename(columns={'value': 'Utterance'})
      # A missing utterance is a float NaN; concatenating it to a string
      # raises TypeError, so such rows are dropped before merging.
      .dropna(subset=['Utterance'])
  )
  speakers = (
      turns['speaker']
      .replace('Ellie', 'Therapist')
      .replace('Participant', 'Client')
  )

  merged_rows = []  # one {'utterance', 'speaker'} record per speaker turn
  # `astype(str)` guarantees every piece can be concatenated as text.
  for speaker, utterance in zip(speakers, turns['Utterance'].astype(str)):
    if merged_rows and merged_rows[-1]['speaker'] == speaker:
      # Same speaker as the previous row: extend the turn that is still open.
      merged_rows[-1]['utterance'] += ' ' + utterance
    else:
      # Speaker changed (or this is the first row): open a new turn.
      merged_rows.append({'utterance': utterance, 'speaker': speaker})

  # Passing `columns` keeps the schema stable even for an empty result.
  return pd.DataFrame(merged_rows, columns=['utterance', 'speaker'])

In [6]:
# Load every transcript numbered 300..492 that exists on Drive, merge each one
# into speaker turns and stack the results into `final_merged_df`.
transcript_frames = []  # merged turns of each transcript that loaded cleanly

for transcript_id in range(300, 493):
  file_path = f"/content/drive/MyDrive/Dataset_Segmentation/Transcripts/{transcript_id}_TRANSCRIPT.csv"
  try:
    # The transcript files are tab-separated despite the .csv extension.
    # A dedicated name is used so the frame in the global `df` is not replaced.
    transcript_df = pd.read_csv(file_path, sep="\t")
    print(f"Successfully loaded {file_path}")
    transcript_frames.append(merging_data(transcript_df))
  except FileNotFoundError:
    # Not every number in the range has a file; a gap is reported and skipped.
    print(f"File {file_path} not found, skipping...")
  except Exception as e:
    # Best effort: report the problem and continue with the next transcript.
    print(f"An error occurred with file {file_path}: {e}")

# Concatenate once at the end. Growing a DataFrame with pd.concat inside the
# loop re-copies all accumulated rows on every iteration. pd.concat rejects an
# empty list, so fall back to an empty frame when nothing was loaded.
final_merged_df = (
    pd.concat(transcript_frames, ignore_index=True)
    if transcript_frames
    else pd.DataFrame()
)

print(final_merged_df.shape)
final_merged_df.head(10)

Successfully loaded /content/drive/MyDrive/Dataset_Segmentation/Transcripts/300_TRANSCRIPT.csv
Successfully loaded /content/drive/MyDrive/Dataset_Segmentation/Transcripts/301_TRANSCRIPT.csv
Successfully loaded /content/drive/MyDrive/Dataset_Segmentation/Transcripts/302_TRANSCRIPT.csv
Successfully loaded /content/drive/MyDrive/Dataset_Segmentation/Transcripts/303_TRANSCRIPT.csv
Successfully loaded /content/drive/MyDrive/Dataset_Segmentation/Transcripts/304_TRANSCRIPT.csv
Successfully loaded /content/drive/MyDrive/Dataset_Segmentation/Transcripts/305_TRANSCRIPT.csv
Successfully loaded /content/drive/MyDrive/Dataset_Segmentation/Transcripts/306_TRANSCRIPT.csv
Successfully loaded /content/drive/MyDrive/Dataset_Segmentation/Transcripts/307_TRANSCRIPT.csv
Successfully loaded /content/drive/MyDrive/Dataset_Segmentation/Transcripts/308_TRANSCRIPT.csv
Successfully loaded /content/drive/MyDrive/Dataset_Segmentation/Transcripts/309_TRANSCRIPT.csv
Successfully loaded /content/drive/MyDrive/Dataset

Unnamed: 0,utterance,speaker
0,hi i'm ellie thanks for coming in today i was ...,Therapist
1,good,Client
2,that's good where are you from originally,Therapist
3,atlanta georgia,Client
4,really why'd you move to l_a,Therapist
5,um my parents are from here um,Client
6,how do you like l_a,Therapist
7,i love it,Client
8,what are some things you really like about l_a,Therapist
9,i like the weather i like the opportunities um...,Client


#### Merging the Data

In [7]:
# Stack the two turn-level frames into one, renumbering the index from 0.
# NOTE(review): `final_segmentation_df` is rebuilt from its own previous value,
# so running this cell a second time appends `final_merged_df` again.
frames_to_stack = [final_segmentation_df, final_merged_df]
final_segmentation_df = pd.concat(frames_to_stack, ignore_index=True)
print(final_segmentation_df.shape)
final_segmentation_df.head(10)

(30192, 2)


Unnamed: 0,utterance,speaker
0,I told you.,Client
1,Told me what?,Therapist
2,That you'd be sorry you ever encouraged me to ...,Client
3,I'm not sorry at all.,Therapist
4,"You didn't expect it to be like this, I bet.",Client
5,Like what?,Therapist
6,You know what? It's disappointing. I thought I...,Client
7,And it isn't?,Therapist
8,"No, it's horrible. I don't know if I'm able to...",Client
9,Are you all right?,Therapist


In [8]:
# Write the combined turns to a Parquet file on Drive (pyarrow engine, without
# the index). The path uses the "MyDrive" spelling, matching every other Drive
# path in this notebook.
final_segmentation_df.to_parquet("/content/drive/MyDrive/Therapy_Session.parquet", engine="pyarrow", index=False)