In [201]:
from docx import Document 
import re
import pandas as pd

import os

In [202]:
"""
Chris Kim
Updated Nov 10, 2024
"""

test_doc = Document("doc_files_raw/Interview 01 Transcript.docx")
test_text = [para.text for para in test_doc.paragraphs if para.text.strip()]

test_text

['Interview Transcript ',
 'Participant 1 ',
 'Interviewer: Okay, so I wanted to begin with just some basic demographic information background. Can you just give me a little summary of who you are and your education and employment history?',
 "Participant 1: Okay I'm (Participant 1). Originally I'm from Japan, I came to United States 1991 or so and as an exchange student um at the college. And I graduated four year college in computer information systems and I moved to Georgia to be a part of Olympic game, but before that I uh I worked and then so after the Olympic Game, I continue to work at the same company as a full time there. Uh, uh but meanwhile I got the green card, so I moved uh I got a more professional job at Macy's so I moved to Macy's 1999 and since then I'm with Macy's um over 23 years.",
 'Interviewer: wow yeah. Wait, were you in the Olympics?',
 "Participant 1: Yes, no, no, no I'm part of the Olympic game. And my kids um my childhood dream was to be an Olympian. So and t

In [203]:
dialogues = []
for line in test_text:
    if line.startswith("Interviewer:"):
        dialogues.append(("Interviewer",line.replace("Interviewer:","").strip()))
    elif re.match(r"Participant \d+:", line):
        participant_label = re.match(r"(Participant \d+):", line).group(1)
        dialogues.append((participant_label, line.replace(f"{participant_label}:", "").strip()))
    else:
        dialogues.append(("Unknown",line.strip()))

dialogues

[('Unknown', 'Interview Transcript'),
 ('Unknown', 'Participant 1'),
 ('Interviewer',
  'Okay, so I wanted to begin with just some basic demographic information background. Can you just give me a little summary of who you are and your education and employment history?'),
 ('Participant 1',
  "Okay I'm (Participant 1). Originally I'm from Japan, I came to United States 1991 or so and as an exchange student um at the college. And I graduated four year college in computer information systems and I moved to Georgia to be a part of Olympic game, but before that I uh I worked and then so after the Olympic Game, I continue to work at the same company as a full time there. Uh, uh but meanwhile I got the green card, so I moved uh I got a more professional job at Macy's so I moved to Macy's 1999 and since then I'm with Macy's um over 23 years."),
 ('Interviewer', 'wow yeah. Wait, were you in the Olympics?'),
 ('Participant 1',
  "Yes, no, no, no I'm part of the Olympic game. And my kids um my ch

In [204]:
test_df = pd.DataFrame(dialogues, columns = ["Speaker","Dialogue"])
test_df.head

<bound method NDFrame.head of            Speaker                                           Dialogue
0          Unknown                               Interview Transcript
1          Unknown                                      Participant 1
2      Interviewer  Okay, so I wanted to begin with just some basi...
3    Participant 1  Okay I'm (Participant 1). Originally I'm from ...
4      Interviewer          wow yeah. Wait, were you in the Olympics?
..             ...                                                ...
145    Interviewer  Um should I email you, I mean I guess I alread...
146  Participant 1  yeah, yeah. [name] send me a flyer so I will f...
147    Interviewer            thank you so much that would be amazing
148  Participant 1                              you’re welcome! Okay.
149    Interviewer  Um oh real quick, uh do you have venmo or paypal?

[150 rows x 2 columns]>

In [205]:
'''Chris Kim
Updated Nov 23, 2024 
'''

# create function that separates these dfs


# Updated Jan 04, 2025 
def sep_speaker(input_path):
    doc = Document(input_path)
    text = [para.text for para in doc.paragraphs if para.text.strip()]

    dialog = []
    current_speaker = None # used to track speaker to ensure that multiple paragraphs of one speaker is accurately captured. 
    current_dialogue = [] 

    for line in text:
        # Check for a new speaker
        if line.startswith("Interviewer:"):
            if current_speaker:  
                dialog.append((current_speaker, " ".join(current_dialogue)))
            current_speaker = "Interviewer"
            current_dialogue = [line.replace("Interviewer:", "").strip()]
        elif re.match(r"Participant \d+:", line):
            if current_speaker:  
                dialog.append((current_speaker, " ".join(current_dialogue)))
            current_speaker = re.match(r"(Participant \d+):", line).group(1)
            current_dialogue = [line.replace(f"{current_speaker}:", "").strip()]
        #elif re.match(r"^Participant.*?:",line):
        #    if current_speaker:  
        #        dialog.append((current_speaker, " ".join(current_dialogue)))
        #    current_speaker = re.match(r"^Participant.*?:", line).group(1)
        #    current_dialogue = [line.replace(f"{current_speaker}:", "").strip()]
        else:
            
            if current_speaker:
                current_dialogue.append(line.strip())
            else:
                # If no speaker is identified, treat it as "Unknown"
                current_speaker = "Unknown"
                current_dialogue.append(line.strip())

    # Append the last speaker's dialogue
    if current_speaker and current_dialogue:
        dialog.append((current_speaker, " ".join(current_dialogue)))

    # Create a DataFrame from the dialog list
    df = pd.DataFrame(dialog, columns=["Speaker", "Dialogue"])
    return df



In [206]:


test_df1 = sep_speaker("doc_files_raw/Interview 01 Transcript.docx")
test_df1.head # code runs as expected. the sep_speaker function gives a label for the speaker of every paragraph. 



<bound method NDFrame.head of            Speaker                                           Dialogue
0          Unknown                 Interview Transcript Participant 1
1      Interviewer  Okay, so I wanted to begin with just some basi...
2    Participant 1  Okay I'm (Participant 1). Originally I'm from ...
3      Interviewer          wow yeah. Wait, were you in the Olympics?
4    Participant 1  Yes, no, no, no I'm part of the Olympic game. ...
..             ...                                                ...
132    Interviewer  Um should I email you, I mean I guess I alread...
133  Participant 1  yeah, yeah. [name] send me a flyer so I will f...
134    Interviewer            thank you so much that would be amazing
135  Participant 1                              you’re welcome! Okay.
136    Interviewer  Um oh real quick, uh do you have venmo or paypal?

[137 rows x 2 columns]>

In [207]:
test_df6 = sep_speaker('doc_files_raw/Interview 06 Transcript.docx')
test_df6.head

<bound method NDFrame.head of         Speaker                                           Dialogue
0       Unknown                 Interview Transcript Participant 6
1   Interviewer  So, to start with, I just want you to give lik...
2   Interviewer  Oh, wow okay. That's so many things, um yeah, ...
3   Interviewer  Six years ago, okay that's when you moved back...
4   Interviewer  Sure, yeah I can imagine that. In your day-to-...
5   Interviewer  And those groups of people are typically clien...
6   Interviewer  So in that case, are you interacting with like...
7   Interviewer  Sure, so with these coworkers and with the peo...
8   Interviewer  Sure yeah, and in those cases where you guys a...
9   Interviewer  Sure yeah yeah. Do you ever vent with any of y...
10  Interviewer  So how, how is it that you ended up, I mean ha...
11  Interviewer  Sure yeah that makes a lot of sense. You menti...
12  Interviewer  Got it and on like a day-to-day basis how many...
13  Interviewer  Got it, I see t

In [208]:
# debugging

unknown_count1 = test_df1[test_df1['Speaker'] == 'Unknown']
print(unknown_count1)

# when the speaker speaks for multiple paragraphs, the code does not pick it up. Hence excess of unknown 

   Speaker                            Dialogue
0  Unknown  Interview Transcript Participant 1


In [209]:
test_df4 = sep_speaker("doc_files_raw/Interview 04 Transcript.docx") # randomly chose a different doc to test. 
test_df4.head



<bound method NDFrame.head of           Speaker                                           Dialogue
0         Unknown                 Interview Transcript Participant 4
1     Interviewer  Okay, so I want to begin with just asking you ...
2   Participant 4  Okay yeah so just to begin with, where I am fr...
3     Interviewer  Awesome, yeah so you work um okay, so I don't ...
4   Participant 4  right, uh I mean now everybody's remote, so we...
..            ...                                                ...
89    Interviewer                                  Full time. I see.
90  Participant 4  Yeah so been contractor, then I left and then ...
91    Interviewer                 And can I ask you how old you are.
92  Participant 4                          Sure I’m 41 golden years.
93    Interviewer  Perfect okay. Um well that's pretty much all o...

[94 rows x 2 columns]>

In [210]:
# same issue found on interview 4.
print(test_df4[test_df4['Speaker'] == 'Unknown'])

   Speaker                            Dialogue
0  Unknown  Interview Transcript Participant 4


In [211]:
# rewriting the initial for loop to keep track of speaker. 

def tag_speaker(input_path):
    
    
    doc = Document(input_path)

    text = [para.text for para in doc.paragraphs if para.text.strip()]
    
    dialog = []
    current_speaker = "Unknown"

    for line in text:
        if line.startswith("Interviewer:"):
            current_speaker = "Interviewer"
            dialog.append(("Interviewer",line.replace("Interviewer:","").strip()))
        elif line.startswith("Allison S:"):
            current_speaker = "Interviewer"
            dialog.append(("Interviewer",line.replace("Allison S:","").strip()))    
        elif re.match(r"Participant \d+:", line): 
            current_speaker = "Participant"
            participant_label = re.match(r"(Participant \d+):", line).group(1)
            dialog.append(("Participant", line.replace(f"{participant_label}:", "").strip()))
        elif line.startswith("Participant:"):
            current_speaker = "Participant"
            dialog.append(("Participant", line.replace("Participant: ","").strip()))
        elif re.match(r"Participant( \d+|\s*\(.*?\))?:", line): 
            current_speaker = "Participant"
            participant_label = re.match(r"(Participant( \d+|\s*\(.*?\))?):", line).group(1)
            dialog.append(("Participant", line.replace(f"{participant_label}:", "").strip()))
    
        else:
            dialog.append((current_speaker,line.strip()))
    df = pd.DataFrame(dialog, columns = ["Speaker","Dialogue"])
    return df

''' OLD CODE
def sep_speaker(input_path):
    doc = Document(input_path)
    text = [para.text for para in doc.paragraphs if para.text.strip()]
    
    dialog = []
    current = "Unknown"
    for line in text:
        if line.startswith("Interviewer:"):
            
            dialog.append(("Interviewer",line.replace("Interviewer:","").strip()))
            current = 'Interviewer'
        elif re.match(r"Participant \d+:", line):
            participant_label = re.match(r"(Participant \d+):", line).group(1)
            dialog.append((participant_label, line.replace(f"{participant_label}:", "").strip()))
            current = 'Participant'
        else:
            if current == 'Participant':
                
                dialog.append((participant_label))
            elif current == 'Interviewer':
                dialog.append(("Interviewer"))
            else:
                dialog.append(("Unknown",line.strip()))
    df = pd.DataFrame(dialog, columns = ["Speaker","Dialogue"])
    return df;

'''

'''def tag_speaker(input_path):
    doc = Document(input_path)

    text = [para.text for para in doc.paragraphs if para.text.strip()]
    
    dialog = []
    current_speaker = "Unknown"

    for line in text:
        if line.startswith("Interviewer:"):
            current_speaker = "Interviewer"
            dialog.append(("Interviewer",line.replace("Interviewer:","").strip()))
        elif line.startswith("Allison S:"):
            current_speaker = "Interviewer"
            dialog.append(("Interviewer",line.replace("Allison S:","").strip()))    
        elif re.match(r"Participant \d+:", line): 
            current_speaker = "Participant"
            participant_label = re.match(r"(Participant \d+):", line).group(1)
            dialog.append(("Participant", line.replace(f"{participant_label}:", "").strip()))
        elif line.startswith("Participant:"):
            current_speaker = "Participant"
            dialog.append(("Participant", line.replace("Participant: ","").strip()))
        else:
            dialog.append((current_speaker,line.strip()))
    df = pd.DataFrame(dialog, columns = ["Speaker","Dialogue"])
    return df'''

'def tag_speaker(input_path):\n    doc = Document(input_path)\n\n    text = [para.text for para in doc.paragraphs if para.text.strip()]\n    \n    dialog = []\n    current_speaker = "Unknown"\n\n    for line in text:\n        if line.startswith("Interviewer:"):\n            current_speaker = "Interviewer"\n            dialog.append(("Interviewer",line.replace("Interviewer:","").strip()))\n        elif line.startswith("Allison S:"):\n            current_speaker = "Interviewer"\n            dialog.append(("Interviewer",line.replace("Allison S:","").strip()))    \n        elif re.match(r"Participant \\d+:", line): \n            current_speaker = "Participant"\n            participant_label = re.match(r"(Participant \\d+):", line).group(1)\n            dialog.append(("Participant", line.replace(f"{participant_label}:", "").strip()))\n        elif line.startswith("Participant:"):\n            current_speaker = "Participant"\n            dialog.append(("Participant", line.replace("Participan

In [214]:
# re- test if it works.

tag_1 = tag_speaker("doc_files_raw/Interview 01 Transcript.docx")
tag_1.head

#print(tag_1[tag_1['Speaker'] == 'Unknown'])


<bound method NDFrame.head of          Speaker                                           Dialogue
0        Unknown                               Interview Transcript
1        Unknown                                      Participant 1
2    Interviewer  Okay, so I wanted to begin with just some basi...
3    Participant  Okay I'm (Participant 1). Originally I'm from ...
4    Interviewer          wow yeah. Wait, were you in the Olympics?
..           ...                                                ...
145  Interviewer  Um should I email you, I mean I guess I alread...
146  Participant  yeah, yeah. [name] send me a flyer so I will f...
147  Interviewer            thank you so much that would be amazing
148  Participant                              you’re welcome! Okay.
149  Interviewer  Um oh real quick, uh do you have venmo or paypal?

[150 rows x 2 columns]>

In [215]:
tag_6 = tag_speaker("/Users/grealish/Documents/GitHub/interview_corpus/doc_files_raw/Interview 06 Transcript.docx")
tag_6.head


<bound method NDFrame.head of         Speaker                                           Dialogue
0       Unknown                               Interview Transcript
1       Unknown                                      Participant 6
2   Interviewer  So, to start with, I just want you to give lik...
3   Participant  I was born in Santa Barbara California. Mid 80...
4   Interviewer  Oh, wow okay. That's so many things, um yeah, ...
..          ...                                                ...
71  Participant  I knew it was time limited. So, knowing that t...
72  Interviewer  overall do you like your job, and if you have ...
73  Participant  What I do, and why I do it? Ten. The company e...
74  Interviewer  Is there anything else about your workplace ha...
75  Participant  So one thing that's come up in my company a lo...

[76 rows x 2 columns]>

In [216]:
# export tagged df to into csv files. 
input_folder = "doc_files_raw"
output_folder = "df_tagged"

for file_name in os.listdir(input_folder):
    if file_name.endswith(".docx"):
        input_path = os.path.join(input_folder,file_name)
        cleaned_df = tag_speaker(input_path)

        output_file_name = file_name.replace(".docx"," tagged.csv")
        output_path = os.path.join(output_folder, output_file_name)
        cleaned_df.to_csv(output_path,index = False)



In [None]:
# Clean interview guide:
guide_doc = Document("Interview Guide.docx")