<h3>Data preparation</h3>

In [2]:
from bs4 import BeautifulSoup
import requests

In [42]:
# URL of the Friends episode transcript
url = "https://fangj.github.io/friends/season/0101.html"

# Send an HTTP GET request to fetch the webpage content.
# The timeout keeps this cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)

# Fail loudly on a 4xx/5xx response; otherwise an error page would be parsed
# below and silently yield empty or misleading interaction counts.
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

In [43]:
from collections import defaultdict

# A defaultdict is used to store interaction counts; the default value is 0 for missing keys.
# Keys are directed pairs: (previous speaker, current speaker).
interactions = defaultdict(int)

# Bold labels treated as special cases: they are never counted as either side
# of an interaction.
NON_CHARACTER_LABELS = {"All", "Closing Credits", "Commercial Break"}

previous_speaker = None
speaker = None

# Function to split a string with multiple speakers (e.g. "Monica and Rachel")
def split_speaker(speaker):
    """Split a joint speaker label into the individual names.

    Args:
        speaker: Label text such as "Monica and Rachel".

    Returns:
        A list with one whitespace-stripped entry per " and "-separated part.
    """
    return [s.strip() for s in speaker.split(" and ")]

# Iterate over all <p> tags
for p in soup.find_all("p"):
    # Look for a bold tag to identify the speaker
    bold_tag = p.find("b")
    if bold_tag:
        speaker = bold_tag.get_text(strip=True).replace(":", "")
        # Split the speaker string if there's more than one, dropping blank
        # names (e.g. an empty <b></b> or a bold ":"), which would otherwise
        # be counted as a character called "" and break the speaker chain.
        speakers = [name for name in split_speaker(speaker) if name]
    else:
        speakers = []

    # No usable speaker in this paragraph: leave previous_speaker unchanged
    if not speakers:
        continue

    if previous_speaker is not None and previous_speaker not in NON_CHARACTER_LABELS:
        for s in speakers:
            # Skip self-interactions and special-case labels
            if s != previous_speaker and s not in NON_CHARACTER_LABELS:
                interactions[(previous_speaker, s)] += 1

    # When several speakers share a line, the last one listed becomes the
    # reference for the next line.
    previous_speaker = speakers[-1]

interactions

defaultdict(int,
            {('Monica', 'Joey'): 9,
             ('Joey', 'Chandler'): 12,
             ('Chandler', 'Phoebe'): 1,
             ('Phoebe', 'Monica'): 7,
             ('Monica', 'Chandler'): 4,
             ('Chandler', 'Joey'): 12,
             ('Joey', 'Phoebe'): 3,
             ('Phoebe', 'Chandler'): 2,
             ('Chandler', 'Monica'): 7,
             ('Chandler', 'Ross'): 6,
             ('Ross', 'Joey'): 9,
             ('Joey', 'Monica'): 9,
             ('Monica', 'Ross'): 14,
             ('Ross', 'Chandler'): 7,
             ('Ross', 'Phoebe'): 4,
             ('Phoebe', 'Ross'): 4,
             ('Ross', 'Monica'): 10,
             ('Joey', 'Ross'): 7,
             ('Monica', 'Rachel'): 20,
             ('Rachel', 'Monica'): 18,
             ('Rachel', 'Ross'): 16,
             ('Ross', 'Rachel'): 13,
             ('Rachel', 'Phoebe'): 4,
             ('Joey', 'Rachel'): 5,
             ('Monica', 'Phoebe'): 5,
             ('Phoebe', 'Rachel'): 2,
       

In [45]:
# New dictionary of interaction counts in which speaker pairs are unoriented:
# (A, B) and (B, A) are accumulated under a single key.
unoriented_interactions = defaultdict(int)

for pair, n_exchanges in interactions.items():
    # Sorting the two names gives every pair one canonical key, whatever the direction
    first, second = sorted(pair)
    unoriented_interactions[(first, second)] += n_exchanges

unoriented_interactions

defaultdict(int,
            {('Joey', 'Monica'): 18,
             ('Chandler', 'Joey'): 24,
             ('Chandler', 'Phoebe'): 3,
             ('Monica', 'Phoebe'): 12,
             ('Chandler', 'Monica'): 11,
             ('Joey', 'Phoebe'): 4,
             ('Chandler', 'Ross'): 13,
             ('Joey', 'Ross'): 16,
             ('Monica', 'Ross'): 24,
             ('Phoebe', 'Ross'): 8,
             ('Monica', 'Rachel'): 38,
             ('Rachel', 'Ross'): 29,
             ('Phoebe', 'Rachel'): 6,
             ('Joey', 'Rachel'): 6,
             ('Chandler', 'Paul'): 3,
             ('Monica', 'Paul'): 26,
             ('Joey', 'Paul'): 4,
             ('Chandler', 'Rachel'): 13,
             ('Frannie', 'Rachel'): 1,
             ('Frannie', 'Monica'): 8,
             ('Frannie', 'Joey'): 1})

In [46]:
# Write the unoriented interaction counts to a CSV file
import csv

with open("interactions.csv", "w", newline="", encoding="utf-8") as csv_file:
    csv_out = csv.writer(csv_file)
    # Header
    csv_out.writerow(["Character1", "Character2", "InteractionCount"])
    # One row per character pair, in the dictionary's insertion order
    csv_out.writerows(
        [name_a, name_b, total]
        for (name_a, name_b), total in unoriented_interactions.items()
    )