# Relationship between FRIENDS characters analysis

Using the `friends_quotes` dataset, we will analyze the relationships between the characters in the show.

If we could break the quotes into conversations, and each conversation could involve multiple characters, then we could use association rule mining to find the relationships between the characters.

## How to define a conversation?

1. Separate the quotes by episode
2. Separate the quotes by greetings

## Packages

In [1]:
import pandas as pd
from collections import Counter


In [2]:
quotes = pd.read_csv("../data/friends_quotes.csv")
# change author and quote to lowercase
quotes['author'] = quotes['author'].str.lower()
quotes['quote'] = quotes['quote'].str.lower()
# remove all punctuation
quotes['quote'] = quotes['quote'].str.replace(r'[^\w\s]', '', regex=True)
quotes['author'] = quotes['author'].str.replace(r'[^\w\s]', '', regex=True)
# remove "and " from author
quotes['author'] = quotes['author'].str.replace(r'\band \b', '', regex=True)
# remove "the " from author
quotes['author'] = quotes['author'].str.replace(r'\bthe \b', '', regex=True)

quotes.head()

Unnamed: 0,author,episode_number,episode_title,quote,quote_order,season
0,monica,1.0,Monica Gets A Roommate,theres nothing to tell hes just some guy i wor...,0.0,1.0
1,joey,1.0,Monica Gets A Roommate,cmon youre going out with the guy theres gotta...,1.0,1.0
2,chandler,1.0,Monica Gets A Roommate,all right joey be nice so does he have a hump ...,2.0,1.0
3,phoebe,1.0,Monica Gets A Roommate,wait does he eat chalk,3.0,1.0
4,phoebe,1.0,Monica Gets A Roommate,just cause i dont want her to go through what ...,4.0,1.0


## Greetings

- If a greetings appears in the quote, then the conversation starts
- Next 5 quotes greetings are part of the same conversation
- Next greetings are part of the next conversation


In [3]:
greetings = [
    "hello",
    "hi",
    # "hey",
    # "greetings",
    # "what's up",
    "howdy",
    "yo",
    "sup",
    "morning",
    "good morning",
]

def is_greeting(quote: str) -> bool:
    """
    Check if the quote is a greeting
    :param quote: string
    :return: bool
    """
    words = quote.split()
    for greeting in greetings:
        if greeting in words:
            return True
    return False

quotes["has_greeting"] = quotes["quote"].apply(
    is_greeting
)

quotes.head()


Unnamed: 0,author,episode_number,episode_title,quote,quote_order,season,has_greeting
0,monica,1.0,Monica Gets A Roommate,theres nothing to tell hes just some guy i wor...,0.0,1.0,False
1,joey,1.0,Monica Gets A Roommate,cmon youre going out with the guy theres gotta...,1.0,1.0,False
2,chandler,1.0,Monica Gets A Roommate,all right joey be nice so does he have a hump ...,2.0,1.0,False
3,phoebe,1.0,Monica Gets A Roommate,wait does he eat chalk,3.0,1.0,False
4,phoebe,1.0,Monica Gets A Roommate,just cause i dont want her to go through what ...,4.0,1.0,False


## Separate the quotes by episode

In [4]:
# For each episode, separate the quotes by greetings
# the first conversation not necessarily starts with a greeting [first 10 quotes not greeting, the conversation starts with the first greeting]
# Then next conversation starts with the next greeting, record the quote_order, if next greeting is not in the next 5 quotes, then the conversation ends, the next conversation starts

def separate_conversations(quotes: pd.DataFrame) -> list[pd.DataFrame]:
    """
    Separate the quotes by greetings
    :param quotes: DataFrame
    :return: list of conversations
    """
    conversations = []
    conversation = []
    for i, row in quotes.iterrows():
        if row['has_greeting']:
            if len(conversation) > 5:
                conversations.append(conversation)
            conversation = [row]
        else:
            conversation.append(row)
    if len(conversation) > 5:
        conversations.append(conversation)

    conversations_dfs = []
    for i in conversations:
        conversation_df = pd.concat(i, axis=1).T
        conversations_dfs.append(conversation_df)
    return conversations_dfs


# loop through the episodes and separate the quotes by greetings
conversations_dfs = []
quotes_by_episode = {}
episodes = quotes['episode_title'].unique()
for episode in episodes:
    quotes_by_episode[episode] = quotes[quotes['episode_title'] == episode]
    conversations = separate_conversations(quotes_by_episode[episode])
    for conversation in conversations:
        conversation['episode_title'] = episode
        conversations_dfs.append(conversation)

print(len(conversations_dfs))

1410


## Get the characters in each conversation

In [5]:
## Get the characters in each conversation
def get_characters(conversation: pd.DataFrame) -> list[str]:
    """
    Get the characters in each conversation
    :param conversation: DataFrame
    :return: list of characters
    """
    characters = []
    for i, row in conversation.iterrows():
        characters.append(row['author'])
    return sorted(characters)

# get the characters in each conversation
conversations_characters = []
for conversation in conversations_dfs:
    characters = get_characters(conversation)
    conversations_characters.append(characters)

conversations_characters = pd.Series(conversations_characters)


## Get the characters in each conversation
def get_characters(conversation: pd.DataFrame) -> list[str]:
    """
    Get the characters in each conversation
    :param conversation: DataFrame
    :return: list of characters
    """
    characters = []
    for i, row in conversation.iterrows():
        # change author == "all" to monica joey chandler ross rachel phoebe
        if row['author'] == "all":
            characters.extend(["monica", "joey", "chandler", "ross", "rachel", "phoebe"])
        # if contains space
        elif " " in row['author']:
            # split the author by space
            authors = row['author'].split(" ")
            for author in authors:
                # remove space from author
                author = author.strip()
                if author not in characters:
                    characters.append(author)
        else:
            characters.append(row['author'])
    return sorted(set(characters))


# get the characters in each conversation
conversations_characters = []
for conversation in conversations_dfs:
    characters = get_characters(conversation)
    conversations_characters.append(characters)

conversations_characters = pd.Series(conversations_characters)
conversations_characters


0          [chandler, joey, monica, phoebe, rachel, ross]
1                  [chandler, joey, monica, phoebe, ross]
2       [chandler, joey, monica, paul, phoebe, rachel,...
3       [chandler, joey, monica, paul, phoebe, rachel,...
4       [chandler, joey, monica, on, paul, priest, rac...
                              ...                        
1405       [chandler, joey, monica, phoebe, rachel, ross]
1406    [1, agent, attendant, chandler, gate, joey, ma...
1407               [chandler, joey, monica, phoebe, ross]
1408    [1, 2, 3, air, attendant, chandler, gate, joey...
1409    [air, chandler, joey, monica, phoebe, rachel, ...
Length: 1410, dtype: object

## Apply Association Rule Mining

In [6]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

# Convert conversations_characters (list of lists) into a one-hot encoded DataFrame
te = TransactionEncoder()
te_ary = te.fit(conversations_characters).transform(conversations_characters)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)


### Perform Frequent Itemset Mining

In [7]:
# Find frequent itemsets with a minimum support threshold
frequent_itemsets = apriori(df_encoded, min_support=0.05, use_colnames=True)
frequent_itemsets.sort_values(by="support", ascending=False, inplace=True)
frequent_itemsets.head()


Unnamed: 0,support,itemsets
0,0.785816,(chandler)
5,0.785816,(rachel)
6,0.785816,(ross)
1,0.764539,(joey)
2,0.761702,(monica)


### Generate Association Rules

In [8]:
# Generate association rules with a minimum confidence threshold
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
rules.sort_values(by=["confidence", "lift"], ascending=False, inplace=True)
rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
542,"(monica, rachel, joey, phoebe, ross)",(chandler),0.473759,0.785816,0.458156,0.967066,1.230652,1.0,0.085869,6.503417,0.356154,0.571681,0.846235,0.775049
392,"(monica, phoebe, joey, ross)",(chandler),0.507092,0.785816,0.487943,0.962238,1.224508,1.0,0.089462,5.67192,0.371968,0.606167,0.823693,0.791588
545,"(rachel, joey, chandler, phoebe, ross)",(monica),0.476596,0.761702,0.458156,0.96131,1.262054,1.0,0.095132,6.159083,0.396713,0.587273,0.837638,0.7814
451,"(monica, phoebe, rachel, joey)",(chandler),0.503546,0.785816,0.483688,0.960563,1.222378,1.0,0.087994,5.431104,0.366443,0.600352,0.815875,0.788043
361,"(monica, rachel, joey, ross)",(chandler),0.509929,0.785816,0.488652,0.958275,1.219466,1.0,0.087942,5.133286,0.36723,0.605448,0.805193,0.790058


### Analyze Results

In [9]:
# Display the top 10 association rules
rules[["antecedents", "consequents", "support", "confidence", "lift"]].head(10)


Unnamed: 0,antecedents,consequents,support,confidence,lift
542,"(monica, rachel, joey, phoebe, ross)",(chandler),0.458156,0.967066,1.230652
392,"(monica, phoebe, joey, ross)",(chandler),0.487943,0.962238,1.224508
545,"(rachel, joey, chandler, phoebe, ross)",(monica),0.458156,0.96131,1.262054
451,"(monica, phoebe, rachel, joey)",(chandler),0.483688,0.960563,1.222378
361,"(monica, rachel, joey, ross)",(chandler),0.488652,0.958275,1.219466
454,"(phoebe, rachel, joey, chandler)",(monica),0.483688,0.957865,1.257532
424,"(phoebe, rachel, ross, chandler)",(monica),0.484397,0.956583,1.255849
150,"(monica, joey, ross)",(chandler),0.533333,0.955527,1.215969
178,"(monica, phoebe, joey)",(chandler),0.529078,0.952746,1.212429
422,"(monica, phoebe, rachel, ross)",(chandler),0.484397,0.951253,1.21053


### Save Results

In [10]:
# Save the frequent itemsets and rules to CSV files for further analysis
import os
if not os.path.exists("../results"):
    os.makedirs("../results")
frequent_itemsets.to_csv("../results/frequent_itemsets.csv", index=False)
rules.to_csv("../results/association_rules.csv", index=False)



In [11]:
results = rules[["antecedents", "consequents", "support", "confidence", "lift"]].copy().reset_index(drop=True)
# select only one character from antecedents
main_characters = [
    "monica",
    "joey",
    "chandler",
    "ross",
    "rachel",
    "phoebe"
]


# select antecedent has only one character and is joey
results[results["antecedents"].apply(lambda x: len(x) == 1 and "joey" in x)].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
157,(joey),(chandler),0.669504,0.875696,1.114378
233,(joey),(ross),0.641844,0.839518,1.068339
254,(joey),(rachel),0.63617,0.832096,1.058895
268,(joey),(monica),0.631915,0.826531,1.08511
317,(joey),(phoebe),0.615603,0.805195,1.106554
367,(joey),"(monica, chandler)",0.595035,0.778293,1.14431
390,(joey),"(ross, chandler)",0.587234,0.768089,1.160778
432,(joey),"(rachel, chandler)",0.569504,0.744898,1.159278
442,(joey),"(rachel, ross)",0.565248,0.739332,1.114929
444,(joey),"(phoebe, chandler)",0.564539,0.738404,1.176441


In [12]:
results[results["antecedents"].apply(lambda x: len(x) == 1 and "ross" in x)].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
222,(ross),(rachel),0.663121,0.843863,1.073869
229,(ross),(chandler),0.661702,0.842058,1.071572
299,(ross),(joey),0.641844,0.816787,1.068339
314,(ross),(monica),0.634043,0.806859,1.059284
363,(ross),(phoebe),0.613475,0.780686,1.072872
428,(ross),"(joey, chandler)",0.587234,0.747292,1.116189
429,(ross),"(monica, chandler)",0.587234,0.747292,1.09873
450,(ross),"(rachel, chandler)",0.578014,0.73556,1.144745
474,(ross),"(monica, rachel)",0.567376,0.722022,1.122437
477,(ross),"(rachel, joey)",0.565248,0.719314,1.130694


In [13]:
results[results["antecedents"].apply(lambda x: len(x) == 1 and "chandler" in x)].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
183,(chandler),(monica),0.680142,0.865523,1.136302
205,(chandler),(joey),0.669504,0.851986,1.114378
230,(chandler),(ross),0.661702,0.842058,1.071572
297,(chandler),(rachel),0.642553,0.81769,1.040562
329,(chandler),(phoebe),0.62766,0.798736,1.097679
413,(chandler),"(monica, joey)",0.595035,0.75722,1.198295
426,(chandler),"(monica, ross)",0.587234,0.747292,1.178616
427,(chandler),"(joey, ross)",0.587234,0.747292,1.16429
440,(chandler),"(monica, rachel)",0.58156,0.740072,1.150498
449,(chandler),"(monica, phoebe)",0.578723,0.736462,1.168067


In [14]:
results[results["antecedents"].apply(lambda x: len(x) == 1 and "monica" in x)].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
114,(monica),(chandler),0.680142,0.892924,1.136302
218,(monica),(rachel),0.643262,0.844507,1.074688
251,(monica),(ross),0.634043,0.832402,1.059284
261,(monica),(joey),0.631915,0.829609,1.08511
266,(monica),(phoebe),0.630496,0.827747,1.137547
362,(monica),"(joey, chandler)",0.595035,0.781192,1.166823
381,(monica),"(ross, chandler)",0.587234,0.77095,1.165101
402,(monica),"(rachel, chandler)",0.58156,0.763501,1.18823
410,(monica),"(phoebe, chandler)",0.578723,0.759777,1.210491
433,(monica),"(rachel, ross)",0.567376,0.744879,1.123293


In [15]:
results[results["antecedents"].apply(lambda x: len(x) == 1 and "rachel" in x)].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
221,(rachel),(ross),0.663121,0.843863,1.073869
292,(rachel),(monica),0.643262,0.818592,1.074688
296,(rachel),(chandler),0.642553,0.81769,1.040562
308,(rachel),(joey),0.63617,0.809567,1.058895
355,(rachel),(phoebe),0.617021,0.785199,1.079074
441,(rachel),"(monica, chandler)",0.58156,0.740072,1.088114
451,(rachel),"(ross, chandler)",0.578014,0.73556,1.111617
470,(rachel),"(joey, chandler)",0.569504,0.724729,1.082488
473,(rachel),"(monica, ross)",0.567376,0.722022,1.138759
478,(rachel),"(joey, ross)",0.565248,0.719314,1.120699


In [16]:
results[results["antecedents"].apply(lambda x: len(x) == 1 and "phoebe" in x)].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
180,(phoebe),(monica),0.630496,0.866472,1.137547
189,(phoebe),(chandler),0.62766,0.862573,1.097679
214,(phoebe),(rachel),0.617021,0.847953,1.079074
215,(phoebe),(joey),0.615603,0.846004,1.106554
226,(phoebe),(ross),0.613475,0.84308,1.072872
334,(phoebe),"(monica, chandler)",0.578723,0.795322,1.169347
372,(phoebe),"(joey, chandler)",0.564539,0.775828,1.158812
399,(phoebe),"(monica, rachel)",0.556028,0.764133,1.187902
403,(phoebe),"(monica, joey)",0.555319,0.763158,1.207691
404,(phoebe),"(ross, chandler)",0.555319,0.763158,1.153325


In [17]:
results[results["antecedents"].apply(lambda x: len(x) == 2 and "monica" in x and "chandler" in x)].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
160,"(monica, chandler)",(joey),0.595035,0.87487,1.14431
188,"(monica, chandler)",(ross),0.587234,0.863399,1.09873
198,"(monica, chandler)",(rachel),0.58156,0.855057,1.088114
208,"(monica, chandler)",(phoebe),0.578723,0.850886,1.169347
357,"(monica, chandler)","(joey, ross)",0.533333,0.78415,1.221715
366,"(monica, chandler)","(rachel, ross)",0.529787,0.778936,1.174653
368,"(monica, chandler)","(phoebe, joey)",0.529078,0.777894,1.263629
373,"(monica, chandler)","(rachel, joey)",0.52695,0.774765,1.217859
394,"(monica, chandler)","(phoebe, ross)",0.521986,0.767466,1.251014
398,"(monica, chandler)","(phoebe, rachel)",0.519858,0.764338,1.238754


In [18]:
results[results["antecedents"].apply(lambda x: len(x) == 2 and "rachel" in x and "ross" in x)].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
169,"(rachel, ross)",(chandler),0.578014,0.871658,1.10924
197,"(rachel, ross)",(monica),0.567376,0.855615,1.123293
203,"(rachel, ross)",(joey),0.565248,0.852406,1.114929
262,"(rachel, ross)",(phoebe),0.549645,0.828877,1.1391
328,"(rachel, ross)","(monica, chandler)",0.529787,0.79893,1.174653
344,"(rachel, ross)","(joey, chandler)",0.524823,0.791444,1.182135
387,"(rachel, ross)","(monica, joey)",0.509929,0.768984,1.216911
392,"(rachel, ross)","(monica, phoebe)",0.50922,0.767914,1.217952
401,"(rachel, ross)","(phoebe, chandler)",0.506383,0.763636,1.216641
414,"(rachel, ross)","(phoebe, joey)",0.502128,0.757219,1.230045


In [19]:
results[results["antecedents"].apply(lambda x: len(x) == 2 and "phoebe" in x and "joey" in x)].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
59,"(phoebe, joey)",(chandler),0.564539,0.917051,1.167005
97,"(phoebe, joey)",(monica),0.555319,0.902074,1.184287
126,"(phoebe, joey)",(ross),0.546809,0.888249,1.130353
144,"(phoebe, joey)",(rachel),0.541844,0.880184,1.12009
192,"(phoebe, joey)","(monica, chandler)",0.529078,0.859447,1.263629
246,"(phoebe, joey)","(ross, chandler)",0.514184,0.835253,1.26228
280,"(phoebe, joey)","(monica, ross)",0.507092,0.823733,1.299176
287,"(phoebe, joey)","(rachel, chandler)",0.504965,0.820276,1.276589
295,"(phoebe, joey)","(monica, rachel)",0.503546,0.817972,1.2716
302,"(phoebe, joey)","(rachel, ross)",0.502128,0.815668,1.230045
