In [81]:
import pandas
import os
import glob
import re
from py_linq import Enumerable
# Tell rpy2 where the local R installation lives; this is set before the rpy2
# imports below. A raw string is used because "\P" and "\R" are not valid
# escape sequences in an ordinary string literal.
# NOTE(review): machine-specific absolute path — consider reading it from configuration.
os.environ["R_HOME"] = r"C:\Program Files\R\R-4.3.1"

from rpy2.robjects.packages import importr
import rpy2

In [89]:
# Handles to R's base and utils packages (used below to load data and manage packages).
base = importr("base")
utils = importr("utils")

# install required dependencies
dependencies = ["lsa", "LSAfun"]
# installed.packages() returns a matrix whose row names are the package names.
# Query it once and compare against those names only, instead of re-running the
# R call per dependency and matching against every cell of the matrix.
installed_package_names = set(utils.installed_packages().rownames)
new_packages = [dependency for dependency in dependencies if dependency not in installed_package_names]
for new_package in new_packages:
    print(f"installing required dependency {new_package}")
    # The mirror has to be passed by keyword: the second positional parameter of
    # R's install.packages() is `lib` (the target library directory), not `repos`.
    utils.install_packages(new_package, repos="http://cran.us.r-project.org")

# Import the R packages the analysis relies on (available once installed above).
lsafun = importr("LSAfun")
lsa = importr("lsa")

In [62]:
# Read every conversation CSV under convos/ into a single frame. The paths are
# sorted so rows are concatenated in the same order no matter how the OS lists
# the directory (row order determines the order of the aggregated text later on).
file_names = sorted(glob.glob("convos/*.csv"))
if not file_names:
    # pandas.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"no conversation CSV files found matching convos/*.csv in {os.getcwd()}")
convo_data_frames = [pandas.read_csv(path) for path in file_names]
input_data = pandas.concat(convo_data_frames, ignore_index=True)

# Build one text entry per participant and conversation. The date number is part
# of the grouping key, so the same pair conversing on different days currently
# produces separate entries.
def _conversation_key(row):
    """Grouping key for a row: (participant, F.ID, M.ID, Date.Number)."""
    return (row["ID"], row["F.ID"], row["M.ID"], row["Date.Number"])


def _joined_events(group):
    """Fold the group's Event values into one newline-separated string."""
    return group.select(lambda row: row["Event"]).aggregate(lambda a, b: f"{a}\n{b}")


input_rows = [row for _, row in input_data.iterrows()]
grouped_rows = Enumerable(input_rows).group_by(
    key_names=["participant_id", "f_id", "m_id", "date.number"],
    key=_conversation_key,
)
# Each entry keeps the group key under "ID" and the joined text under "Text".
aggregated_text_by_participant = grouped_rows.select(
    lambda group: {"ID": group.key, "Text": _joined_events(group)}
).to_list()
    
# LSA requires text *files* rather than in-memory strings, so each aggregated
# entry is written to disk.
# NOTE(review): the file name uses only participant_id, so a participant with
# several entries (different partner or date number) overwrites the same file —
# confirm whether that is intended.
for entry in aggregated_text_by_participant:
    print(entry)
    # The context manager closes the file even if write() raises.
    with open(f"participant_text_for_{entry['ID'].participant_id}.txt", "w") as participant_text_file:
        participant_text_file.write(entry["Text"])

{'ID': {'participant_id': 'Juliet', 'f_id': 'Juliet', 'm_id': 'Romeo', 'date.number': 1}, 'Text': "What man art thou that, thus bescreened in night,\nSo stumblest on my counsel?\nMy ears have not yet drunk a hundred words\nOf that tongue's uttering, yet I know the sound.\nArt thou not Romeo and a Montague?\nHow camest thou hither, tell me, and wherefore?\nThe orchard walls are high and hard to climb,\nAnd the place death, considering who thou art,\nIf any of my kinsmen find thee here."}
{'ID': {'participant_id': 'Romeo', 'f_id': 'Juliet', 'm_id': 'Romeo', 'date.number': 1}, 'Text': "I take thee at thy word.\nCall me but love, and I'll be new baptized;\nHenceforth I never will be Romeo.\nBy a name\nI know not how to tell thee who I am.\nMy name, dear saint, is hateful to myself,\nBecause it is an enemy to thee.\nHad I it written, I would tear the word.\nNeither, fair maid, if either thee dislike."}


In [63]:
# Distinct (f_id, m_id) pairs of participants who conversed, one dict per pair.
# The result is left as an Enumerable (no .to_list() call).
def _pair_key(row):
    """Grouping key for a row: (F.ID, M.ID)."""
    return (row["F.ID"], row["M.ID"])


convo_participant_pairs = Enumerable(
    [row for _, row in input_data.iterrows()]
).group_by(
    key_names=["f_id", "m_id"],
    key=_pair_key,
).select(
    lambda group: {"f_id": group.key.f_id, "m_id": group.key.m_id}
)

In [82]:
# Load TASA.rda through R; the object named "TASA" is then read back from R's
# global environment and passed to costring as the term-vector space.
base.load("TASA.rda")
tasa_vectors = rpy2.robjects.globalenv["TASA"]  # looked up once instead of once per pair


def _first_text_for(participant_id, f_id, m_id):
    """Return the first aggregated text of `participant_id` for the (f_id, m_id) pair.

    Only the first matching entry is used, so additional entries for the same
    pair (e.g. other date numbers) are ignored. Returns None when the
    participant has no entry for that pair.
    """
    return next(
        (
            entry["Text"]
            for entry in aggregated_text_by_participant
            if entry["ID"].participant_id == participant_id
            and entry["ID"].f_id == f_id
            and entry["ID"].m_id == m_id
        ),
        None,
    )


for pair in convo_participant_pairs:
    f_id = pair["f_id"]
    m_id = pair["m_id"]

    # get first matching aggregated conversation for each participant
    f_text = _first_text_for(f_id, f_id, m_id)
    m_text = _first_text_for(m_id, f_id, m_id)
    if f_text is None or m_text is None:
        # One side has no text: report it and move on rather than aborting
        # the remaining pairs.
        print(f"Skipping {f_id} & {m_id}: no text found for one of the participants")
        continue

    # replace any whitespace sequences with a single space
    m_text = re.sub(r"\s+", " ", m_text)
    f_text = re.sub(r"\s+", " ", f_text)

    # get cosine similarity between participants' text
    result = lsafun.costring(f_text, m_text, tvectors=tasa_vectors)
    print(f"Result for {f_id} & {m_id}: {result}")

f_text="What man art thou that, thus bescreened in night, So stumblest on my counsel? My ears have not yet drunk a hundred words Of that tongue's uttering, yet I know the sound. Art thou not Romeo and a Montague? How camest thou hither, tell me, and wherefore? The orchard walls are high and hard to climb, And the place death, considering who thou art, If any of my kinsmen find thee here."
m_text="I take thee at thy word. Call me but love, and I'll be new baptized; Henceforth I never will be Romeo. By a name I know not how to tell thee who I am. My name, dear saint, is hateful to myself, Because it is an enemy to thee. Had I it written, I would tear the word. Neither, fair maid, if either thee dislike."
Note: not all elements in x were found in rownames(tvectors)

Note: not all elements in y were found in rownames(tvectors)

Result for Juliet & Romeo: [1] 0.5500876

