In [1]:
import sqlite3
import os
from pathlib import Path as pth
import subprocess as sp
import warnings
from typing import List, Tuple, Any, Dict
from itertools import chain
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
import seaborn  as sns
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re

## Change working directory

In [2]:
os.chdir(pth(pth.home() / 'dev/sms-analysis'))

## Load Data

In [3]:
messages = pd.read_csv('processed-data/X-messages.csv')
messages.head()

Unnamed: 0,date_time,text,from_me,sender
0,2021-03-11 17:25:57,So when is a good time?,1,13608307613
1,2021-03-11 17:26:37,Tuesday’s I’m free after 1130,1,13608307613
2,2021-03-11 23:51:13,this is Christian right?,0,13608307613
3,2021-03-11 23:52:07,im done at 3 on tuesday’s so anytime after tha...,0,13608307613
4,2021-03-12 09:17:13,Sounds like a plan,1,13608307613


In [4]:

# tokenize messages by sender

nlp = spacy.blank("en")

def tokenize(text):
    return [t.text.lower() for t in nlp(text) if t.is_alpha]

messages = (
    messages
        .dropna(subset=["text"])
        .assign(tokens=lambda df: df["text"].apply(tokenize))
)

# # Filter out stop words
# stop_words = set(STOP_WORDS)
# messages["tokens"] = messages["tokens"].apply(
#     lambda toks: [t for t in toks if t not in stop_words]
# )


# filter for non_empty tokens
messages = messages[messages.tokens.apply(len) != 0]


my_tokens = (
    messages["tokens"][messages.from_me==1]
    .explode()
    .rename("token")
    .reset_index(drop=True)
)

their_tokens = (
    messages["tokens"][messages.from_me==0]
    .explode()
    .rename("token")
    .reset_index(drop=True)
)

In [5]:
my_tokens.value_counts().sort_values( ascending = False)
their_tokens.value_counts().sort_values( ascending = False)

token
i           2771
you         2054
to          1405
the          857
and          743
            ... 
timeline       1
solidify       1
fishing        1
tube           1
hitting        1
Name: count, Length: 3593, dtype: int64

In [6]:
# patt = re.compile(r'love')
their_token_total = their_tokens.shape[0]
my_token_total = my_tokens.shape[0]

def print_word_count(pattern: str):
    print(pattern)
    
    their_count = (their_tokens.str.contains(pattern, case=False)).sum()
    my_count = (my_tokens.str.contains(pattern, case=False)).sum()

    my_prop = round(100*my_count/my_token_total, 2)
    their_prop = round(100*their_count/their_token_total, 2)

    print("  Them: ", their_count, f'({their_prop}%)')
    print("  Me: ", my_count, f'({my_prop}%)')

In [7]:
patterns = ['love', 'like', 'happy', 'sad', 'miss', 'wish',  
            'hat', 'dog', 'mad', 'time', 'plan', 'you', 'me', 
            'how|why|where|when']
for patt in patterns:
    print_word_count(patt)

love
  Them:  715 (1.63%)
  Me:  579 (2.05%)
like
  Them:  249 (0.57%)
  Me:  287 (1.02%)
happy
  Them:  31 (0.07%)
  Me:  21 (0.07%)
sad
  Them:  20 (0.05%)
  Me:  12 (0.04%)
miss
  Them:  105 (0.24%)
  Me:  118 (0.42%)
wish
  Them:  16 (0.04%)
  Me:  27 (0.1%)
hat
  Them:  654 (1.49%)
  Me:  472 (1.67%)
dog
  Them:  4 (0.01%)
  Me:  3 (0.01%)
mad
  Them:  39 (0.09%)
  Me:  27 (0.1%)
time
  Them:  171 (0.39%)
  Me:  77 (0.27%)
plan
  Them:  78 (0.18%)
  Me:  40 (0.14%)
you
  Them:  2348 (5.35%)
  Me:  1639 (5.8%)
me
  Them:  1249 (2.84%)
  Me:  832 (2.94%)
how|why|where|when
  Them:  363 (0.83%)
  Me:  331 (1.17%)


## use LWIC dictionary

In [None]:

# Download dictionary
if not pth.exists(pth("raw-data/LIWC2007.dic")):
    sp.run([
        'curl',
        '-L',
        'https://raw.githubusercontent.com/Harsh-Panchal-1403/LIWC_PROJECT/master/LIWC2007_English100131.dic',
        '-o',
        'raw-data/LIWC2007.dic'
    ])



### Read in dictionary 


In [None]:

# def read_dic(path: str) -> Dict[re.Pattern, list[str]]:
#     with open(path, 'r') as f:
#         lines = f.readlines()
#     # find lines with field names
#     key_marker = [s.strip() == '%' for s in lines]
#     key_marker_indx = np.where(key_marker)[0]

#     # save keys as dict of key: refnum
#     full_keys = lines[key_marker_indx[0] + 1 : key_marker_indx[1] ]
#     values = lines[key_marker_indx[1]+1:]

#     ref_num_dict = {}
#     pat = re.compile(r'^(\d+)\t(.+)')
#     for s in full_keys:
#         m = pat.search(s)
#         if m:
#             ref_num_dict[int(m.group(1))] = m.group(2).strip()

#     # save strings as dict string: refnum
#     string_dict = {}
#     # val = "sdlfkj   130294  13294   130459"
#     for val in values:
#         refs = [int(x) for x in re.findall(r'\d+', val)]
#         string = val.split('\t', 1)[0].strip()
#         string = re.sub(pattern=r'\s+',repl= '', string=string)
#         string_dict.setdefault(string, []).extend(refs)

#     # make one unified dict with key: strings
#     full_dict = {}
#     for string, ref_nums in string_dict.items():
#         cats = []
#         for ref_num in ref_nums:
#             if ref_num in ref_num_dict:
#                 cats.append(ref_num_dict[ref_num])
#         string = re.compile("^" + re.escape(string).replace(r"\*", ".*") + "$")
#         full_dict[string] = cats

#     return full_dict, [re.sub(r'\d+\t|\n', '', x) for x in full_keys]


# # Read dic
# dic, categories = read_dic("raw-data/LIWC2007.dic")




### Map Texts to Categories



In [None]:

# # return categories for word
# def get_categories(tokens) :
#     res = []
#     for tok in tokens:
#         res.append([
#             cat
#             for patt, cats in dic.items()
#             if patt.match(tok)
#             for cat in cats
#         ])

#     # category x message series
#     res = pd.Series(res).explode().value_counts()

#     # add missing categories
#     if not res.shape[0] == len(categories) :
#         diff = set(categories).difference(set(res.index))
#         add = pd.Series(0,  index = list(diff))
#         res = pd.concat([res, add], axis = 0)

#     res = res.sort_index()

#     return res

# #%%
# # get_categories( messages.loc[1,'tokens'])


# # Get categories for each message as vector of category counts
# category_counts = pd.DataFrame([get_categories(x) for x in messages.tokens])
# category_counts.shape
# messages.shape

# messages.reset_index(drop = True, inplace = True)
# category_counts.reset_index(drop = True, inplace = True)

# messages.index
# messages.columns
# category_counts.index
# category_counts.columns

# x = pd.concat([messages, category_counts], axis = 1)

# if x.shape[0] == messages.shape[0]:
#     messages = x
# else:
#     warnings.warn(
#         "Row count mismatch: x does not match messages; assignment skipped.",
#         UserWarning
#     )

# #%% [markdown]
# # ## Compare category counts

# #%%
# my_cat_counts = messages.loc[messages['from_me']==1,'achieve':].sum(axis = 0)
# their_cat_counts = messages.loc[messages['from_me']==0,'achieve':].sum(axis = 0)

# # normalize counts
# # total_cat_counts = sum(my_cat_counts, their_cat_counts)
# my_cat_freq = my_cat_counts/my_cat_counts.sum(0)
# their_cat_freq = their_cat_counts/their_cat_counts.sum(0)

# cat_freq_summary = pd.concat([my_cat_freq, their_cat_freq], axis = 1).rename(columns={0: "me", 1: "them"})

# cat_freq_summary['me_over_them'] = cat_freq_summary['me'].div(cat_freq_summary['them'], axis = 0).round(4)
# cat_freq_summary['log_me_over_them'] = np.log2(cat_freq_summary['me_over_them'])

# # add patterns to df
# cat_to_patterns = defaultdict(list)
# for patt, cats in dic.items():
#     for cat in cats:
#         cat_to_patterns[cat].append(patt.pattern)

# cat_freq_summary["patterns"] = (
#     cat_freq_summary.index.map(lambda c: cat_to_patterns.get(c, []))
# )


# # print summary
# (
#     cat_freq_summary
#     .sort_values(by="log_me_over_them",key=lambda s: s.abs(), ascending = False)
#     .to_csv('results/summary.txt', sep='\t')
# )

## Use ConvoKit tool
Jonathan P. Chang, Caleb Chiam, Liye Fu, Andrew Wang, Justine Zhang, Cristian Danescu-Niculescu-Mizil. 2020. "ConvoKit: A Toolkit for the Analysis of Conversations". Proceedings of SIGDIAL.

  1. Download the toolkit: pip3 install convokit
  2. Download Spacy's English model: python3 -m spacy download en
  3. Download NLTK's 'punkt' model: import nltk; nltk.download('punkt') (in Python interpreter)

In [9]:
from convokit import Corpus, Utterance, Speaker, TextParser, Coordination,PolitenessStrategies
import nltk; nltk.download('punkt')
# spacy.load('en_core_web_sm')

  import pkg_resources
[nltk_data] Downloading package punkt to /Users/canderson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Construct corpus

In [10]:

messages.date_time = pd.to_datetime(messages.date_time)
df = messages.copy()

In [11]:


def speaker_id(row):
    return "me" if row["from_me"] == 1 else "them"
# make speakers
speakers = {
    "me": Speaker(id="me"),
    "them": Speaker(id="them")
}
utterances = []
conversation_id = "sms_conversation_1"
prev_utt_id = None

for i, row in df.iterrows():
    utt_id = f"utt_{i}"

    utt = Utterance(
        id=utt_id,
        speaker=speakers[speaker_id(row)],
        text=row["text"],
        reply_to=prev_utt_id,
        conversation_id=conversation_id,
        meta={
            "timestamp": row["date_time"].isoformat(),
            "from_me": row["from_me"],
            "sender": row["sender"]
            # "tokens": row["tokens"]
        }
    )

    utterances.append(utt)
    prev_utt_id = utt_id

corpus = Corpus(
    utterances=utterances
)

In [19]:
[x for x  in dir(corpus) if not bool(re.search(r'^_', x)) ]

['add_meta',
 'add_utterances',
 'append_vector_matrix',
 'backend',
 'backend_mapper',
 'config',
 'conversations',
 'corpus_dirpath',
 'delete_metadata',
 'delete_vector_matrix',
 'directed_pairwise_exchanges',
 'dump',
 'dump_info',
 'dump_vectors',
 'filter_conversations_by',
 'filter_utterances',
 'from_pandas',
 'get_attribute_table',
 'get_conversation',
 'get_conversation_ids',
 'get_conversations_dataframe',
 'get_full_attribute_table',
 'get_meta',
 'get_object',
 'get_object_ids',
 'get_speaker',
 'get_speaker_convo_attribute_table',
 'get_speaker_convo_info',
 'get_speaker_ids',
 'get_speakers_dataframe',
 'get_utterance',
 'get_utterance_ids',
 'get_utterances_dataframe',
 'get_vector_matrix',
 'get_vectors',
 'has_conversation',
 'has_speaker',
 'has_utterance',
 'id',
 'iter_conversations',
 'iter_objs',
 'iter_speakers',
 'iter_utterances',
 'load_info',
 'merge',
 'meta',
 'meta_index',
 'organize_speaker_convo_history',
 'print_summary_stats',
 'random_conversation',


### Tokenize

In [12]:

parser = TextParser('en_core_web_sm')
corpus = parser.transform(corpus)

### Analyze Corpus

In [13]:
corpus.print_summary_stats()

[t["tok"] for sent in corpus.get_utterance("utt_0").meta["en_core_web_sm"] for t in sent["toks"]]

Number of Speakers: 2
Number of Utterances: 9281
Number of Conversations: 1


['So', 'when', 'is', 'a', 'good', 'time', '?']

In [14]:
print("Speakers in corpus:", list(corpus.iter_speakers()))  
print(corpus.speaking_pairs(speaker_ids_only=True)  )

Speakers in corpus: [Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x176d84b90>, 'id': 'me', 'meta': ConvoKitMeta({})}), Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x176d84b90>, 'id': 'them', 'meta': ConvoKitMeta({})})]
{('me', 'them'), ('them', 'them'), ('me', 'me'), ('them', 'me')}


### Speaker Coordination

|Feature|me_to_them|them_to_me|Interpretation|
|---|---|---|---|
|auxverb|0.31|0.05|You strongly accommodate their auxiliary verbs; they barely adapt to yours|
|pronoun|−0.02|0.21|You slightly diverge; they strongly accommodate|
|article|0.00|0.00|No coordination either way|

In [15]:
# speaker coordination
coord = Coordination(target_thresh=3, speaker_thresh=5, utterances_thresh=5)  

coord.fit(corpus)  

coord.transform(corpus)

me_coord_scores = corpus.get_speaker("me").meta["coord"]['them']
them_coord_scores = corpus.get_speaker("them").meta["coord"]['me']
feature_freqs = pd.concat([pd.Series(me_coord_scores).rename("me_to_them"), pd.Series(them_coord_scores).rename("them_to_me")], axis = 1)

In [16]:
feature_freqs['diff'] = feature_freqs['me_to_them']-feature_freqs['them_to_me']

feature_freqs.sort_values('diff', key = lambda x: abs(x), ascending = False)

Unnamed: 0,me_to_them,them_to_me,diff
quant,0.122962,0.153296,-0.030333
adverb,0.066464,0.096798,-0.030333
ipron,0.120974,0.150931,-0.029957
ppron,0.040133,0.069325,-0.029191
auxverb,0.047595,0.069215,-0.02162
preps,0.053648,0.066091,-0.012443
conj,0.096239,0.104406,-0.008167
article,0.078558,0.077746,0.000812


### Politeness

Politeness features

| Feature name              | What it measures | Typical interpretation in discourse analysis |
|---------------------------|------------------|----------------------------------------------|
| Please                    | Presence of the word “please” anywhere in the utterance | Politeness marker; mitigates imposition |
| Please_start              | Utterance begins with “please” | High politeness or deference at turn entry |
| HASHEDGE                  | Any hedge expression (aggregate indicator) | Linguistic uncertainty, softening, or non-commitment |
| Indirect_(btw)            | Indirect discourse marker such as “by the way” | Topic shift or low-imposition insertion |
| Hedges                    | Count or presence of hedging terms (e.g., “maybe”, “kind of”) | Reduced certainty; politeness or epistemic caution |
| Factuality                | Use of factual/assertive language | Speaker presents information as objective or certain |
| Deference                 | Deferential language (e.g., “if you don’t mind”) | Power asymmetry or respect toward interlocutor |
| Gratitude                 | Expressions of thanks | Positive social signaling; rapport maintenance |
| Apologizing               | Apologies or regret expressions | Face-saving, repair, or politeness strategy |
| 1st_person_pl.            | First-person plural pronouns (“we”, “us”) | Inclusivity, shared responsibility, alignment |
| 1st_person                | First-person singular pronouns (“I”, “me”) | Self-focus, agency, or ownership of stance |
| 1st_person_start          | Utterance begins with a first-person pronoun | Self-initiated stance or framing |
| 2nd_person                | Second-person pronouns (“you”) | Addressing, directing, or engaging the interlocutor |
| 2nd_person_start          | Utterance begins with a second-person pronoun | Direct engagement; can signal instruction or confrontation |
| Indirect_(greeting)       | Indirect greeting (e.g., “hey”, “hope you’re well”) | Social lubrication before substantive content |
| Direct_question           | Explicit interrogative form | Information-seeking or directive questioning |
| Direct_start              | Utterance begins with a direct request or statement | Low mitigation; task-oriented or assertive style |
| HASPOSITIVE               | Presence of positive-affect words | Positive sentiment or encouragement |
| HASNEGATIVE               | Presence of negative-affect words | Criticism, frustration, or negative sentiment |
| SUBJUNCTIVE               | Subjunctive or hypothetical constructions (“would”, “could”) | Politeness, mitigation, or counterfactual framing |
| INDICATIVE                | Indicative (statement-of-fact) constructions | Assertion, certainty, or declarative stance |

In [17]:
# Initialize politeness analyzer (requires parsed text)  
ps = PolitenessStrategies(parse_attribute_name="en_core_web_sm")  
corpus= ps.fit_transform(corpus)  

In [18]:
  
# Get politeness scores for each speaker's utterances  
me_utterances = list(corpus.iter_utterances(lambda x: x.speaker.id == "me"))  
them_utterances = list(corpus.iter_utterances(lambda x: x.speaker.id == "them"))  
  
# Calculate average politeness strategies per speaker  
me_strategies = pd.DataFrame([utt.meta["politeness_strategies"] for utt in me_utterances])
them_strategies = pd.DataFrame([utt.meta["politeness_strategies"] for utt in them_utterances])

out = pd.concat([me_strategies.sum(0), them_strategies.sum(0)],axis = 1)
out.columns = ['me', 'them']
out.index = [re.sub(pattern = r'feature_politeness_|==',repl = '', string= x) for x in out.index]

# normalize by total utterances spoken
utt_counts = Counter(
    utt.speaker.id
    for utt in corpus.iter_utterances()
)

out['me_self_normalized'] = out.me/utt_counts['me']
out['them_self_normalized'] = out.them/utt_counts['them']

out['diff'] = out.me_self_normalized-out.them_self_normalized

out.sort_values('diff', key = lambda x: abs(x), ascending = False)

Unnamed: 0,me,them,me_self_normalized,them_self_normalized,diff
1st_person,961,1388,0.221225,0.281142,-0.059918
2nd_person,1218,1651,0.280387,0.334414,-0.054027
HASPOSITIVE,1694,2155,0.389963,0.4365,-0.046537
HASHEDGE,404,682,0.093002,0.138141,-0.045139
1st_person_start,952,1292,0.219153,0.261697,-0.042545
1st_person_pl.,217,407,0.049954,0.082439,-0.032485
Direct_start,112,277,0.025783,0.056107,-0.030324
Indirect_(greeting),8,109,0.001842,0.022078,-0.020237
HASNEGATIVE,489,650,0.112569,0.131659,-0.01909
Gratitude,123,218,0.028315,0.044156,-0.015841
