Import packages for entire notebook

In [None]:
import sqlite3, json, re, datetime, math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.patches as mpatches
import matplotlib.dates as mdates
from wordcloud import WordCloud
from collections import Counter
%matplotlib inline

Import stopwords

In [None]:
# Load the stopword list used by the word-frequency cells. The context manager
# closes the file as soon as the JSON has been parsed instead of leaving the
# handle open for the lifetime of the kernel.
with open('stopwords.json', 'r') as stopwords_file:
    stopwords = json.load(stopwords_file)

Connect to Messages database using Sqlite3

In [None]:
# Open the Messages database file and create the cursor (`c`) that the query
# cells further down run their SQL through.
messages_db_path = '/Users/Brienna/Library/Messages/chat.db'
conn = sqlite3.connect(messages_db_path)
c = conn.cursor()

**Let user identify which handle_id to analyze**

View headings in the Messages data

In [None]:
# Run a SELECT over the message table purely so the cursor exposes the table's
# column metadata; `description` lists one entry per column, name first.
message_cursor = c.execute('select * from message')
message_cursor.description

Extract the rowid, text, is_from_me, and datetime columns, placing into dataframe

In [None]:
# Which conversation to analyze: the value matched against message.handle_id.
# Change this constant to look at a different conversation.
HANDLE_ID = 47

# `date` is shifted by the Unix timestamp of 2001-01-01 so that SQLite can
# render it with 'unixepoch'.
# NOTE(review): this assumes `date` is stored in seconds since 2001-01-01 —
# confirm for the database version in use.
# The handle id is bound through a `?` placeholder rather than pasted into the
# SQL text.
cmd1 = """
    SELECT ROWID, text, is_from_me,
           datetime(date + strftime('%s', '2001-01-01'), 'unixepoch') AS date_utc
    FROM message
    WHERE handle_id = ?
"""
c.execute(cmd1, (HANDLE_ID,))
df_msg = pd.DataFrame(c.fetchall(), columns=['id', 'text', 'is_from_me', 'time'])

# Preview only the first rows instead of rendering the whole conversation.
df_msg.head()

Convert the datetime strings into usable date, time, and hour columns

In [None]:
# Offset added to the UTC timestamps from the database to obtain local time.
# NOTE(review): a fixed -4 h offset ignores daylight-saving changes — confirm
# this is acceptable for the date range being analyzed.
LOCAL_UTC_OFFSET = pd.Timedelta(hours=-4)

# Keep the untouched UTC timestamps in their own column so that re-running this
# cell recomputes `time` from the same source instead of shifting it by another
# four hours each time.
if 'time_utc' not in df_msg.columns:
    df_msg['time_utc'] = pd.to_datetime(df_msg['time'], format='%Y-%m-%d %H:%M:%S')

# Derive the local timestamp and its date / time-of-day / hour parts with
# vectorized datetime accessors.
df_msg['time'] = df_msg['time_utc'] + LOCAL_UTC_OFFSET
df_msg['new_date'] = df_msg['time'].dt.date
df_msg['new_time'] = df_msg['time'].dt.time
df_msg['new_hours'] = df_msg['time'].dt.hour
df_msg.head()

How long the conversation has been going

In [None]:
# Use the earliest and latest timestamps rather than the first and last rows:
# the query has no ORDER BY, so row order is not guaranteed to be chronological.
start = df_msg['time'].min().date()
end = df_msg['time'].max().date()
print('from ' + str(start) + ' until ' + str(end))

Total messages sent

In [None]:
# Tally the conversation: every row counts toward the total, rows flagged
# is_from_me == 1 are mine, and the remainder came from the other person.
sent_by_me = df_msg['is_from_me'] == 1
total = df_msg.shape[0]
by_me = int(sent_by_me.sum())
by_himher = total - by_me

print('Total: {}'.format(total))
print('From me: {}'.format(by_me))
print('From him/her: {}'.format(by_himher))

Word frequencies

In [None]:
# Count how often each non-stopword appears across all messages.
stopword_set = set(stopwords)  # hashed lookups instead of scanning a list per word
frequencies = Counter()
for message in df_msg['text']:
    if message is None:
        continue  # row without a text body
    # split() with no argument breaks on any run of whitespace, so words
    # separated by newlines or tabs are no longer glued into a single token.
    for word in message.split():
        # Drop punctuation, then normalize case.
        word = re.sub(r'[^\w\s]', '', word).lower().strip()
        if word and word not in stopword_set:
            frequencies[word] += 1

# Ascending by count, so the most frequent words come last.
frequencies_sorted = sorted(frequencies.items(), key=lambda kv: kv[1])
# Show only the 50 most frequent words rather than dumping the full vocabulary.
print(frequencies_sorted[-50:])

Most active day:

In [None]:
def mostCommon(lst):
    """Return the element that occurs most often in ``lst``.

    When several elements share the highest count, the one that appears
    first in ``lst`` wins. An empty ``lst`` raises ``ValueError``.
    """
    occurrences = Counter(lst)
    # Rank every element by how many times it was seen; max() keeps the first
    # element that reaches the top count.
    return max(lst, key=lambda element: occurrences[element])

# Find the calendar date that appears on the most message rows.
message_dates = df_msg['new_date'].tolist()
most_common_day = mostCommon(message_dates)
print(most_common_day)

Number of texts on that day:

In [None]:
# Restrict the frame to the busiest day, then split that day's messages by sender.
df_on_the_most_active_day = df_msg[df_msg['new_date'] == most_common_day]
num_of_texts = len(df_on_the_most_active_day)
# Build the sender mask from the filtered frame itself. A mask taken from the
# full df_msg has a different index, which makes pandas warn about reindexing
# the boolean key.
num_of_texts_from_me = int((df_on_the_most_active_day['is_from_me'] == 1).sum())
num_of_texts_from_himher = num_of_texts - num_of_texts_from_me

print('Total texts sent on ' + str(most_common_day) + ' was ' + str(num_of_texts))
print('From me: ' + str(num_of_texts_from_me))
print('From him/her: ' + str(num_of_texts_from_himher))

Average messages per day that we texted

In [None]:
# Only rows that actually carry text count as messages.
messages_with_text = df_msg[df_msg['text'].notna()]
messages_total = len(messages_with_text)

# nunique() counts every calendar day exactly once regardless of row order.
# Comparing each row with the previous one over-counts days whenever the rows
# are not strictly chronological.
distinct_days = messages_with_text['new_date'].nunique()

# Guard against an empty conversation, which would otherwise divide by zero.
if distinct_days:
    print('Sent ' + str(messages_total // distinct_days) + ' messages on average each day.')
else:
    print('No messages with text were found, so there is no daily average.')

Create radar/spider plot showing average daily activity

In [None]:
# Format data frames

# The radar plot has one axis per hour of the day.
hours = list(range(24))

df_24hrs_me = df_msg[df_msg['is_from_me'] == 1]['new_hours']
df_24hrs_himher = df_msg[df_msg['is_from_me'] == 0]['new_hours']

# Count messages per hour. reindex() puts the counts in hour order and inserts
# a zero for every hour without messages, so both lists always hold one value
# per axis. Without it, a silent hour makes the list shorter and shifts every
# later value onto the wrong axis.
values_me = df_24hrs_me.value_counts().reindex(hours, fill_value=0).tolist()
values_himher = df_24hrs_himher.value_counts().reindex(hours, fill_value=0).tolist()

# We need to repeat the first value to close the circular graph:
values_me += values_me[:1]
values_himher += values_himher[:1]

# Axis labels and number of variables: an ordered list of hours rather than a
# set, so labels line up with the values above.
categories = hours
N = len(categories)

In [None]:
# Set angle of each axis in the plot (again repeating first value to close the circular graph)
angles = [n / float(N) * 2 * math.pi for n in range(N)]
angles += angles[:1]

# Initialize spider plot
ax = plt.subplot(111, polar=True)

# Put the first axis at the top and run clockwise
ax.set_theta_offset(math.pi / 2)
ax.set_theta_direction(-1)

# Draw one axis per variable and label it. sorted() gives the labels a defined
# order (a set has none) that matches the hour-sorted values.
plt.xticks(angles[:-1], sorted(categories), color='grey', size=8)

# Draw ylabels
ax.set_rlabel_position(0)
plt.yticks([1000, 2000, 3000, 4000, 5000, 6000], ["1k", "2k", "3k", "4k", "5k", "6k"], color='grey', size=8)
# Scale the radial axis to the larger of the two series so neither gets clipped.
plt.ylim(0, max(max(values_me), max(values_himher)))

## ----------- Plot Individual 1 :: me
# Line colour is set explicitly so it matches the fill and the legend patch.
ax.plot(angles, values_me, color='b', linewidth=1, linestyle='solid')
ax.fill(angles, values_me, 'b', alpha=0.1)

## ----------- Plot Individual 2 :: himher
ax.plot(angles, values_himher, color='r', linewidth=1, linestyle='solid')
ax.fill(angles, values_himher, 'r', alpha=0.1)

red_patch = mpatches.Patch(color='r', label='Him', alpha=0.1)
blue_patch = mpatches.Patch(color='b', label='Me', alpha=0.1)
plt.legend(handles=[red_patch, blue_patch], loc='upper right', bbox_to_anchor=(0.1, 0.1));

First occurrence of "I love you"

In [None]:
# Case-insensitive search for the phrase; na=False treats rows without text as
# non-matches. Sorting by time puts the earliest occurrences first.
mentions_phrase = df_msg['text'].str.contains('i love you', case=False, na=False)
df_msg[mentions_phrase].sort_values(by='time').head(10)


# Alternative: idxmax() on a boolean condition gives the index of the first
# match, which only requires that the index is unique.

Alternative query to avoid cases like "I love your wordplay"

In [None]:
# Word boundaries on both sides keep the phrase from matching inside longer
# words, e.g. "I love your wordplay" or "Hi love you".
ILY_PATTERN = r'\bi love you\b'

# Sanity-check the pattern on known examples before applying it to the data.
# This is an assertion because a bare expression in the middle of a cell is
# neither displayed nor checked.
love_tests = pd.Series(['I love you', 'I love your wordplay', 'Hi love you'])
assert love_tests.str.contains(ILY_PATTERN, case=False).tolist() == [True, False, False]

# na=False treats rows without text as non-matches.
ilys = df_msg[df_msg['text'].str.contains(ILY_PATTERN, case=False, na=False)].sort_values(by='time')
print('Said "I love you" ' + str(len(ilys)) + ' times')
ilys.head(10)

In [None]:
# The actual texts: print the message bodies stored at two fixed row positions.
# NOTE(review): these positions are hard-coded — confirm they still point at the
# intended messages whenever the query is re-run.
for row_position in (54171, 54172):
    print(df_msg.iloc[row_position].text)

In [None]:
# Count rows per weekly bin (weeks anchored on Mondays) and plot how many of
# them carry text.
messages_week = df_msg.groupby(pd.Grouper(key='time', freq='W-MON')).count()

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(messages_week['text'], label='messages', color='pink')

fig.suptitle('Weekly message overview', fontsize=20)
ax.set_xlabel('Weeks', fontsize=18)
ax.set_ylabel('Messages', fontsize=18)

# Each annotation points its arrow at (date on the x axis, value on the y axis);
# xytext moves the label away from that point by an offset in points.
ireland_target = (mdates.date2num(datetime.datetime(2018, 3, 15)), 200)
ireland_arrow = dict(arrowstyle="->", connectionstyle='arc3, rad=-0.2', lw=2)
ax.annotate('Ireland', ireland_target, xytext=(-100, 0),
            textcoords='offset points', size=20,
            va='center', ha='center',
            arrowprops=ireland_arrow)

dc_target = (mdates.date2num(datetime.datetime(2018, 8, 28)), 205)
dc_arrow = dict(arrowstyle="->", lw=2)
ax.annotate('Bri in DC', dc_target, xytext=(50, -50),
            textcoords='offset points', size=20,
            va='center', ha='center',
            arrowprops=dc_arrow)

**NATURAL LANGUAGE PROCESSING**

https://github.com/guiem/my_notebooks/blob/master/anniversary/anniversary.ipynb

Generate word cloud (needs stopwords and fixing).

In [None]:
# Join every message body into one lower-cased string. The space separator keeps
# the last word of one message from fusing with the first word of the next, and
# join() builds the result in a single pass instead of growing a string
# message by message.
fullTexts = ' '.join(
    message.lower() for message in df_msg['text'] if message is not None
)

def generate_wordcloud(text, font_path='/Library/Fonts/Verdana.ttf', stopwords=None):
    """Render a word cloud of ``text`` with matplotlib.

    Parameters
    ----------
    text : str
        The text whose words are drawn.
    font_path : str, optional
        Font file handed to ``WordCloud``. Defaults to the Verdana path this
        notebook has always used; pass another path on machines where that
        file does not exist.
    stopwords : iterable of str, optional
        Words to leave out of the cloud. ``None`` (the default) lets
        ``WordCloud`` apply its own built-in stopword list.
    """
    # WordCloud expects a set (or None) for its stopwords argument.
    excluded_words = set(stopwords) if stopwords is not None else None
    wordcloud = WordCloud(font_path=font_path,
                          stopwords=excluded_words,
                          relative_scaling=1.0).generate(text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

generate_wordcloud(fullTexts)

# Second cloud, built from explicit word counts.
# Use a `text_normalized` column when one exists; otherwise derive the
# normalized text here (lower-cased, punctuation removed) from `text`.
if 'text_normalized' in df_msg.columns:
    df_with_text = df_msg['text_normalized'].dropna()
else:
    df_with_text = (
        df_msg['text']
        .dropna()
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .dropna()
    )

# Count every word, drop the notebook's stopwords, and keep the 1000 most common.
word_counts = pd.Series(' '.join(df_with_text).split()).value_counts()
top_1000 = word_counts[~word_counts.index.isin(list(stopwords))][:1000]

wc = WordCloud(background_color='white')
# generate_from_frequencies expects a word -> count mapping.
wc.generate_from_frequencies(top_1000.to_dict())
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()