# Imports

In [14]:
from __future__ import print_function
from __future__ import division

import copy
import json
import re
import string

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn  # To improve the chart styling.
import wordtree

from IPython.display import display
from IPython.display import HTML
from IPython.display import Javascript
from wordcloud import STOPWORDS
import ipywidgets as widgets
from wordcloud import WordCloud

#import iphone_connector

# Load the data from disk and set up the dataframes

In [17]:
%matplotlib inline
matplotlib.style.use('ggplot')
pd.set_option('display.max_colwidth', 1000)
df1 = pd.read_csv("Sedus – Insights Content Template - All Data.csv")
df1['Content']=df1['Content'].fillna("")
#print(df1)
#iphone_connector.initialize()

#fully_merged_messages_df, address_book_df = iphone_connector.get_cleaned_fully_merged_messages()
#full_names = set(address_book_df.full_name)  # Handy set to check for misspellings later on.
#fully_merged_messages_df.full_name.replace('nan nan nan', 'Unknown', inplace=True)

WORDS_PER_PAGE = 450  # Based upon http://wordstopages.com/
print('\nTotal pages if all texts were printed: {0:,d} (Arial size 12, single spaced)\n'.format(
    sum(df1.Content.apply(lambda x: len(x.split())))//WORDS_PER_PAGE))


Total pages if all texts were printed: 24 (Arial size 12, single spaced)



# Diving deeper into the actual text

### Visualize a word tree of texts exchanged with a specific contact

In [None]:
# Note this requires an internet connection to load Google's JS library.
def get_json_for_word_tree(contact, messages_df=None):
    """Build the JSON payload for a word tree of the texts exchanged with one contact.

    Args:
        contact: Value matched against the `full_name` column.
        messages_df: Optional DataFrame with `full_name` and `text` columns.
            When omitted, the notebook-level `fully_merged_messages_df` is used.

    Returns:
        A JSON string holding a header entry followed by one single-element
        list per matching text.
    """
    if messages_df is None:
        messages_df = fully_merged_messages_df
    df = messages_df[messages_df.full_name == contact]
    print('Exchanged {0:,} texts with {1}'.format(df.shape[0], contact))

    # Iterate the column values directly: Series.iteritems() was removed in pandas 2.0.
    array_for_json = [[text] for text in df.text]
    # NOTE(review): the header is a nested list ([['Phrases']]) while data rows
    # are flat ([text]); kept as-is -- confirm this is what the word tree expects.
    array_for_json.insert(0, [['Phrases']])
    return json.dumps(array_for_json)
    
# Contact whose texts feed the tree, and the word the tree is rooted at.
CONTACT_NAME = 'Mom'
ROOT_WORD = 'feel'
# Pass CONTACT_NAME instead of repeating the literal, so editing the constant
# actually changes which contact is visualized.
HTML(wordtree.get_word_tree_html(get_json_for_word_tree(CONTACT_NAME),
                                 ROOT_WORD.lower(),
                                 lowercase=True,
                                 tree_type='double'))

### Preprocessing and data munging for TFIDF

In [18]:
# Characters treated as punctuation: ASCII punctuation plus some non-ASCII
# characters that occurred in the data (curly quotes, U+FFFC, fullwidth comma).
punctuation = string.punctuation + u'“”‘’\ufffc\uff0c'
# Escape before embedding in a character class. Unescaped, the backslash in
# string.punctuation is consumed as an escape for the following ']', so
# backslashes themselves were never matched.
punct_regex = re.compile(u'[{0}]'.format(re.escape(punctuation)))
spaces_regex = re.compile(r'\s{2,}')  # Runs of two or more whitespace characters.
numbers_regex = re.compile(r'\d+')  # Runs of digits.

def clean_text(input_str):
    """Normalize text for TFIDF: lowercase, then drop punctuation and digits.

    Punctuation and digit runs are deleted (not replaced by a space), after
    which any run of two or more whitespace characters becomes a single space.
    """
    without_punct = punct_regex.sub('', input_str.lower())
    # Also try: numbers_regex.sub('_NUMBER_', without_punct)
    without_digits = numbers_regex.sub('', without_punct)
    return spaces_regex.sub(' ', without_digits)

# Run the stock stopwords through the same cleaning as the documents, since
# unprocessed entries such as "i'll" would not match the cleaned text.
processed_stopwords = list(map(clean_text, STOPWORDS))

### Create TFIDF matrix for all content pieces

Note: each document in the matrix is the text of all `Content` rows that share one `Content_Num`, joined into a single string.

In [86]:
# One document per Content_Num: all of its Content rows joined by single spaces.
content_by_num = df1.groupby('Content_Num')['Content']
grouped_by_name = content_by_num.apply(' '.join).to_frame()

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
import numpy as np

# Unigram + bigram TFIDF over the per-Content_Num documents, using the same
# cleaning function for the documents as was applied to the stopwords.
vectorizer = TfidfVectorizer(preprocessor=clean_text,
                             tokenizer=tokenize.WordPunctTokenizer().tokenize,
                             stop_words=processed_stopwords,
                             ngram_range=(1, 2), max_df=.9, max_features=50000)
tfidf_transformed_dataset = vectorizer.fit_transform(grouped_by_name.Content)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Position i in word_list is the term for column i of the TFIDF matrix.
word_list = pd.Series(feature_names)

print('TFIDF sparse matrix is {0}MB'.format(tfidf_transformed_dataset.data.nbytes / 1024 / 1024))
print('TFIDF matrix has shape: {0}'.format(tfidf_transformed_dataset.shape))


TFIDF sparse matrix is 0.08097076416015625MB
TFIDF matrix has shape: (142, 7980)


### Helper methods to leverage the TFIDF matrix

In [88]:
def get_word_summary_for_content(content_num, top_n=25):
    """Return the highest-TFIDF words for one content piece.

    Args:
        content_num: Content_Num of the piece, as an int or a numeric string
            (the text widget passes a string).
        top_n: Maximum number of words to return.

    Returns:
        A DataFrame with a single 'Word' column ordered from highest to lowest
        TFIDF weight, or None (after printing a message) when `content_num`
        is not a whole number or is not present in the data.
    """
    try:
        content_num = int(content_num)
    except (TypeError, ValueError):
        # The text box can hold empty or non-numeric input while the user is
        # typing; report it instead of raising a traceback.
        print('"{0}" is not a valid content number.'.format(content_num))
        return
    tfidf_record = _get_tfidf_record_for_content(content_num)
    if tfidf_record is None:
        print('"{0}" was not found.'.format(content_num))
        return
    # argsort is ascending, so reverse it to rank the heaviest terms first.
    sorted_indices = tfidf_record.argsort()[::-1]
    return pd.DataFrame({'Word': word_list.iloc[sorted_indices[:top_n]]}).reset_index(drop=True)

def get_word_summary_for_diffs(content_num, other_content_num, top_n=25):
    """Return the words weighted most heavily in one content piece relative to another.

    Args:
        content_num: Content_Num of the first piece (int or numeric string).
        other_content_num: Content_Num of the piece to compare against
            (int or numeric string).
        top_n: Maximum number of words to return.

    Returns:
        A DataFrame with a single 'Word' column ordered by descending
        (first TFIDF - second TFIDF), or None (after printing a message) when
        either number is not a whole number or is not present in the data.
    """
    try:
        content_num = int(content_num)
        other_content_num = int(other_content_num)
    except (TypeError, ValueError):
        # The text boxes can hold empty or non-numeric input while the user is
        # typing; report it instead of raising a traceback.
        print('Content numbers must be whole numbers; got "{0}" and "{1}".'.format(
            content_num, other_content_num))
        return

    tfidf_record_content = _get_tfidf_record_for_content(content_num)
    tfidf_record_other_content = _get_tfidf_record_for_content(other_content_num)

    if tfidf_record_content is None or tfidf_record_other_content is None:
        # Report the first content number that was not found. (The original
        # referenced undefined names here and raised NameError.)
        content_not_found = content_num if tfidf_record_content is None else other_content_num
        print('"{0}" was not found.'.format(content_not_found))
        return
    # argsort is ascending, so reverse it to put the largest positive differences first.
    sorted_indices = (tfidf_record_content - tfidf_record_other_content).argsort()[::-1]
    return pd.DataFrame({'Word': word_list.iloc[sorted_indices[:top_n]]}).reset_index(drop=True)

# Returns the row in the TFIDF matrix for a given contact by name.
def _get_tfidf_record_for_content(content_num):
    if content_num not in grouped_by_name.index:
        return None
    row = np.argmax(grouped_by_name.index == content_num)
    return tfidf_transformed_dataset.getrow(row).toarray().squeeze()

### Words that identify a specific content piece

In [89]:
# Text box for the Content_Num plus a slider capping how many words are listed.
content_num_box = widgets.Text(value="1", description='Content_Num:', placeholder='Enter number')
max_words_slider = widgets.IntSlider(min=5, max=100, step=1, value=5, description='Max words to show:')
widgets.interact(get_word_summary_for_content, content_num=content_num_box, top_n=max_words_slider)

interactive(children=(Text(value='1', description='Content_Num:', placeholder='Enter number'), IntSlider(value…

<function __main__.get_word_summary_for_content(content_num, top_n=25)>

### Words that identify the difference between two content pieces

In [91]:
# Two text boxes pick the content pieces to compare; the slider caps the word count.
first_content_box = widgets.Text(value="1", description='Content1:', placeholder='Enter number')
second_content_box = widgets.Text(value="2", description='Content2:', placeholder='Enter 2nd number')
diff_words_slider = widgets.IntSlider(description='Max words to show:', min=5, max=20, step=1, value=5)
widgets.interact(get_word_summary_for_diffs,
                 content_num=first_content_box,
                 other_content_num=second_content_box,
                 top_n=diff_words_slider)

interactive(children=(Text(value='1', description='Content1:', placeholder='Enter number'), Text(value='2', de…

<function __main__.get_word_summary_for_diffs(content_num, other_content_num, top_n=25)>

To do:
- Take top 10 words from each Content Piece and save in array in dataframe
- Visualize connections between these words -- using categories of magazine, and section names