In [None]:
import pandas as pd
import numpy
import json
import os
import glob
import re
import plotly.express as px
from datetime import datetime, timedelta

In [None]:
def parse_single_json(path, name):
    """Load one exported chat JSON file and keep only messages sent by `name`.

    Parameters
    ----------
    path : str
        Path of the JSON file to read.
    name : str
        Value of the ``sender_name`` field whose messages are kept.

    Returns
    -------
    pandas.DataFrame or None
        The rows of the file's ``messages`` list whose ``sender_name`` equals
        `name`. ``None`` is returned (after printing "Invalid JSON") when the
        file cannot be read, is not valid JSON, or has no usable
        ``messages`` / ``sender_name`` data, e.g. a settings.json file.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(path, encoding="utf-8") as json_data:
            data = json.load(json_data)
        # Fails here if the JSON has no 'messages' object, which filters out
        # any other JSON files such as settings.json.
        df = pd.DataFrame.from_dict(data['messages'])
        return df[df['sender_name'] == name]
    except (OSError, ValueError, KeyError, TypeError):
        # Narrower than a bare `except:` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed while scanning many files.
        print("Invalid JSON")
        return None

In [None]:
# Collects the path of every file ending in ".json" under the current working directory
# and maps each path to a label derived from its parent folder's name
def get_paths(root=None):
    """Map every ``.json`` file under `root` to a label taken from its folder.

    Parameters
    ----------
    root : str, optional
        Directory to walk recursively. Defaults to the current working
        directory, which is what the notebook uses.

    Returns
    -------
    dict
        ``{full_path: label}`` where ``label`` is the name of the folder that
        directly contains the file, with its last 11 characters removed.
    """
    if root is None:
        # os.getcwd() returns a proper str; slicing the repr of os.getcwdb()
        # corrupts paths containing non-ASCII characters or backslashes.
        root = os.getcwd()

    path_obj = {}
    for folder, _dirs, files in os.walk(root):
        # Use os.path instead of splitting on '\\' so this also works on
        # POSIX systems, where the path contains no backslashes at all.
        parent_name = os.path.basename(os.path.normpath(folder))
        for file in files:
            if file.endswith(".json"):
                # NOTE(review): the 11 trimmed characters are presumably the
                # "_<10-char id>" suffix of exported thread folders — confirm.
                path_obj[os.path.join(folder, file)] = parent_name[0:-11]
    return path_obj


In [None]:
# Exact `sender_name` value whose messages are kept; combine_dfs() reads this
# module-level variable and passes it to parse_single_json().
name = "Ali Adnan"

In [None]:
# Create mass dataframe of all messages jsons found using get_paths() 
def combine_dfs():
    """Concatenate the messages of every JSON file found by get_paths().

    Each file is parsed with parse_single_json() using the module-level
    `name`, and the label returned by get_paths() for that file is stored in
    a ``sender`` column. Files that cannot be parsed (parse_single_json()
    returns ``None``) are skipped.

    Returns
    -------
    pandas.DataFrame
        All parsed frames stacked together, with columns sorted
        (``sort=True``); the original per-file row index is kept.

    Raises
    ------
    ValueError
        If no file could be parsed, so there is nothing to concatenate.
    """
    dfs = []
    for path, sender in get_paths().items():
        data = parse_single_json(path, name)
        if data is None:
            # Unparseable or unrelated JSON file: skip it explicitly instead
            # of relying on a swallowed TypeError from `None["sender"] = ...`.
            continue
        # assign() returns a new frame, so the filtered frame coming out of
        # parse_single_json() is not written to.
        dfs.append(data.assign(sender=sender))

    if not dfs:
        raise ValueError("No message JSON files could be parsed; nothing to combine")
    return pd.concat(dfs, sort=True)

# Build the combined frame once; the cells below read and rebind `df_combined`.
df_combined = combine_dfs()

In [None]:
# Add more calculated columns
def add_calcfields(df_combined, utc_offset_hours=8):
    """Add timestamp, local date, character-count and word-count columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df_combined : pandas.DataFrame
        Needs a ``timestamp_ms`` column (epoch milliseconds) and a
        ``content`` column.
    utc_offset_hours : int or float, optional
        Hours added to the timestamp before the calendar date is taken.
        Defaults to 8, the offset the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``timestamp_ms`` converted to datetimes and the
        new ``date``, ``character_count`` and ``word_count`` columns.
    """
    def _count_words(text):
        # Missing / non-text content has no words (previously str(nan) was
        # counted as the single word "nan"); split() without arguments also
        # stops repeated spaces from being counted as extra words.
        return len(text.split()) if isinstance(text, str) else 0

    df_combined['timestamp_ms'] = pd.to_datetime(df_combined['timestamp_ms'], unit='ms')  # set timestamp datatype
    shifted = df_combined['timestamp_ms'] + pd.Timedelta(hours=utc_offset_hours)
    df_combined['date'] = shifted.dt.date  # calendar date after applying the offset
    df_combined['character_count'] = df_combined['content'].str.len()  # NaN where content is missing
    df_combined['word_count'] = df_combined['content'].apply(_count_words)
    return df_combined

# add_calcfields() writes its new columns into the frame it is given and returns it.
df_combined = add_calcfields(df_combined)


In [None]:
# Add day of week categories
def add_dayofweek(df_combined):
    """Attach an ordered ``day_of_week`` categorical column (Monday first).

    The weekday name is derived from the ``timestamp_ms`` datetime column.
    The frame is modified in place and also returned.
    """
    weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']
    weekday_dtype = pd.CategoricalDtype(categories=weekday_order, ordered=True)
    # Name of the weekday for each timestamp, stored as an ordered category
    # so that sorting and grouping follow the calendar week.
    weekday_names = df_combined['timestamp_ms'].dt.day_name()
    df_combined['day_of_week'] = weekday_names.astype(weekday_dtype)
    return df_combined

# Needs the datetime `timestamp_ms` column produced by add_calcfields() above.
df_combined = add_dayofweek(df_combined)

In [None]:

def remove_punc(df_combined):
    """Strip punctuation from the ``content`` column.

    Every run of characters that is neither a word character nor whitespace
    is removed from each text value. Non-text values (such as missing
    content) are left untouched. The frame is modified in place and also
    returned.
    """
    def _strip_punctuation(value):
        if not isinstance(value, str):
            # Keep missing values missing instead of turning them into the
            # literal text "nan".
            return value
        # re.sub is required here: Python's str.replace() treats its first
        # argument literally, so the pattern was never applied before.
        return re.sub(r'[^\w\s]+', '', value)

    df_combined['content'] = df_combined['content'].apply(_strip_punctuation)
    return df_combined

# df_combined = remove_punc(df_combined)



In [None]:
def get_media_count(df_combined,media_type):
    """Count the attachments of one media type per message.

    Parameters
    ----------
    df_combined : pandas.DataFrame
        Message frame.
    media_type : str
        Name of the column holding the attachments (e.g. 'photos').
        Each non-null cell must support ``len()``.

    Returns
    -------
    pandas.Series
        ``len()`` of every non-null cell of the column, indexed like the
        rows it came from. An empty integer Series is returned when the
        column does not exist, so ``.sum()`` gives 0 instead of a KeyError.
    """
    if media_type not in df_combined.columns:
        # No message in the data carries this media type at all.
        return pd.Series(dtype='int64')
    return df_combined[media_type].dropna().apply(len)

# Must run before drop_unneeded_cols(), which removes the 'photos' and 'videos' columns.
df_photos = get_media_count(df_combined,'photos')
df_videos = get_media_count(df_combined,'videos')


In [None]:
# Drop all unneeded columns & reset index
def drop_unneeded_cols(df_combined):
    """Return a copy with a fresh 0..n-1 index and the unused columns removed.

    Columns from the list that are not present in the frame are ignored, so
    data that lacks some of them (for example no calls or no stickers) no
    longer raises a KeyError. The input frame is not modified.
    """
    unneeded = ['audio_files', 'call_duration', 'files', 'gifs', 'missed',
                'reactions', 'share', 'sticker', 'users', 'videos', 'photos', 'type']
    # drop=True discards the old index directly, instead of materialising it
    # as an 'index' column and deleting that column afterwards.
    return df_combined.reset_index(drop=True).drop(columns=unneeded, errors='ignore')

# After this rebind the media columns ('photos', 'videos', ...) are no longer available on df_combined.
df_combined = drop_unneeded_cols(df_combined)


In [None]:
# Drop all content null rows
def drop_null(df_combined):
    """Return only the rows whose ``content`` value is present (not null)."""
    return df_combined.dropna(subset=["content"])

# The string joins in generate_word_valuecounts() and generate_markov_corpus() need non-null content.
df_combined = drop_null(df_combined)


In [None]:
# Text dump of the frame after column trimming and null-content filtering.
print(df_combined)

In [None]:
def generate_word_valuecounts(df_combined):
    """Count how often each word occurs across all message contents.

    All ``content`` strings are joined, split on whitespace, stripped of
    punctuation and lower-cased before counting.

    Parameters
    ----------
    df_combined : pandas.DataFrame
        Frame with a ``content`` column that contains only strings.

    Returns
    -------
    pandas.Series
        Occurrence count per word, most frequent first.
    """
    all_text = " ".join(df_combined['content'].tolist())
    # dtype=object keeps the .str accessor usable when there are no tokens.
    words = pd.Series(all_text.split(), dtype=object)
    # regex=True must be explicit: without it recent pandas versions treat
    # the pattern as literal text and no punctuation is removed.
    words = words.str.replace(r'[^\w\s]+', '', regex=True).str.lower()
    # Tokens made only of punctuation (e.g. ":)") are empty now and must not
    # be counted as a word.
    words = words[words != '']
    return words.value_counts()
# df_wordcount_list = ''
# Word-frequency table over all remaining messages; get_top20_words() slices it below.
df_wordcount_series = generate_word_valuecounts(df_combined)

In [None]:
# Text dump of the per-word counts returned by generate_word_valuecounts().
print(df_wordcount_series)

In [None]:
def get_top20_words(df_wordcount_series):
    """Return the first 20 entries of the given Series (fewer if it is shorter)."""
    return df_wordcount_series.head(20)
    
# First 20 rows of the word counts; plotted in the "Most Commonly Used Words" cell.
top20words =  get_top20_words(df_wordcount_series)

In [None]:


def add_local_time(df_combined, locale, timedelta):
    """Add a ``timestamp_<locale>`` column shifted from ``timestamp_ms``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df_combined : pandas.DataFrame
        Frame with a datetime ``timestamp_ms`` column.
    locale : str
        Suffix of the new column, e.g. 'hkt' creates ``timestamp_hkt``.
    timedelta : int, float or timedelta-like
        Offset to add. A plain number is interpreted as hours; anything else
        is handed to ``pandas.Timedelta`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new column added.
    """
    import numbers

    # The argument used to be shadowed by a local `from datetime import
    # timedelta` and ignored, so the shift was always 8 hours.
    if isinstance(timedelta, numbers.Real):
        offset = pd.Timedelta(hours=timedelta)
    else:
        offset = pd.Timedelta(timedelta)
    df_combined[f'timestamp_{locale}'] = df_combined['timestamp_ms'] + offset
    return df_combined

# Creates the 'timestamp_hkt' column that the hour-of-day and month_year cells read.
df_combined = add_local_time(df_combined,'hkt',(+8))
print(df_combined)

### Make Graphs!

In [None]:
import cufflinks as cf
import plotly.offline
import plotly.graph_objects as go
import psutil

# Switch cufflinks to offline mode before any `.iplot(...)` call below.
# NOTE(review): the next call asks for offline=False and world_readable=True, which looks at
# odds with go_offline() and with plotting private messages — confirm this is intended.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [None]:
# This is the first cell that writes into images/, and the folder is only
# created further down the notebook; create it here so a fresh run works.
os.makedirs("images", exist_ok=True)

# Zero-padded hour ('00'-'23') taken from the 'timestamp_hkt' column.
df_combined['hour_of_day'] = df_combined['timestamp_hkt'].dt.strftime('%H')
df_hour_fig = df_combined.groupby(['hour_of_day'])['content'].count().iplot(dimensions=(900,600),colors=["DarkOrchid",],kind='bar',title="Texts on Hour",yTitle="Frequency",xTitle="Hour",asFigure=True)
df_hour_fig.write_image("images/hour_msgs.svg")
df_hour_fig.show()

In [None]:
# Bar chart of the number of messages per calendar date.
df_date_fig = (
    df_combined
    .groupby(['date'])['content']
    .count()
    .iplot(
        kind='bar',
        dimensions=(900,600),
        colors=["MediumTurquoise",],
        title="Texts on Day",
        xTitle="Day",
        yTitle="Frequency",
        asFigure=True,
    )
)
df_date_fig.write_image("images/date_msgs.svg")
df_date_fig.show()

In [None]:
# Running total of messages: per-date counts accumulated in date order.
df_date_rollingsum = (
    df_combined
    .groupby(['date'])['content']
    .count()
    .cumsum()
)
df_date_rollingsum_fig = df_date_rollingsum.iplot(
    kind='area',
    fill=True,
    dimensions=(900,600),
    colors=["MediumSpringGreen",],
    title="Cumulative Messages Over Time ",
    xTitle="Day",
    yTitle="Frequency",
    asFigure=True,
)
df_date_rollingsum_fig.write_image("images/cumu_msgs.svg")
df_date_rollingsum_fig.show()

In [None]:
# 'day_of_week' is an ordered categorical. observed=False is passed explicitly
# so that every weekday keeps its bar (even with zero messages) regardless of
# the pandas version's default for grouping on categoricals.
messages_on_day = df_combined.groupby(['day_of_week'], observed=False)['content'].count().iplot(dimensions=(900,600),colors=["Aquamarine",],kind='bar',title="Messages on Day",yTitle="Frequency",xTitle="Day",asFigure=True)
messages_on_day.write_image("images/messages_on_day.svg")
messages_on_day.show()

In [None]:
# Bar chart of the most frequent words.
# NOTE(review): the [0:-1] slice leaves out the LAST entry of top20words, so
# one bar fewer than the table is drawn — confirm this is intended.
top20words_fig = (
    top20words[0:-1]
    .iplot(
        kind='bar',
        subplots=True,
        dimensions=(900,600),
        colors=["red",],
        title="Most Commonly Used Words",
        xTitle="Word",
        yTitle="Frequency",
        asFigure=True,
    )
)
top20words_fig.write_image("images/common_words.svg")
top20words_fig.show()

In [None]:
# Make images folder (no error if it already exists).
# NOTE(review): cells above this one already write into images/, so on a
# fresh kernel this has to run before them — consider moving it to the top.
os.makedirs("images", exist_ok=True)

In [None]:
# I wish there was a better way to make plot.ly text tables

header = dict(values=['Metric', 'Value'])

# Each label is kept next to its value so the two table columns cannot drift
# apart (the word and character totals used to be swapped against their labels).
stats_rows = [
    ("Total Number of Messages Sent", df_combined.content.count()),
    ("Number of Photos Sent", df_photos.sum()),
    ("Number of Videos Sent", df_videos.sum()),
    ("Total Number of Words Sent", df_combined.word_count.sum()),
    ("Total Number of Characters Sent", df_combined.character_count.sum()),
    ("Average Number of Messages Sent per Day", df_combined.groupby(['date'])['content'].count().mean()),
    ("Average Word Count per Message", df_combined.word_count.mean()),
    ("Average Character Count per Message", df_combined.character_count.mean()),
]

metric_col = [label for label, _ in stats_rows]

values_col = [value for _, value in stats_rows]

data = [go.Table(header=header,cells=dict(align='left',values=[metric_col, values_col]))]

fig = go.Figure(data)
fig.update_layout(width=600,
    height=600,)
fig.write_image("images/stats.svg")
fig.show()

In [None]:


# Message count per `sender` label, highest first, limited to the top six.
person_fig = df_combined.groupby(['sender'])['content'].count().reset_index(name='count').sort_values(['count'], ascending=False).set_index('sender').head(6)
# Replace the real labels with anonymous ones ("Person1", "Person2", ...) in rank order.
person_fig['sender_hidden'] = range(1, len(person_fig) + 1)
person_fig['sender_hidden'] = person_fig['sender_hidden'].apply(lambda x : 'Person' + str(x))
person_fig = person_fig.set_index('sender_hidden')
print(person_fig)
# asFigure takes a boolean; the string 'True' only worked because any
# non-empty string is truthy.
person_fig = person_fig.iplot(dimensions=(900,600),colors=["PaleGreen",],kind='bar',title="Messages Sent to Person",yTitle="Frequency",xTitle="Person",asFigure=True)
person_fig.write_image("images/person.svg")

person_fig.show()


In [None]:
def generate_markov_corpus(df_combined):
    """Build one lower-case, punctuation-free text with one message per line.

    The ``content`` strings are joined with newlines, every character that
    is neither a word character nor whitespace is removed, and the result is
    lower-cased.
    """
    messages = df_combined['content'].values.tolist()
    joined = '\n'.join(messages)
    return re.sub(r'[^\w\s]','',joined).lower()

corpus = generate_markov_corpus(df_combined)


In [None]:
import markovify
# Build the Markov model from the newline-joined corpus.
# NOTE(review): presumably NewlineText treats each line (one message) as a separate sentence — confirm.
text_model = markovify.NewlineText(corpus)


In [None]:
# Ten generated sentences, each requested with a length limit of 320.
markov_array = [text_model.make_short_sentence(320) for _ in range(10)]

In [None]:
# Save the generated sentences to 'mm.xlsx' in the working directory.
pd.Series(markov_array).to_excel('mm.xlsx')

In [None]:
header=dict(align='left',values=['No.', 'Generated Markov Chain'])
# Number the rows from the actual number of generated sentences, so both
# table columns stay the same length if the generation count changes.
row_numbers = list(range(1, len(markov_array) + 1))
fig2 = go.Figure(data=[go.Table(header=header,cells=dict( align='left',values=[row_numbers, markov_array]))])
fig2.update_layout(width=800,height=800,)

fig2.show()

In [None]:
# Zero-padded 'YYYY-MM' key: the groupby sorts these keys as text, and
# unpadded months put e.g. '2019-10' before '2019-2', so the bars came out
# in non-chronological order.
df_combined['month_year'] = df_combined['timestamp_hkt'].dt.strftime('%Y-%m')

message_conciseness = df_combined.groupby(['month_year'])['word_count'].mean().iplot(dimensions=(900,600),colors=["Aquamarine",],kind='bar',title="Message Words/Message over Time",yTitle="Average Words/Message",xTitle="Binned by Month",asFigure=True)
message_conciseness.write_image("images/message_conciseness.svg")
message_conciseness.show()


In [None]:
# Bar chart of the number of messages per 'month_year' bucket.
df_dateM_fig = (
    df_combined
    .groupby(['month_year'])['content']
    .count()
    .iplot(
        kind='bar',
        dimensions=(900,600),
        colors=["SeaGreen",],
        title="Texts Binned by Month",
        xTitle="Date",
        yTitle="Frequency",
        asFigure=True,
    )
)
df_dateM_fig.write_image("images/dateM_msgs.svg")
df_dateM_fig.show()

# LSTM TEXT GENERATION 

In [None]:
# import sys
# import numpy as np
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Dropout
# from keras.layers import LSTM
# from keras.callbacks import ModelCheckpoint
# from keras.utils import np_utils

In [None]:
# nlp_corpus = df_combined['content'].values.tolist()
# nlp_corpus = ".".join(nlp_corpus).lower()
# raw_text = nlp_corpus
# import io
# with io.open('aa.txt', "w", encoding="utf-8") as f:
#     f.write(raw_text)


In [None]:
# chars = sorted(list(set(raw_text)))
# char_to_int = dict((c, i) for i, c in enumerate(chars))
# int_to_char = dict((i, c) for i, c in enumerate(chars))
# # summarize the loaded data
# n_chars = len(raw_text)
# n_vocab = len(chars)
# print ("Total Characters: ", n_chars)
# print ("Total Vocab: ", n_vocab)
# # prepare the dataset of input to output pairs encoded as integers
# seq_length = 100
# dataX = []
# dataY = []
# for i in range(0, n_chars - seq_length, 1):
# 	seq_in = raw_text[i:i + seq_length]
# 	seq_out = raw_text[i + seq_length]
# 	dataX.append([char_to_int[char] for char in seq_in])
# 	dataY.append(char_to_int[seq_out])
# n_patterns = len(dataX)
# print ("Total Patterns: ", n_patterns)
# # reshape X to be [samples, time steps, features]
# X = numpy.reshape(dataX, (n_patterns, seq_length, 1))

# # normalize
# X = X / float(n_vocab)
# # one hot encode the output variable
# y = np_utils.to_categorical(dataY)

In [None]:


# # define the LSTM model
# model = Sequential()
# model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
# model.add(Dropout(0.2))
# model.add(Dense(y.shape[1], activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam')
# # define the checkpoint
# filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]
# # fit the model
# model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

In [None]:
# y = np_utils.to_categorical(dataY)
# # define the LSTM model
# model = Sequential()
# model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
# model.add(Dropout(0.2))
# model.add(Dense(y.shape[1], activation='softmax'))
# # load the network weights
# filename = "weights-improvement-01-2.7836.hdf5"
# model.load_weights(filename)
# model.compile(loss='categorical_crossentropy', optimizer='adam')
# # pick a random seed
# start = numpy.random.randint(0, len(dataX)-1)
# pattern = dataX[start]
# print ("Seed:")
# print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# # generate characters
# for i in range(1000):
# 	x = numpy.reshape(pattern, (1, len(pattern), 1))
# 	x = x / float(n_vocab)
# 	prediction = model.predict(x, verbose=0)
# 	index = numpy.argmax(prediction)
# 	result = int_to_char[index]
# 	seq_in = [int_to_char[value] for value in pattern]
# 	sys.stdout.write(result)
# 	pattern.append(index)
# 	pattern = pattern[1:len(pattern)]
# print ("\nDone.")