In [1]:
import pandas as pd
import numpy
import json
import os
import glob
import re
import plotly.express as px
from datetime import datetime, timedelta

In [3]:
def parse_single_json(path, name):
    # Open Json
    try:
        with open(path) as json_data:
            data = json.load(json_data)
            df = pd.DataFrame.from_dict(data['messages'])
            df = df[df['sender_name'] == name]
            return df
    except:
        pass
        print("JSON Loaded")
    # Create Dataframe from JSON

In [2]:
def get_paths():
    path_arr = {}
    cwd = str(os.getcwdb())[2:-1]
    for root, dirs, files in os.walk(cwd):
        for file in files:
            if file.endswith(".json"):
                #  path_arr.append(os.path.join(root, file))
                path_arr[os.path.join(root, file)] = os.path.join(root, file).split('\\')[-2][0:-11]
    return(path_arr)


In [4]:
name = "Ali Adnan"

In [5]:
# Create dataframe to store dataframes of msg data
dfs = []
for key,value in get_paths().items():
    # Populate dataframe with messages from {name} parsed from JSON
    # print(key)
    # print(value)
    try:
        data = parse_single_json(key, name)
    # print(data)
        data["sender"] = value
        dfs.append(data)
    except:
        pass


JSON Loaded


In [6]:
# Combine data from dataframes dataframe into one dataframe
df_combined = pd.concat(dfs, sort=True)

In [7]:
# Add more calculated collumns
df_combined['timestamp_ms'] = pd.to_datetime(df_combined['timestamp_ms'], unit='ms') # set timestamp datatype
df_combined['date'] = df_combined['timestamp_ms'].apply(lambda x: (x + timedelta(hours=8)).date()) # calculate date from timestamp
df_combined['day_of_week'] = df_combined['timestamp_ms'].dt.day_name() # calculate day of week from timestamp
df_combined['character_count'] = df_combined['content'].str.len() # calculate character count
df_combined['word_count'] = df_combined['content'].apply(lambda s : len(str(s).split(' '))) # calculate wordcount based on spaces


In [8]:
# Sort data by day
df_combined['day_of_week'] = pd.Categorical(df_combined['day_of_week'], categories=
    ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday'],
    ordered=True)


In [9]:
df_combined['content'] = df_combined['content'].apply(lambda x : str(x).replace(r'[^\w\s]+', ''))


In [10]:
# Drop all unneeded columns & reset index
df_combined = df_combined.reset_index(drop=False)
print(df_combined)
df_photos = df_combined[df_combined['photos'].isnull() == 0]
df_videos = df_combined[df_combined['videos'].isnull() == 0]
df_combined = df_combined.drop(['audio_files', 'call_duration', 'files','gifs','missed','reactions','share','sticker','users','videos','photos','type','index'], axis=1)


index audio_files  call_duration                      content files  \
0           3         NaN            NaN             have a good day!   NaN   
1           4         NaN            NaN               nevermind then   NaN   
2           5         NaN            NaN                    Ahhh okok   NaN   
3           8         NaN            NaN  is it 100usd or hkd on ebay   NaN   
4          10         NaN            NaN           Is this available?   NaN   
...       ...         ...            ...                          ...   ...   
145401      2         NaN            NaN                   Ok lets go   NaN   
145402      5         NaN            NaN                       ur mum   NaN   
145403     10         NaN            NaN               No not really.   NaN   
145404     13         NaN            NaN             I'm a Crocodile.   NaN   
145405     21         NaN            NaN                          HII   NaN   

       gifs missed photos reactions                      se

In [11]:
# Drop all content null rows
df_combined = df_combined[df_combined["content"].isnull() == 0]


In [12]:

df_photos = df_photos['photos'].apply(lambda x : len(x))
df_videos = df_videos['videos'].apply(lambda x : len(x))


In [13]:
df_wordcount_series = df_combined['content'].str.split(expand=True).stack()
df_wordcount_series = df_wordcount_series.str.replace(r'[^\w\s]+', '')
df_wordcount_series = df_wordcount_series.str.lower()
df_wordcount_series = df_wordcount_series.value_counts()



In [14]:
top20words = df_wordcount_series.iloc[0:20]
print(top20words)

i       17451
you     11572
like    10337
dude    10333
to       9456
the      9429
and      8158
a        7888
it       7422
but      6774
is       6382
so       5180
im       4552
u        4507
yeah     4497
its      4412
that     4303
do       4294
in       4137
nan      3758
dtype: int64


In [15]:
from datetime import datetime, timedelta

df_combined['timestamp_hkt'] = df_combined['timestamp_ms'].apply(lambda x: (x + timedelta(hours=8))) # set timestamp datatype

print(df_combined)

content                      sender sender_name  \
0                  have a good day!  alirolanda33midicontroller   Ali Adnan   
1                    nevermind then  alirolanda33midicontroller   Ali Adnan   
2                         Ahhh okok  alirolanda33midicontroller   Ali Adnan   
3       is it 100usd or hkd on ebay  alirolanda33midicontroller   Ali Adnan   
4                Is this available?  alirolanda33midicontroller   Ali Adnan   
...                             ...                         ...         ...   
145401                   Ok lets go                    zootopia   Ali Adnan   
145402                       ur mum                    zootopia   Ali Adnan   
145403               No not really.                    zootopia   Ali Adnan   
145404             I'm a Crocodile.                    zootopia   Ali Adnan   
145405                          HII                    zootopia   Ali Adnan   

                  timestamp_ms        date day_of_week  character_count  \
0   

In [16]:
import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [17]:
df_combined['hour_of_day'] = df_combined['timestamp_hkt'].apply(lambda x : x.strftime('%H'))
df_hour_fig = df_combined.groupby(['hour_of_day'])['content'].count().iplot(dimensions=(900,600),colors=["DarkOrchid",],kind='bar',title="Texts on Hour",yTitle="Frequency",xTitle="Hour",asFigure=True)
df_hour_fig.write_image("images/hour_msgs.svg")
df_hour_fig.show()

In [18]:
df_date_fig = df_combined.groupby(['date'])['content'].count().iplot(dimensions=(900,600),colors=["MediumTurquoise",],kind='bar',title="Texts on Day",yTitle="Frequency",xTitle="Day",asFigure=True)
df_date_fig.write_image("images/date_msgs.svg")
df_date_fig.show()

In [19]:
df_date_rollingsum = df_combined.groupby(['date'])['content'].count().cumsum()
df_date_rollingsum_fig = df_date_rollingsum.iplot(dimensions=(900,600),colors=["MediumSpringGreen",],kind='area',fill=True,title="Cumulative Messages Over Time ",yTitle="Frequency",xTitle="Day",asFigure=True)
df_date_rollingsum_fig.write_image("images/cumu_msgs.svg")
df_date_rollingsum_fig.show()

In [20]:
messages_on_day = df_combined.groupby(['day_of_week'])['content'].count().iplot(dimensions=(900,600),colors=["Aquamarine",],kind='bar',title="Messages on Day",yTitle="Frequency",xTitle="Day",asFigure=True)
messages_on_day.write_image("images/messages_on_day.svg")
messages_on_day.show()

In [22]:
import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [23]:
top20words_fig = top20words[0:-1].iplot(dimensions=(900,600),subplots=True,colors=["red",],kind='bar',title="Most Commonly Used Words",yTitle="Frequency",xTitle="Word",asFigure=True)
top20words_fig.write_image("images/common_words.svg")
top20words_fig.show()

In [21]:

if not os.path.exists("images"):
    os.mkdir("images")

In [25]:

data={'Metric':["Total Number of Messages Sent","Number of Photos Sent","Number of Videos Sent","Total Number of Words Sent","Total Number of Characters Sent","Average Number of Messages Sent per Day","Average Word Count per Message", "Average Character Count per Message"], 'Value':[df_combined.content.count(),df_photos.sum(),df_videos.sum(),df_combined.character_count.sum(),df_combined.word_count.sum(),df_combined.groupby(['date'])['content'].count().mean(),df_combined.word_count.mean(), df_combined.character_count.mean()]}
dd=pd.DataFrame(data,columns=['Index','Metric','Value'])


In [24]:
import plotly.graph_objects as go
import psutil

fig = go.Figure(data=[go.Table(header=dict(values=['Metric', 'Value']),cells=dict( align='left',values=[["Total Number of Messages Sent","Number of Photos Sent","Number of Videos Sent","Total Number of Words Sent","Total Number of Characters Sent","Average Number of Messages Sent per Day","Average Word Count per Message", "Average Character Count per Message"], [df_combined.content.count(),df_photos.sum(),df_videos.sum(),df_combined.character_count.sum(),df_combined.word_count.sum(),df_combined.groupby(['date'])['content'].count().mean(),df_combined.word_count.mean(), df_combined.character_count.mean()]]))])
fig.update_layout(width=600,
    height=600,)
fig.write_image("images/markov_gen.svg")
fig.show()


In [26]:


person_fig = df_combined.groupby(['sender'])['content'].count().reset_index(name='count').sort_values(['count'], ascending=False).set_index('sender').head(6)
person_fig['sender_hidden'] = range(1, len(person_fig) + 1)
person_fig['sender_hidden'] = person_fig['sender_hidden'].apply(lambda x : 'Person' + str(x))
person_fig = person_fig.set_index('sender_hidden')
print(person_fig)
person_fig = person_fig.iplot(dimensions=(900,600),colors=["PaleGreen",],kind='bar',title="Messages Sent to Person",yTitle="Frequency",xTitle="Person",asFigure='True')
person_fig.write_image("images/person.svg")

person_fig.show()


count
sender_hidden       
Person1        83977
Person2        20248
Person3         8170
Person4         6493
Person5         3427
Person6         2884


In [27]:
words_list = df_combined['content'].values.tolist()
corpus = '\n'.join(words_list)
corpus = re.sub(r'[^\w\s]','',corpus)
corpus = corpus.lower()
print(corpus)


ith your name and application number
oh and regarding this polyu has a bunch of activities for international students after registration just to get you settled in before the semester starts
ill check first with the international affairs office who deals with campus tours and registration for international student affairs and ill see what i can do for you 
of may
ok do you have any specific dates in mind
ahhh are you looking to visit the polyu campus during the holidays
yes how may i help you
hi
im only on this stupid page because im salty uoft didnt accept me confess
you as well 
good luck with it thooo
i am busy 0
sorry mann i already bought it a while ago
goodnight
i need sleep
its getting late
little twat
frkn kwong
we have chem so it sucks for now
wait no
great
uhhhh
ill check in the morning
idk then
whet
it starts at 1130
uhhh
vasu
i might swing by later this week
okok
how much
oh nice
hi do you sell the seiko skx007
say hi to your new facebook friend willy
you turned off all mes

In [28]:
import markovify
text_model = markovify.NewlineText(corpus)


In [29]:
markov_array = []
for i in range(10):
    markov_array.append(text_model.make_short_sentence(320))

In [30]:
pd.Series(markov_array).to_excel('mm.xlsx')

In [31]:
fig2 = go.Figure(data=[go.Table(header=dict(align='left',values=['No.', 'Generated Markov Chain']),cells=dict( align='left',values=[list(range(1,11)), markov_array]))])
fig2.update_layout(width=800,
    height=800,)
fig2.show()

In [None]:
df_combined['month_year'] = df_combined['timestamp_hkt'].apply(lambda x : x.strftime('%Y-%B '))

# LSTM TEXT GENERATION 

In [32]:
# import sys
# import numpy as np
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Dropout
# from keras.layers import LSTM
# from keras.callbacks import ModelCheckpoint
# from keras.utils import np_utils

In [33]:
# nlp_corpus = df_combined['content'].values.tolist()
# nlp_corpus = ".".join(nlp_corpus).lower()
# raw_text = nlp_corpus
# import io
# with io.open('aa.txt', "w", encoding="utf-8") as f:
#     f.write(raw_text)


In [34]:
# chars = sorted(list(set(raw_text)))
# char_to_int = dict((c, i) for i, c in enumerate(chars))
# int_to_char = dict((i, c) for i, c in enumerate(chars))
# # summarize the loaded data
# n_chars = len(raw_text)
# n_vocab = len(chars)
# print ("Total Characters: ", n_chars)
# print ("Total Vocab: ", n_vocab)
# # prepare the dataset of input to output pairs encoded as integers
# seq_length = 100
# dataX = []
# dataY = []
# for i in range(0, n_chars - seq_length, 1):
# 	seq_in = raw_text[i:i + seq_length]
# 	seq_out = raw_text[i + seq_length]
# 	dataX.append([char_to_int[char] for char in seq_in])
# 	dataY.append(char_to_int[seq_out])
# n_patterns = len(dataX)
# print ("Total Patterns: ", n_patterns)
# # reshape X to be [samples, time steps, features]
# X = numpy.reshape(dataX, (n_patterns, seq_length, 1))

# # normalize
# X = X / float(n_vocab)
# # one hot encode the output variable
# y = np_utils.to_categorical(dataY)

In [35]:


# # define the LSTM model
# model = Sequential()
# model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
# model.add(Dropout(0.2))
# model.add(Dense(y.shape[1], activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam')
# # define the checkpoint
# filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]
# # fit the model
# model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

In [36]:
# y = np_utils.to_categorical(dataY)
# # define the LSTM model
# model = Sequential()
# model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
# model.add(Dropout(0.2))
# model.add(Dense(y.shape[1], activation='softmax'))
# # load the network weights
# filename = "weights-improvement-01-2.7836.hdf5"
# model.load_weights(filename)
# model.compile(loss='categorical_crossentropy', optimizer='adam')
# # pick a random seed
# start = numpy.random.randint(0, len(dataX)-1)
# pattern = dataX[start]
# print ("Seed:")
# print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# # generate characters
# for i in range(1000):
# 	x = numpy.reshape(pattern, (1, len(pattern), 1))
# 	x = x / float(n_vocab)
# 	prediction = model.predict(x, verbose=0)
# 	index = numpy.argmax(prediction)
# 	result = int_to_char[index]
# 	seq_in = [int_to_char[value] for value in pattern]
# 	sys.stdout.write(result)
# 	pattern.append(index)
# 	pattern = pattern[1:len(pattern)]
# print ("\nDone.")