In [165]:
import pandas as pd
import numpy
import json
import os
import glob
import re
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from matplotlib import rcParams
from datetime import datetime, timedelta

In [166]:
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Roboto']


In [167]:
def parse_single_json(path, name):
    # Open Json
    try:
        with open(path) as json_data:
            data = json.load(json_data)
            df = pd.DataFrame.from_dict(data['messages'])
            df = df[df['sender_name'] == name]
            return df
    except:
        pass
        print("JSON Loaded")
    # Create Dataframe from JSON

In [168]:
os.getcwdb()

b'c:\\Users\\aliad\\OneDrive\\Documents\\proj_\\fb_message_analysis'

In [169]:
def get_paths():
    path_arr = {}
    cwd = str(os.getcwdb())[2:-1]
    for root, dirs, files in os.walk(cwd):
        for file in files:
            if file.endswith(".json"):
                #  path_arr.append(os.path.join(root, file))
                path_arr[os.path.join(root, file)] = os.path.join(root, file).split('\\')[-2][0:-11]
    return(path_arr)


In [170]:
name = "Ali Adnan"

In [171]:
# Create dataframe to store dataframes of msg data
dfs = []
for key,value in get_paths().items():
    # Populate dataframe with messages from {name} parsed from JSON
    # print(key)
    # print(value)
    try:
        data = parse_single_json(key, name)
    # print(data)
        data["sender"] = value
        dfs.append(data)
    except:
        pass


JSON Loaded


In [172]:
# Combine data from dataframes dataframe into one dataframe
df_combined = pd.concat(dfs, sort=True)

In [173]:
# Add more calculated collumns
df_combined['timestamp_ms'] = pd.to_datetime(df_combined['timestamp_ms'], unit='ms') # set timestamp datatype
df_combined['date'] = df_combined['timestamp_ms'].apply(lambda x: (x + timedelta(hours=8)).date()) # calculate date from timestamp
df_combined['day_of_week'] = df_combined['timestamp_ms'].dt.day_name() # calculate day of week from timestamp
df_combined['character_count'] = df_combined['content'].str.len() # calculate character count
df_combined['word_count'] = df_combined['content'].apply(lambda s : len(str(s).split(' '))) # calculate wordcount based on spaces


In [174]:
# Sort data by day
df_combined['day_of_week'] = pd.Categorical(df_combined['day_of_week'], categories=
    ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday'],
    ordered=True)


In [175]:
df_combined['content'] = df_combined['content'].apply(lambda x : str(x).replace(r'[^\w\s]+', ''))


In [176]:
# Drop all unneeded columns & reset index
df_combined = df_combined.reset_index(drop=False)
print(df_combined)
df_photos = df_combined[df_combined['photos'].isnull() == 0]
df_videos = df_combined[df_combined['videos'].isnull() == 0]
df_combined = df_combined.drop(['audio_files', 'call_duration', 'files','gifs','missed','reactions','share','sticker','users','videos','photos','type','index'], axis=1)


index audio_files  call_duration                      content files  \
0           3         NaN            NaN             have a good day!   NaN   
1           4         NaN            NaN               nevermind then   NaN   
2           5         NaN            NaN                    Ahhh okok   NaN   
3           8         NaN            NaN  is it 100usd or hkd on ebay   NaN   
4          10         NaN            NaN           Is this available?   NaN   
...       ...         ...            ...                          ...   ...   
145401      2         NaN            NaN                   Ok lets go   NaN   
145402      5         NaN            NaN                       ur mum   NaN   
145403     10         NaN            NaN               No not really.   NaN   
145404     13         NaN            NaN             I'm a Crocodile.   NaN   
145405     21         NaN            NaN                          HII   NaN   

       gifs missed photos reactions                      se

In [177]:
# Drop all content null rows
df_combined = df_combined[df_combined["content"].isnull() == 0]


In [178]:
df_photos = df_photos['photos'].apply(lambda x : len(x))
print(df_photos)

9         5
24        1
28        1
125       1
128       1
         ..
145322    1
145323    1
145324    1
145325    1
145359    1
Name: photos, Length: 3113, dtype: int64


In [179]:
df_videos = df_videos['videos'].apply(lambda x : len(x))
print(df_videos)

1392      1
1967      1
4073      1
5423      1
7785      1
         ..
140703    1
140708    1
140717    1
140727    1
144944    1
Name: videos, Length: 141, dtype: int64


In [180]:
print(df_combined.count())


content            145406
sender             145406
sender_name        145406
timestamp_ms       145406
date               145406
day_of_week        145406
character_count    141649
word_count         145406
dtype: int64


In [181]:
df_wordcount_series = df_combined['content'].str.split(expand=True)
print(df_wordcount_series)

0     1           2     3     4     5     6     7     8     9    \
0            have     a        good  day!  None  None  None  None  None  None   
1       nevermind  then        None  None  None  None  None  None  None  None   
2            Ahhh  okok        None  None  None  None  None  None  None  None   
3              is    it      100usd    or   hkd    on  ebay  None  None  None   
4              Is  this  available?  None  None  None  None  None  None  None   
...           ...   ...         ...   ...   ...   ...   ...   ...   ...   ...   
145401         Ok  lets          go  None  None  None  None  None  None  None   
145402         ur   mum        None  None  None  None  None  None  None  None   
145403         No   not     really.  None  None  None  None  None  None  None   
145404        I'm     a  Crocodile.  None  None  None  None  None  None  None   
145405        HII  None        None  None  None  None  None  None  None  None   

        ...   844   845   846   847   848

In [182]:
df_wordcount_series = df_wordcount_series.stack()
print(df_wordcount_series)

0       0          have
        1             a
        2          good
        3          day!
1       0     nevermind
                ...    
145403  2       really.
145404  0           I'm
        1             a
        2    Crocodile.
145405  0           HII
Length: 542664, dtype: object


In [183]:
df_wordcount_series = df_wordcount_series.str.replace(r'[^\w\s]+', '')
df_wordcount_series = df_wordcount_series.str.lower()
print(df_wordcount_series)

0       0         have
        1            a
        2         good
        3          day
1       0    nevermind
               ...    
145403  2       really
145404  0           im
        1            a
        2    crocodile
145405  0          hii
Length: 542664, dtype: object


In [184]:
df_wordcount_series = df_wordcount_series.value_counts()
print(df_wordcount_series)

i                                               17451
you                                             11572
like                                            10337
dude                                            10333
to                                               9456
                                                ...  
awwwww                                              1
httpswwwsearchmybiosearchqhong20kong20collab        1
twinkle                                             1
webseries                                           1
ovens                                               1
Length: 20574, dtype: int64


In [185]:
df_wordcount_series['fuck']

2253

In [186]:
top20words = df_wordcount_series.iloc[0:20]
print(top20words)

i       17451
you     11572
like    10337
dude    10333
to       9456
the      9429
and      8158
a        7888
it       7422
but      6774
is       6382
so       5180
im       4552
u        4507
yeah     4497
its      4412
that     4303
do       4294
in       4137
nan      3758
dtype: int64


In [187]:
maxword = 'i'
maxword2 = 'i'
maxword3 = 'i'
for i in range(len(df_wordcount_series)):
    if len(df_wordcount_series.index[i]) > len(maxword):
        maxword3 = maxword2
        maxword2 = maxword

        maxword = df_wordcount_series.index[i]

print(maxword)
print(maxword2)
print(maxword3)




aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa

In [188]:
maxword = 'i'
maxword2 = 'i'
maxword3 = 'i'
for i in range(len(df_wordcount_series)):
    if len(df_wordcount_series.index[i]) > len(maxword):
        maxword3 = maxword2
        maxword2 = maxword

        maxword = df_wordcount_series.index[i]

print(maxword)
print(maxword2)
print(maxword3)




aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa

In [189]:
from datetime import datetime, timedelta

df_combined['timestamp_hkt'] = df_combined['timestamp_ms'].apply(lambda x: (x + timedelta(hours=8))) # set timestamp datatype

print(df_combined)

content                      sender sender_name  \
0                  have a good day!  alirolanda33midicontroller   Ali Adnan   
1                    nevermind then  alirolanda33midicontroller   Ali Adnan   
2                         Ahhh okok  alirolanda33midicontroller   Ali Adnan   
3       is it 100usd or hkd on ebay  alirolanda33midicontroller   Ali Adnan   
4                Is this available?  alirolanda33midicontroller   Ali Adnan   
...                             ...                         ...         ...   
145401                   Ok lets go                    zootopia   Ali Adnan   
145402                       ur mum                    zootopia   Ali Adnan   
145403               No not really.                    zootopia   Ali Adnan   
145404             I'm a Crocodile.                    zootopia   Ali Adnan   
145405                          HII                    zootopia   Ali Adnan   

                  timestamp_ms        date day_of_week  character_count  \
0   

In [190]:
import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [191]:
df_combined['hour_of_day'] = df_combined['timestamp_hkt'].apply(lambda x : x.strftime('%H'))
df_combined.groupby(['hour_of_day'])['content'].count().iplot(dimensions=(1500,1000),colors=["DarkOrchid",],kind='bar',title="Texts on Hour",yTitle="Frequency",xTitle="Hour")

In [192]:
df_combined.groupby(['date'])['content'].count().iplot(dimensions=(1500,1000),colors=["MediumTurquoise",],kind='bar',title="Texts on Day",yTitle="Frequency",xTitle="Day")

In [193]:
df_combined.groupby(['day_of_week'])['content'].count().iplot(dimensions=(1500,1000),colors=["Aquamarine",],kind='bar',title="Messages on Day",yTitle="Frequency",xTitle="Day")

In [194]:
import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [195]:
top20words.iplot(dimensions=(1500,1000),subplots=True,colors=["red",],kind='bar',title="Most Commonly Used Words",yTitle="Frequency",xTitle="Word")

In [None]:
import plotly.graph_objects as go
fig = go.Figure(data=[go.Table(header=dict(values=['Metric', 'Value']),cells=dict( align='left',values=[["Total Number of Messages Sent","Number of Photos Sent","Number of Videos Sent","Total Number of Words Sent","Total Number of Characters Sent","Average Number of Messages Sent per Day","Average Word Count per Message", "Average Character Count per Message"], [df_combined.content.count(),df_photos.sum(),df_videos.sum(),df_combined.character_count.sum(),df_combined.word_count.sum(),df_combined.groupby(['date'])['content'].count().mean(),df_combined.word_count.mean(), df_combined.character_count.mean()]]))])
fig.show()


In [197]:
# df_combined.groupby(['sender'])['content'].count().reset_index(name='count') \
#                              .sort_values(['count'], ascending=False) \
#                              .head(5).iplot(colors=["LightSeaGreen",],kind='bar',title="Texts on Hour",yTitle="Frequency",xTitle="Hour")

df_combined.groupby(['sender'])['content'].count().reset_index(name='count').sort_values(['count'], ascending=False).set_index('sender').head(6).iplot(dimensions=(1500,1000),colors=["PaleGreen",],kind='bar',title="Messages Sent to Person",yTitle="Frequency",xTitle="Person")

In [198]:
a = "aaabaaa?"
a = a.replace(r'[^\w\s]+', '')
print(a)

aaabaaa?


In [199]:
words_list = df_combined['content'].values.tolist()
corpus = '\n'.join(words_list)
corpus = re.sub(r'[^\w\s]','',corpus)
print(corpus)


ith your name and application number
Oh and regarding this PolyU has a bunch of activities for international students after registration just to get you settled in before the semester starts
Ill check first with the International Affairs Office who deals with campus tours and registration for international student affairs and Ill see what I can do for you 
Of May
Ok Do you have any specific dates in mind
Ahhh are you looking to visit the PolyU campus during the holidays
Yes How may I help you
Hi
Im only on this stupid page because Im salty UofT didnt accept me confess
you as well 
good luck with it thooo
i am busy 0
sorry mann i already bought it a while ago
Goodnight
I need sleep
Its getting late
Little twat
Frkn kwong
We have chem so it sucks for now
Wait no
Great
Uhhhh
Ill check in the morning
Idk then
Whet
It starts at 1130
Uhhh
Vasu
I might swing by later this week
Okok
How much
Oh nice
Hi Do you sell the Seiko SKX007
Say hi to your new Facebook friend Willy
You turned off all mes

In [200]:
import markovify
text_model = markovify.NewlineText(corpus)


In [201]:
markov_array = []
for i in range(15):
    markov_array.append(text_model.make_short_sentence(320))

In [202]:
fig2 = go.Figure(data=[go.Table(header=dict(align='left',values=['No.', 'Generated Markov Chain']),cells=dict( align='left',values=[list(range(15)), markov_array]))])
fig2.show()

# LSTM TEXT GENERATION 

In [203]:
import sys
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [204]:
nlp_corpus = df_combined['content'].values.tolist()
nlp_corpus = ".".join(nlp_corpus).lower()
raw_text = nlp_corpus
import io
with io.open('aa.txt', "w", encoding="utf-8") as f:
    f.write(raw_text)


In [205]:
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))
# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
	seq_in = raw_text[i:i + seq_length]
	seq_out = raw_text[i + seq_length]
	dataX.append([char_to_int[char] for char in seq_in])
	dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))

# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

Total Characters:  2736057
Total Vocab:  150


In [None]:


# # define the LSTM model
# model = Sequential()
# model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
# model.add(Dropout(0.2))
# model.add(Dense(y.shape[1], activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam')
# # define the checkpoint
# filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]
# # fit the model
# model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

In [None]:
y = np_utils.to_categorical(dataY)
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
# load the network weights
filename = "weights-improvement-01-2.7836.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')
# pick a random seed
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
	x = numpy.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = numpy.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print ("\nDone.")