In [None]:
import json
import pandas as pd
import numpy as np
import nltk

In [None]:
# Load the exported conversation. The JSON is expected to expose a top-level
# 'messages' list holding one dict per message.
json_path = 'message.json'
with open(json_path, encoding='utf-8') as file:
    data = json.load(file)

# Build the frame in a single call from the list of message dicts.
# Growing it row by row with DataFrame.append was quadratic in the number of
# messages, and DataFrame.append no longer exists in pandas >= 2.0.
# Keys missing from a given message simply become NaN in that row.
df = pd.DataFrame(data['messages'])

# Derive calendar features from the millisecond epoch timestamp.
df['date'] = pd.to_datetime(df['timestamp_ms'], unit='ms')
df['hour'] = df['date'].dt.hour
df['month'] = df['date'].dt.month
df['dayofweek'] = df['date'].dt.dayofweek

df.head()

In [None]:
# Participant names are read positionally from the export.
# NOTE(review): assumes index 0 is "her" and index 1 is "me" — confirm against the JSON.
her = data['participants'][0]['name']
me = data['participants'][1]['name']

# Per-sender slices of the frame and the headline message counts.
total_nbr_messages = df.shape[0]
my_messages = df[df['sender_name'] == me]
her_messages = df[df['sender_name'] == her]
total_nbr_my_messages = my_messages.shape[0]
total_nbr_her_messages = her_messages.shape[0]

# Helper column of ones so that grouped counts have a column to aggregate.
df['occ'] = 1

# Message counts per sender, broken down by weekday, month and hour of day.
messages_by_day, messages_by_month, messages_by_hour = (
    df[['sender_name', period, 'occ']].groupby(['sender_name', period]).agg(['count'])
    for period in ('dayofweek', 'month', 'hour')
)

# Call statistics: only rows whose call_duration is greater than 1 are kept.
calls = df[df['call_duration'] > 1]
total_nbr_calls = calls.shape[0]
max_call_duration = calls['call_duration'].max()
min_call_duration = calls['call_duration'].min()
mean_call_duration = calls['call_duration'].mean()

# Number of rows flagged as missed.
total_nbr_missed_calls = len(df[df['missed'] == True])



In [None]:
def cleanTxt(text, stopwords_path='SmartStoplist.txt', tokenizer=None):
    """Tokenize ``text`` and keep only lowercase, alphabetic, non-stopword tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    stopwords_path : str, optional
        Path of a UTF-8 file listing one stopword per line.
    tokenizer : callable, optional
        Function turning a string into an iterable of tokens. Defaults to
        ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        The surviving tokens, lowercased, in their original order.

    Raises
    ------
    OSError
        If the stopword file cannot be opened.
    """
    if tokenizer is None:
        tokenizer = nltk.word_tokenize

    # The context manager closes the file; a set gives O(1) membership tests.
    with open(stopwords_path, encoding="utf8") as stop_file:
        stopwords = {line.strip().lower() for line in stop_file if line.strip()}

    cleaned_tokens = []
    for token in tokenizer(text):
        # Lowercase BEFORE the stopword lookup so that capitalised stopwords
        # ("The", "And", ...) are filtered as well.
        lowered = token.lower()
        if token.isalpha() and lowered not in stopwords:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

def freq(listOfTokens):
    """Count the occurrences of every token.

    Parameters
    ----------
    listOfTokens : iterable of hashable
        Tokens to count.

    Returns
    -------
    dict
        Maps each distinct token to its number of occurrences; keys appear in
        first-seen order.
    """
    counts = {}
    for tok in listOfTokens:
        # A token not seen before starts from zero.
        counts[tok] = counts.get(tok, 0) + 1
    return counts

def getAllMessagesInList(sender, frame=None):
    """Return the text of every "Generic" message written by ``sender``.

    Parameters
    ----------
    sender : str
        Value to match against the ``sender_name`` column (``her`` or ``me``).
    frame : pandas.DataFrame, optional
        Frame with ``type``, ``sender_name`` and ``content`` columns. Defaults
        to the notebook-level ``df``.

    Returns
    -------
    list of str
        Message contents in frame order. Rows with no content are skipped.
    """
    if frame is None:
        frame = df

    # Missing contents are NaN, not the string 'nan'. Comparing the raw value
    # with 'nan' let them through and str() then turned them into the literal
    # text "nan". Filter them with notna() instead.
    keep = (
        (frame['type'] == "Generic")
        & (frame['sender_name'] == sender)
        & frame['content'].notna()
    )
    return frame.loc[keep, 'content'].astype(str).tolist()

# From here on my_messages / her_messages hold plain lists of message texts
# (they were DataFrame slices in the statistics cell).
my_messages = getAllMessagesInList(sender=me)
her_messages = getAllMessagesInList(sender=her)

def getMostUsedWords(messages_content, k):
    """Return the ``k`` most frequent tokens found in the given messages.

    Parameters
    ----------
    messages_content : iterable of str
        Message texts; they are joined with newlines before being passed to
        ``cleanTxt``.
    k : int
        Number of entries to keep.

    Returns
    -------
    list of (str, int)
        ``(token, count)`` pairs sorted by decreasing count.
    """
    counts = freq(cleanTxt('\n'.join(messages_content)))
    ranked = list(counts.items())
    # Stable sort: tokens with equal counts keep their first-seen order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:k]
    
def getLonguestTextMessage(messages_content):
    """Find the longest message, measured in characters.

    Parameters
    ----------
    messages_content : iterable of str
        Message texts to inspect.

    Returns
    -------
    tuple of (int, str)
        ``(length, message)`` of the longest message; the first one wins on
        ties. Returns ``(0, '')`` when there are no messages or when every
        message is empty, instead of failing on an unassigned variable.
    """
    # max() returns the first maximal element, matching a strict '>' scan;
    # default='' covers the empty-input case.
    max_msg = max(messages_content, key=len, default='')
    return (len(max_msg), max_msg)
    
# Character length and text of the longest entry in my_messages.
max_len, max_msg = getLonguestTextMessage(messages_content=my_messages)

def getAverageMessagesLength(messages_content):
    """Compute the average number of whitespace-separated words per message.

    Parameters
    ----------
    messages_content : sequence of str
        Message texts.

    Returns
    -------
    float or None
        Total word count divided by the number of messages, or ``None`` when
        the sequence is empty.
    """
    message_count = len(messages_content)
    if message_count == 0:
        return None
    word_total = sum(len(message.split()) for message in messages_content)
    return word_total / message_count

# Average words per message over my_messages (None if the list is empty).
average_length = getAverageMessagesLength(messages_content=my_messages)