In [2]:
import re
import pandas as pd
import os
import sys

In [71]:
def get_date(message):
    """Extract the date (without the time) of every message header in a chat export.

    The header formats below are tried in order; the first format that
    matches anywhere in ``message`` is used for the whole text.

        pattern1: [dd/MM/YY HH.MM.SS] Sender: Messages
                  [dd/MM/YY HH.MM.SS AM/PM] Sender: Messages
        pattern2: MM/dd/YY, HH:MM - Sender: Messages
                  MM/dd/YY, HH:MM AM/PM - Sender: Messages
        pattern3: dd/MM/YY HH.MM - Sender: Messages
        pattern4: dd/MM/YYYY HH.MM - Sender: Messages   # not seen in real data yet

    Args:
        message (str): full text of the exported chat.

    Returns:
        list[str]: one ``'dd/MM/YY'`` string per header found, in order of
        appearance. The digits are copied as written (no zero padding);
        for pattern2 the first two fields are swapped so the day comes first.
        An empty list is returned (and a notice is printed) when no format
        matches.
    """
    # Each entry: (regex, month_first). Only capture groups 0-2 are used.
    formats = (
        # "( ?[AP]M)?" accepts " AM", " PM", "AM" and "PM". The previous
        # "( AM|PM)?" could not match " PM", so PM headers were skipped.
        (r'\[(\d{1,2})\/(\d{1,2})\/(\d{1,2}) (\d{1,2})\.(\d{1,2})\.(\d{1,2})?( ?[AP]M)?\]', False),
        # "[AP]" instead of "[A|P]": inside a character class "|" is a literal.
        (r'(\d{1,2})\/(\d{1,2})\/(\d{0,2})\, (\d{0,2}):(\d{1,2})( [AP]M)? -', True),
        (r'(\d{1,2})\/(\d{1,2})\/(\d{1,2}) (\d{1,2})\.(\d{1,2})', False),
        (r'(\d{1,2})\/(\d{1,2})\/(\d{1,4}) (\d{1,2})\.(\d{1,2})', False),
    )

    # Stop at the first format that matches instead of scanning the text
    # with all four regexes up front.
    for regex, month_first in formats:
        found = re.findall(regex, message)
        if not found:
            continue
        if month_first:
            return ['/'.join((groups[1], groups[0], groups[2])) for groups in found]
        return ['/'.join(groups[:3]) for groups in found]

    print("No pattern detected.")
    return []

In [72]:
def get_sender(message):
    """Extract the sender of every message header in a chat export.

    Two kinds of sender are recognised:
        * a phone number (contact hasn't been saved): text starting with
          ``+`` followed by a digit, ``+``, ``-`` or whitespace;
        * a name: text starting with a word character or ``.`` that follows
          the ``]`` or ``-`` closing the timestamp.

    In both cases the sender ends at the FIRST colon on the line, so a colon
    inside the message body (e.g. a URL) no longer leaks into the sender.
    Both kinds are collected in a single pass, so a chat that mixes saved and
    unsaved contacts keeps every sender in order of appearance.

    NOTE: detection is heuristic; body text that itself looks like
    ``- name:`` or ``+digits:`` still produces an extra entry.

    Args:
        message (str): full text of the exported chat.

    Returns:
        list[str]: the senders in order of appearance; an empty list when
        none is found.
    """
    sender_pattern = re.compile(
        r'(\+[\d+\-\s][^:\n]*):'        # group 1: phone number
        r'|[\]\-] ?([\w\.][^:\n]*):'    # group 2: saved contact name
    )
    # Exactly one of the two groups is set for each match.
    return [
        match.group(1) or match.group(2)
        for match in sender_pattern.finditer(message)
    ]

In [366]:
def get_messages(message):
    """Extract the message bodies from a chat export.

    Emoji in the listed Unicode ranges are removed first. The export is then
    treated as iOS when the text starts with ``[`` and as Android otherwise.

    iOS: whitespace runs are collapsed to one space, U+200E (left-to-right
    mark) is replaced by a space, and each body is the text after the first
    ``": "`` up to the next ``[`` or ``]``.
    NOTE: a body that itself contains ``[`` or ``]`` is cut at that bracket.

    Android: headers of the form ``M/d/yy, H:MM[ AM|PM] - Sender: `` at the
    start of a line separate the bodies. Lines of a multi-line body are
    concatenated without a separator. Lines without a ``Sender: `` part
    (system notices) keep their time prefix and stay attached to the
    surrounding text.

    Args:
        message (str): full text of the exported chat.

    Returns:
        list[str]: the message bodies in order of appearance. A body that
        consisted only of stripped emoji is kept as ``''`` (Android).
    """
    pattern = r': ([\d\w\s\W][^\[\]]+)' # messages pattern for iOS

    RE_EMOJI = re.compile("(["
                          "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          "\U0001F300-\U0001F5FF"  # symbols & pictographs
                          "\U0001F600-\U0001F64F"  # emoticons
                          "\U0001F680-\U0001F6FF"  # transport & map symbols
                          "\U0001F700-\U0001F77F"  # alchemical symbols
                          "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                          "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                          "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                          "\U0001FA00-\U0001FA6F"  # Chess Symbols
                          "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                          "\U00002702-\U000027B0"  # Dingbats
                          "])")

    message = RE_EMOJI.sub(r'', message)

    # iOS
    if message.startswith('['):
        message = " ".join(message.split()) # collapse line breaks / whitespace runs
        message = " ".join(message.split('\u200e')) # U+200E left-to-right mark
        return re.findall(pattern, message)

    # Android
    # Strip the date prefix while the line structure is still intact.
    # Doing it after the newlines were removed let the regex swallow a digit
    # that ended the previous message ("at 5" + "6/25/20, " -> "56/25/20, ").
    message = re.sub(r'^\d{1,2}\/\d{1,2}\/\d{1,2}, ', '', message,
                     flags=re.MULTILINE)
    # Split on "H:MM[ AM|PM] - Sender: " at the start of a line. The AM/PM
    # marker is optional so 24-hour exports are split as well; the sender may
    # not span a line break.
    parts = re.split(r'^\d{1,2}:\d{1,2}(?: [AP]M)? - (?:(?!\n)[\w\s])+: ',
                     message, flags=re.MULTILINE)
    parts = [part.replace('\n', '') for part in parts]
    # Only the chunk before the first header is a split artefact. Later empty
    # strings are real (emoji-only) messages and must stay, to keep the list
    # aligned with the dates and senders.
    if parts and parts[0] == '':
        parts = parts[1:]
    return parts

In [367]:
def convert_to_csv(dataframe, messages):
    """Write ``dataframe`` as a .csv file into ``dataset_csv/``.

    The output file is named after the source chat file: only the final
    extension of ``messages`` is replaced by ``.csv``. The ``dataset_csv``
    directory is created (relative to the current working directory) when
    it does not exist yet.

    Args:
        dataframe (pandas.DataFrame): data to write; the index is not written.
        messages (str): file name of the source export, e.g. ``'chat.txt'``.
    """
    path = 'dataset_csv'

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(path, exist_ok=True)

    # splitext swaps only the trailing extension; str.replace('.txt', '.csv')
    # rewrote every '.txt' occurring anywhere in the name.
    stem, _ext = os.path.splitext(messages)
    file = os.path.join(path, stem + '.csv')
    dataframe.to_csv(file, index=False)

In [373]:
def create_df(message):
    """Aggregate date, sender, and messages, then create a DataFrame.

    The three lists are produced independently by ``get_date``,
    ``get_sender`` and ``get_messages`` and combined positionally.

    Args:
        message (str): full text of the exported chat.

    Returns:
        pandas.DataFrame: columns ``timestamp`` (the date string from
        ``get_date``), ``sender`` and ``messages``; one row per position
        present in all three lists.

    Warns:
        UserWarning: when the three lists differ in length. ``zip`` stops at
        the shortest one, so rows may be missing or misaligned.
    """
    import warnings

    date = get_date(message)
    # get_sender may return None when nothing matches; zip() needs an iterable.
    sender = get_sender(message) or []
    messages = get_messages(message)

    if not (len(date) == len(sender) == len(messages)):
        warnings.warn(
            "date/sender/messages counts differ (%d/%d/%d); rows are "
            "truncated to the shortest list and may be misaligned."
            % (len(date), len(sender), len(messages))
        )

    df = pd.DataFrame(
            list(zip(date, sender, messages)),
            columns=['timestamp', 'sender', 'messages']
        )
    return df

In [374]:
def main(save_csv=False):
    """Build a DataFrame for every ``.txt`` chat export in ``./dataset``.

    Args:
        save_csv (bool): when True, each DataFrame is also written to
            ``dataset_csv/`` through ``convert_to_csv``. Defaults to False,
            which performs no writes.

    Returns:
        pandas.DataFrame or None: the DataFrame of the last file processed
        (in ``os.listdir`` order), or None when the folder holds no ``.txt``
        file.
    """
    dataset_dir = os.path.join(os.getcwd(), 'dataset')
    messages_list = [file for file in os.listdir(dataset_dir) if file.endswith('.txt')]

    # Stays None when there is nothing to process, instead of raising
    # UnboundLocalError at the return statement.
    df = None
    for messages in messages_list:
        with open(os.path.join(dataset_dir, messages), encoding='utf-8') as file:
            message = file.read()

        df = create_df(message)
        if save_csv:
            convert_to_csv(df, messages)
    return df

In [375]:
# Run the pipeline; the DataFrame returned by main() is shown as the cell output.
main()

Unnamed: 0,timestamp,sender,messages
0,25/6/20,Franklid Gunawan,Selamat siang Pak Imam. Mohon maaf mengganggu ...
1,25/6/20,Pak Imam,"Baik Franklid, saya bersedia. Terkait judul, t..."
2,25/6/20,Franklid Gunawan,Baik pak. Akan saya cari dan tambahkan. Terima...
3,13/8/20,Franklid Gunawan,Selamat siang Pak Imam. Mohon mqqf menggangu w...
4,14/8/20,Pak Imam,"Baik Franklid, saran saya Word Embeddingnya di..."
...,...,...,...
195,14/1/21,Franklid Gunawan,Selamat siang pak. Mohon maaf mengganggu. Saya...
196,14/1/21,Pak Imam,"Baik Franklid, noted.Terkait 2.b, berdasarkan ..."
197,14/1/21,Pak Imam,<Media omitted>
198,14/1/21,Pak Imam,"Saran saya, sebaiknya ditambahkan variasi misa..."


In [382]:
# Read the whole dataset/kakmen.txt export (UTF-8) into the global `message`.
with open(os.getcwd()+ '/dataset/kakmen.txt', encoding='utf-8') as file:
    message = file.read()

In [355]:
# Sanity check: count the whitespace-separated tokens of `message` that contain
# 'Gunawan:' (i.e. the last word of that sender's name followed by the colon).
count=0
for m in message.split():
    if 'Gunawan:' in m:
        count += 1
print(count)
# Last expression of the cell, displayed as its result.
# NOTE(review): presumably the per-sender message counts added up by hand — confirm.
98+102

102


200

In [359]:
# The three extractors should return lists of equal length, because create_df zips them.
print(len(get_date(message)), len(get_sender(message)), len(get_messages(message)))

200 200 200


In [357]:
def get_messages(message):
    """Extract the message bodies from an Android chat export.

    NOTE: this cell re-defines ``get_messages``. When the notebook is run top
    to bottom it replaces the earlier definition that also handles iOS
    exports.

    Emoji in the listed Unicode ranges are removed first. Headers of the form
    ``M/d/yy, H:MM[ AM|PM] - Sender: `` at the start of a line separate the
    bodies. Lines of a multi-line body are concatenated without a separator.
    Lines without a ``Sender: `` part (system notices) keep their time prefix
    and stay attached to the surrounding text.

    Args:
        message (str): full text of the exported chat.

    Returns:
        list[str]: the message bodies in order of appearance. A body that
        consisted only of stripped emoji is kept as ``''``.
    """
    RE_EMOJI = re.compile("(["
                          "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          "\U0001F300-\U0001F5FF"  # symbols & pictographs
                          "\U0001F600-\U0001F64F"  # emoticons
                          "\U0001F680-\U0001F6FF"  # transport & map symbols
                          "\U0001F700-\U0001F77F"  # alchemical symbols
                          "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                          "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                          "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                          "\U0001FA00-\U0001FA6F"  # Chess Symbols
                          "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                          "\U00002702-\U000027B0"  # Dingbats
                          "])")

    message = RE_EMOJI.sub(r'', message)

    # Strip the date prefix while the line structure is still intact.
    # Doing it after the newlines were removed let the regex swallow a digit
    # that ended the previous message ("at 5" + "6/25/20, " -> "56/25/20, ").
    message = re.sub(r'^\d{1,2}\/\d{1,2}\/\d{1,2}, ', '', message,
                     flags=re.MULTILINE)
    # Split on "H:MM[ AM|PM] - Sender: " at the start of a line. The AM/PM
    # marker is optional so 24-hour exports are split as well; the sender may
    # not span a line break.
    parts = re.split(r'^\d{1,2}:\d{1,2}(?: [AP]M)? - (?:(?!\n)[\w\s])+: ',
                     message, flags=re.MULTILINE)
    parts = [part.replace('\n', '') for part in parts]
    # Only the chunk before the first header is a split artefact. Later empty
    # strings are real (emoji-only) messages and must stay, to keep the list
    # aligned with the dates and senders.
    if parts and parts[0] == '':
        parts = parts[1:]
    return parts

In [384]:
# Inspect the extracted message bodies; the returned list is shown as the cell output.
get_messages(message)

[' Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. ',
 'Assalamualaikum Wr. Wb. Selamat pagi Bu Indri, mohon maaf mengganggu waktunya. Perkenalkan nama saya Mentari Adiza (TIF’16) dengan keminatan KC. Saya salah satu mahasiswa bimbingan skripsi ibu. Saya ingin bertanya terkait pelaksanaan p0 skripsi saya. Saya sudah melaksanakan p0 dengan dosen 1 saya yakni Pak Imam. Kira-kira apakah ibu ada waktu untuk p0 pada hari ini sebagai dosen 2 saya bu? Terima kasih Bu untuk sebelum dan sesudahnya.. Wassalamualaikum ',
 'Waalaikumsalam...iya bs 10.15-12.00 ',
 'baik bu terima kasihh ',
 'Assalamualaikum Wr. Wb. Selamat siang Bu, mohon maaf mengganggu waktunya. Saya ingin bertanya bu, kira-kira apakah ibu ada waktu untuk konsultasi terkait skripsi saya untuk hari ini? Terima kasih bu untuk sebelum dan sesudahnya.. Wassalamualaikum ',
 'Waalaikumsalam wr wb... ',
 'Iya ',
 'maaf bu sebelumnya untuk pengiriman dokumen dan poi