In [125]:
import pandas as pd
import numpy as np
import json
import re
import os
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [153]:
""" 1. list folder chatx DONE
    2. list file dalam folder
    
    EXPECTED:
    current dir + /parsed doc/ + chatx/ + sesix.txt    
"""
folder = os.getcwd() + '/parsed doc/'
chat_folders = [chat_folder for chat_folder in os.listdir(folder) if not chat_folder.startswith('.')]

paths = []
for i in range(len(chat_folders)):
    pattern = str(folder) + chat_folders[i]
    paths.append(pattern)
    
sesi_paths = []
for i in range(len(paths)):
    for roots, dirs, files in os.walk(paths[i]):
        if "checkpoints" not in roots:
            for file in files:
                sesi_paths.append(roots + '/' + file)

In [249]:
chats = [pd.read_csv(sesi, sep='|', names = ['sender', 'messages', 'sentiment']) for sesi in sesi_paths]

messages = [chats[i].messages.values.tolist() for i in range(len(sesi_paths))]

In [139]:
# sender = chats['sender'].values.tolist()
# messages = chats['messages'].values.tolist()
# sentiment = chats['sentiment'].values.tolist()

In [285]:
def tokenize(msg):
    """ This function will turns all sentences into token or word.
        Each messages will saved in a list.
    """
    token = [re.split('\s+', msg[i]) for i, chat in enumerate(msg)]
    return token

token=[]
for i in range(len(messages)):
    token.append(tokenize(messages[i]))

token = [tokenize(messages[i]) for i in range(len(messages))]

In [334]:
def casefolding(msg):
    """ This function will turn all the letters into lowercase. """
#     token_lower = [[token.lower() for token in i if token] for i in msg]
    token_lower = [token.lower() for token in msg if token]
    return token_lower

token_lower = []
for token_per_sesi in token:
    for token_per_chat in token_per_sesi:
        token_lower.append([casefolding(token_per_chat)])
# token_lower = [casefolding(token) for i in range(len(messages))]
# token_lower

In [345]:
def cleaning(msg):
    """ This function will remove unnecessaries values in the sentence i.e links, emails, and punctuations
    """
    link = r'(www\.[^\s]+)|(https?:\/\/[^\s]+)|(meet\.google\.[^\s]+)'
    email = r'([a-zA-Z0-9\.\_\-]+@+[a-zA-Z0-9.]+)'
    punct = r'[^a-zA-Z0-9\-\s\[\]<>]'
    
    token_clean = []
    for token in msg:
        # remove link
        temp = re.sub(link, '', token)
        # remove email
        temp = re.sub(email, '', temp)
        # remove punctuation
        temp = re.sub(punct, '', temp)
        temp = re.sub(r'\-', ' ', temp)
        # remove numbers
        temp = re.sub(r'\b[0-9]+\b\s*', '', temp)
#         token_per_chat.append(temp)
        token_clean.append(temp)
    return token_clean

# token_clean = cleaning(token_lower)
token_clean = []
for token_per_sesi in token_lower:
    for token_per_chat in token_per_sesi:
        token_clean.append([cleaning(token_per_chat)])
# token_clean

In [349]:
def normalize(msg):
    """ This function will normalize the tokens,
        it will turn the slang words or typos to its normal values.
        NOTE: you could add the values into json files.
    """
    with open('slang_words.json', 'r') as f:
        dict = json.load(f)
        
    normalized_chat = []
    for token in msg:
        slang_dict = {v:k for v, k in dict.items()}
#         token_per_chat = []
#         for token in i:
        normal = slang_dict.get(token, token)
#             token_per_chat.append(normal)
        normalized_chat.append(normal)
    return normalized_chat

# token_normal = normalize(token_clean)
token_normal = []
for token_per_sesi in token_clean:
    for token_per_chat in token_per_sesi:
        token_normal.append([normalize(token_per_chat)])
# token_normal

In [351]:
def filtering(msg):
    """ Filtering: removing stopwords from tokens.
        In this project, we will use tala stopwords list.
    """
    with open('stopword_list_tala.txt', 'r') as tala:
        stoplist = tala.read()
    token_filtered = [token for token in msg if not token in stoplist]
    return token_filtered

# token_filtered = filtering(token_normal)
token_filtered = []
for token_per_sesi in token_normal:
    for token_per_chat in token_per_sesi:
        token_filtered.append([filtering(token_per_chat)])
token_filtered

[[['[dosen]',
   'mohon',
   'maaf',
   'mengganggu',
   'nama',
   '[mhs]',
   'angkatan',
   'ketersediaan',
   'pembimbing',
   'skripsi',
   'memiliki',
   'topik',
   'sentimen',
   'analisis',
   'produk',
   'layanan',
   'indihome',
   'metode',
   'k nearest',
   'neighbor',
   'information',
   'topik',
   'diangkat',
   'penelitian',
   'skripsi',
   'terima',
   'kasih']],
 [['bersedia', 'topik', 'layak', 'posisinya', 'dimap', 'kajian', 'pustaka']],
 [['terima kasih', 'mohon', 'maaf', 'mekanisme', 'bimbingan', 'daring']],
 [['[mhs]',
   'silahkan',
   'dikirimkan',
   'dokumen',
   'bimbingan',
   'minimal',
   'mingguan',
   'pra',
   'proposalp0p1p2',
   'semhassidang',
   'tolong',
   'cantumkan',
   'detail',
   'content',
   'email',
   'lampirkan',
   'draft',
   'dokumen',
   'dokumen',
   'skripsi',
   'dlm',
   'docdocx',
   'power point',
   'p0p1p2semhassidang',
   'p0p1p2semhassidang',
   'contoh',
   'power point',
   'link',
   'grup',
   'kelas',
   'tinggal'

In [None]:
def stemming(msg):
    """ Stemming: returns words to its original form.
        Since non-alphanumeric will be discarded by using StemmerFactory(),
        This function will do stemming if the token values neither [dosen] nor [mhs].
    """
    stemmer = StemmerFactory().create_stemmer()
    token_stemmed = []
    for token_per_chat in msg:
        token = []
        for tokens in token_per_chat:
            if tokens.startswith('['):
                token.append(tokens)
            else: token.append(stemmer.stem(tokens))
        token_stemmed.append(token)
    return token_stemmed

token_stemmed = stemming(token_filtered)
token_stemmed

In [None]:
setToken = [set(token) for token in token_stemmed]
setToken

In [None]:
for i in setToken:
    print(len(i))