In [1]:
import pandas as pd
import numpy as np
import json
import re
import os
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

### 1. parsing dokumen

In [2]:
## read each chats session folder path
""" 1. list folder chatx DONE
    2. list file dalam folder
    
    EXPECTED:
    current dir + /parsed doc/ + chatx/ + sesix.txt    
"""
parsed_doc_path = '../1_parsing/parsed_doc/'
chat_folders = [chat_folder for chat_folder in os.listdir(parsed_doc_path) if not chat_folder.startswith('.')]

paths = []
for chat_folder in chat_folders:
    pattern = parsed_doc_path + chat_folder
    paths.append(pattern)
    
sesi_paths = []
for path in paths:
    for roots, dirs, files in os.walk(path):
        if "checkpoints" not in roots:
            for file in files:
                sesi_paths.append(roots + '/' + file)

In [3]:
## make list of dataframe for each chats session
sessions = list()
for sesi_path in sesi_paths:
    sesi = pd.read_csv(sesi_path, sep='|', names=['sender', 'messages', 'sentiment'])
    sesi.drop('sender', axis=1, inplace=True) ## drop sender column (since its not necessary)
    sessions.append(sesi)

In [4]:
sessions[0].head()

Unnamed: 0,messages,sentiment
0,"Assalamualaikum Pak [DOSEN], mohon maaf mengga...",1.0
1,"Wa'alaikumsalam Wr. Wb. Baik, saya bersedia. I...",1.0
2,"Baik bapak, terimakasih. Mohon maaf sebelumnya...",-1.0


### 2. lexical analysis

In [5]:
## lexical analysis - cleaning
def cleaning(message):
    """ This function will remove unnecessaries values in the messages i.e links, emails, and punctuations
    """
    link_reg = r'(https?:\/\/[^\s]+)|(www\.[^\s]+)|(meet\.google\.[^\s]+)|(bit\.ly[^\s]+)'
    email_reg = r'([a-zA-Z0-9\.\_\-]+@+[a-zA-Z0-9.]+)'
    punct_reg = r'[^a-zA-Z0-9\[\]]'
    numb_reg = r'\b[0-9]+\b\s*'
    
    message = re.sub(link_reg, '[LINK]', message) # -> link removed
    message = re.sub(email_reg, '', message) # -> email removed
    message = re.sub(punct_reg, ' ', message) # -> punctuation removed
    message = re.sub(numb_reg, '', message) # -> numbers removed
    
    return message

In [6]:
## lexical analysis - lower, except <MHS>, <DOSEN>, <LINK>
def lower(word_list):
    new_word_list = list()
    for word in word_list:
        if word != '[MHS]' and word != '[DOSEN]' and word != '[LINK]':
            word = word.lower()
        new_word_list.append(word)
    return new_word_list

In [7]:
## do lexical analysis
for sesi in sessions:
    for i, row in sesi.iterrows():
        message = row['messages']
        # do cleaning
        message = cleaning(message)
        # do strip
        message = message.strip()
        # do tokenization
        message = re.split('\s+', message)
        # do lower case
        message = lower(message)
        
        # replace current message value in df
        sesi.at[i, 'messages'] = message

In [9]:
sessions[0].head()

Unnamed: 0,messages,sentiment
0,"[assalamualaikum, pak, [DOSEN], mohon, maaf, m...",1.0
1,"[wa, alaikumsalam, wr, wb, baik, saya, bersedi...",1.0
2,"[baik, bapak, terimakasih, mohon, maaf, sebelu...",-1.0


### 3. normalization (slang word)

In [8]:
## normalize function
def normalize(slang_words, tokens):
    """ This function will normalize the tokens,
        it will turn the slang words or typos to its normal values.
        NOTE: you could add the values into json files.
    """
    new_tokens = list()
    for token in tokens:
        new_value = slang_words.get(token, token) #2nd parameter for default value if word's key not found
        new_tokens.append(new_value)
    
    return new_tokens

In [10]:
## open slang_words json
f = open('slang_words.json', 'r')
slang_words = json.load(f)
f.close()

## do normalize
for sesi in sessions:
    for i, row in sesi.iterrows():
        tokens = row['messages']
        # do normalize
        new_tokens = normalize(slang_words, tokens)

        #replace current message value in df
        sesi.at[i, 'messages'] = new_tokens

In [11]:
sessions[0].head()

Unnamed: 0,messages,sentiment
0,"[assalamualaikum, bapak, [DOSEN], mohon, maaf,...",1.0
1,"[wa, alaikumsalam, wr, wb, baik, saya, bersedi...",1.0
2,"[baik, bapak, terima kasih, mohon, maaf, sebel...",-1.0


### 4. filtering

In [12]:
## filtering function
def filtering(stopword_list, tokens):
    """ Filtering: removing stopwords from tokens.
        In this project, we will use tala stopwords list.
    """
    new_tokens = list()
    for token in tokens:
        if token not in stopword_list:
            new_tokens.append(token)
    
    return new_tokens

In [13]:
## open stopword list txt
f = open('tala_stopwords.txt', 'r')
stopword_list = f.read()
f.close()

## do normalize
for sesi in sessions:
    for i, row in sesi.iterrows():
        tokens = row['messages']
        # do filtering
        new_tokens = filtering(stopword_list, tokens)

        #replace current message value in df
        sesi.at[i, 'messages'] = new_tokens

In [14]:
sessions[0].head()

Unnamed: 0,messages,sentiment
0,"[[DOSEN], mohon, maaf, mengganggu, nama, [MHS]...",1.0
1,"[bersedia, topik, layak, posisinya, dimap, kaj...",1.0
2,"[terima kasih, mohon, maaf, mekanisme, bimbing...",-1.0


### 5. stemming

In [15]:
## stemming function, except [DOSEN], [MHS], [LINK]
def stemming(stemmer, tokens):
    """ Stemming: returns words to its original form.
        Since non-alphanumeric will be discarded by using StemmerFactory(),
        This function will do stemming if the token values neither [dosen] nor [mhs].
    """
    new_tokens = list()
    for token in tokens:
        if token != '[MHS]' and token != '[DOSEN]' and token != '[LINK]':
            token = stemmer.stem(token)
        new_tokens.append(token)
    
    return new_tokens

In [16]:
## create stemmer object
stemmer = StemmerFactory().create_stemmer()

## do normalize
for sesi in sessions:
    for i, row in sesi.iterrows():
        tokens = row['messages']
        # do filtering
        new_tokens = stemming(stemmer, tokens)

        # replace current message value in df
        sesi.at[i, 'messages'] = new_tokens

In [17]:
sessions[0].head()

Unnamed: 0,messages,sentiment
0,"[[DOSEN], mohon, maaf, ganggu, nama, [MHS], an...",1.0
1,"[sedia, topik, layak, posisi, map, kaji, pustaka]",1.0
2,"[terima kasih, mohon, maaf, mekanisme, bimbing...",-1.0


### 6. add session index for each chat, also make array of df into single df, using pandas concat

In [18]:
## add index session column
for i, sesi in enumerate(sessions):
    index = [i for j in range(sesi.shape[0])]
    sesi['session_index'] = index

## concat all df into single df
sessions_final = pd.concat(sessions, ignore_index=True)

### 7. drop empty preprocessing result's message

In [20]:
empty_index = list()
for i, row in sessions_final.iterrows():
    if not row['messages']:
        empty_index.append(i)

print(empty_index)

[24, 29, 53, 82, 116, 169, 196, 352, 388, 447, 451, 512, 569, 579, 625, 648, 756, 775, 783, 840, 873, 880, 982, 984, 1028, 1117, 1122, 1183, 1187, 1195, 1200, 1217, 1241, 1243, 1248, 1254, 1274, 1298, 1352, 1377, 1392, 1530, 1533, 1540]


In [21]:
sessions_final = sessions_final.drop(sessions_final.index[empty_index])

In [23]:
## check again the empty index
empty_index = list()
for i, row in sessions_final.iterrows():
    if not row['messages']:
        empty_index.append(i)

if not empty_index:
    print('no empty message!')

no empty message!


### 8. export to csv

In [24]:
sessions_final.to_csv('preprocessing_results.csv', index=False)