In [1]:
import pandas as pd
import numpy as np
import json
import re
import os
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

### 1. parsing dokumen

In [2]:
## read each chats session folder path
""" 1. list folder chatx DONE
    2. list file dalam folder
    
    EXPECTED:
    current dir + /parsed doc/ + chatx/ + sesix.txt    
"""
parsed_doc_path = '../parsing/parsed_doc/'
chat_folders = [chat_folder for chat_folder in os.listdir(parsed_doc_path) if not chat_folder.startswith('.')]

paths = []
for chat_folder in chat_folders:
    pattern = parsed_doc_path + chat_folder
    paths.append(pattern)
    
sesi_paths = []
for path in paths:
    for roots, dirs, files in os.walk(path):
        if "checkpoints" not in roots:
            for file in files:
                sesi_paths.append(roots + '/' + file)

In [9]:
## make list of dataframe for each chats session
sessions = list()
for sesi_path in sesi_paths:
    sesi = pd.read_csv(sesi_path, sep='|', names=['sender', 'messages', 'sentiment'])
    sesi.drop('sender', axis=1, inplace=True) ## drop sender column (since its not necessary)
    sessions.append(sesi)

In [10]:
sessions[0].head()

Unnamed: 0,messages,sentiment
0,"Assalamualaikum Wr. Wb. Pak, selamat pagi, moh...",1
1,Wa'alaykumsalam. Via wa ini sj ya,1
2,"Baik Pak. Perihal waktu bimbingannya sendiri, ...",1
3,Biasanya hari Kamis atau Jumat,1
4,"Baik Pak. Lalu untuk pelaksanaan P0, apakah te...",1


### 2. lexical analysis

In [7]:
## lexical analysis - cleaning
def cleaning(message):
    """ This function will remove unnecessaries values in the messages i.e links, emails, and punctuations
    """
    link_reg = r'(https?:\/\/[^\s]+)|(www\.[^\s]+)|(meet\.google\.[^\s]+)|(bit\.ly[^\s]+)'
    email_reg = r'([a-zA-Z0-9\.\_\-]+@+[a-zA-Z0-9.]+)'
    punct_reg = r'[^a-zA-Z0-9\[\]]'
    numb_reg = r'\b[0-9]+\b\s*'
    
    message = re.sub(link_reg, '', message) # -> link removed
    message = re.sub(email_reg, '', message) # -> email removed
    message = re.sub(punct_reg, ' ', message) # -> punctuation removed
    message = re.sub(numb_reg, '', message) # -> numbers removed
    
    return message

In [8]:
## lexical analysis - tokenization
def tokenize(message, token_list):
    """ This function will turns all sentences into token or word.
        Each messages will saved in a list.
    """
    word_list = re.split('\s+', message)
    for word in word_list:
        if word not in token_list:
            token_list.append(word)
    
    return word_list, token_list

In [11]:
## do lexical analysis
for sesi in sessions:
    for i, row in sesi.iterrows():
        message = row['messages']
        # do cleaning
        message = cleaning(message)
        # do lower case
        message = message.lower()
        # do tokenization
        message = re.split('\s+', message)

        #replace current message value in df
        sesi.at[i, 'messages'] = message

In [12]:
sessions[0].head()

Unnamed: 0,messages,sentiment
0,"[assalamualaikum, wr, wb, pak, selamat, pagi, ...",1
1,"[wa, alaykumsalam, via, wa, ini, sj, ya]",1
2,"[baik, pak, perihal, waktu, bimbingannya, send...",1
3,"[biasanya, hari, kamis, atau, jumat]",1
4,"[baik, pak, lalu, untuk, pelaksanaan, p0, apak...",1


### 3. normalization (slang word)

In [13]:
## normalize function
def normalize(slang_words, tokens):
    """ This function will normalize the tokens,
        it will turn the slang words or typos to its normal values.
        NOTE: you could add the values into json files.
    """
    new_tokens = list()
    for token in tokens:
        new_value = slang_words.get(token, token) #2nd parameter for default value if word's key not found
        new_tokens.append(new_value)
    
    return new_tokens

In [14]:
## open slang_words json
f = open('slang_words.json', 'r')
slang_words = json.load(f)
f.close()

## do normalize
for sesi in sessions:
    for i, row in sesi.iterrows():
        tokens = row['messages']
        # do normalize
        new_tokens = normalize(slang_words, tokens)

        #replace current message value in df
        sesi.at[i, 'messages'] = new_tokens

In [15]:
sessions[0].head()

Unnamed: 0,messages,sentiment
0,"[assalamualaikum, wr, wb, bapak, selamat, pagi...",1
1,"[wa, alaykumsalam, via, wa, ini, saja, iya]",1
2,"[baik, bapak, perihal, waktu, bimbingannya, se...",1
3,"[biasanya, hari, kamis, atau, jumat]",1
4,"[baik, bapak, lalu, untuk, pelaksanaan, p0, ap...",1


### 4. filtering

In [16]:
## filtering function
def filtering(stopword_list, tokens):
    """ Filtering: removing stopwords from tokens.
        In this project, we will use tala stopwords list.
    """
    new_tokens = list()
    for token in tokens:
        if token not in stopword_list:
            new_tokens.append(token)
    
    return new_tokens

In [17]:
## open stopword list txt
f = open('tala_stopwords.txt', 'r')
stopword_list = f.read()
f.close()

## do normalize
for sesi in sessions:
    for i, row in sesi.iterrows():
        tokens = row['messages']
        # do filtering
        new_tokens = filtering(stopword_list, tokens)

        #replace current message value in df
        sesi.at[i, 'messages'] = new_tokens

In [18]:
sessions[0].head()

Unnamed: 0,messages,sentiment
0,"[selamat, pagi, mohon, maaf, mengganggu, mahas...",1
1,"[alaykumsalam, via, iya]",1
2,"[perihal, bimbingannya, ketentuan, jam]",1
3,"[kamis, jumat]",1
4,"[pelaksanaan, p0, ditentukan, dibahas, bimbing...",1


### 5. stemming

In [19]:
## stemming function
def stemming(stemmer, tokens):
    """ Stemming: returns words to its original form.
        Since non-alphanumeric will be discarded by using StemmerFactory(),
        This function will do stemming if the token values neither [dosen] nor [mhs].
    """
    new_tokens = list()
    for token in tokens:
        new_value = stemmer.stem(token)
        new_tokens.append(new_value)
    
    return new_tokens

In [20]:
## create stemmer object
stemmer = StemmerFactory().create_stemmer()

## do normalize
for sesi in sessions:
    for i, row in sesi.iterrows():
        tokens = row['messages']
        # do filtering
        new_tokens = stemming(stemmer, tokens)

        #replace current message value in df
        sesi.at[i, 'messages'] = new_tokens

In [21]:
sessions[0].head()

Unnamed: 0,messages,sentiment
0,"[selamat, pagi, mohon, maaf, ganggu, mahasiswa...",1
1,"[alaykumsalam, via, iya]",1
2,"[perihal, bimbing, tentu, jam]",1
3,"[kamis, jumat]",1
4,"[laksana, p0, tentu, bahas, bimbing, iya]",1


### 6. add session index for each chat, also make array of df into single df, using pandas concat

In [29]:
## add index session column
for i, sesi in enumerate(sessions):
    index = [i for j in range(sesi.shape[0])]
    sesi['session_index'] = index

## concat all df into single df
sessions_final = pd.concat(sessions, ignore_index=True)

### 7. export to csv

In [33]:
sessions_final.to_csv('preprocessing_3_29.csv', index=False)