# Preparing Data

In [None]:
import pandas as pd 
import numpy as np
# Load the tweet spreadsheet into a DataFrame; the later steps read its
# 'Text' column and add one new column per preprocessing stage.
tweet = pd.read_excel("tweet translate.xlsx")

# Preview the first rows of the loaded data.
tweet.head()

# Case Folding

In [None]:
# ------ Case Folding --------
# Lower-case every tweet with the Series.str.lower() accessor and store the
# result in a new 'Text Case Folding' column; the 'Text' column is not modified.
# NOTE(review): empty cells in 'Text' presumably stay NaN here and would then
# break the plain-string cleaners applied later — confirm the input has none.
tweet['Text Case Folding'] = tweet['Text'].str.lower()


# Show the first five case-folded tweets.
print('Case Folding Result : \n')
print(tweet['Text Case Folding'].head(5))
print('\n\n\n')

# Tokenizing

In [None]:
# ------ Tokenizing ---------

import string 
import re #regex library

# import word_tokenize
from nltk.tokenize import word_tokenize 

def remove_tweet_special(text):
    """Strip Twitter-specific noise from one tweet string.

    Steps, in order:
      1. replace the literal two-character sequences backslash-t,
         backslash-n and backslash-u with a space, then drop any
         remaining backslash;
      2. replace every non-ASCII character with '?';
      3. remove @mentions, #hashtags and full URLs, collapsing the
         surrounding whitespace to single spaces;
      4. replace any leftover bare "http://" / "https://" with a space.

    Args:
        text: the tweet as a ``str``.

    Returns:
        The cleaned string.
    """
    # remove tab, new line, and backslash (as literal escape text, not control chars)
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # remove non ASCII (emoticon, chinese word, etc.); each becomes '?'
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag.
    # Raw string avoids invalid-escape warnings; \w (instead of [A-Za-z0-9])
    # also consumes underscores so "@john_doe" does not leave "_doe" behind.
    text = ' '.join(re.sub(r"([@#]\w+)|(\w+://\S+)", " ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
# Clean each case-folded tweet with remove_tweet_special, overwriting the column in place.
tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    """Return *text* with every run of digits deleted."""
    digit_runs = re.compile(r"\d+")
    return digit_runs.sub("", text)

# Delete digits from each tweet, overwriting the column in place.
tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    """Return *text* without any character listed in ``string.punctuation``."""
    # A translation table that maps each punctuation character to None deletes it.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)

# Strip punctuation from each tweet, overwriting the column in place.
tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    """Trim whitespace from both ends of *text* (LT = leading/trailing)."""
    trimmed = text.strip()
    return trimmed

# Trim leading/trailing whitespace from each tweet, overwriting the column in place.
tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in *text* into a single space.

    Args:
        text: the string to normalise.

    Returns:
        The string with each whitespace run (spaces, tabs, newlines)
        replaced by one ' '. Leading/trailing runs become a single space;
        they are not stripped.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r'\s+', ' ', text)

# Collapse repeated whitespace in each tweet, overwriting the column in place.
tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    """Delete every ASCII letter that stands alone between word boundaries.

    Only the letter is removed; the whitespace around it is kept.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Drop stand-alone single letters from each tweet, overwriting the column in place.
tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_singl_char)

# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Split *text* into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize the fully cleaned text; each row of the new 'Text Tokenizing'
# column holds whatever word_tokenize_wrapper returns for that tweet.
tweet['Text Tokenizing'] = tweet['Text Case Folding'].apply(word_tokenize_wrapper)

# Show the first rows of the tokenized column.
print('Tokenizing Result : \n') 
print(tweet['Text Tokenizing'].head())
print('\n\n\n')

# Normalization

In [None]:
# ------ Normalization ---------
# Lookup sheet read by position: the first column is used as the key and the
# second column as its replacement (presumably raw token -> standard form;
# confirm against the spreadsheet).
normalizad_word = pd.read_excel("kata_baku.xlsx")

normalizad_word_dict = {}

# itertuples yields plain positional tuples, so row[0] / row[1] are real
# positional accesses. The previous iterrows() version indexed a label-indexed
# Series with integers, which pandas has deprecated.
for row in normalizad_word.itertuples(index=False):
    # setdefault keeps the first replacement seen for a repeated key.
    normalizad_word_dict.setdefault(row[0], row[1])

def normalized_term(document):
    """Map each token of *document* through ``normalizad_word_dict``.

    Tokens that have an entry are replaced by the mapped value; all other
    tokens are passed through unchanged. Returns a new list.
    """
    lookup = normalizad_word_dict
    return [lookup.get(term, term) for term in document]

# Normalize every token list and store the result in a new 'Text Normalization' column.
tweet['Text Normalization'] = tweet['Text Tokenizing'].apply(normalized_term)

# Preview the first ten normalized token lists.
tweet['Text Normalization'].head(10)

# Stemming

In [None]:
# ------ Stemming ---------
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
import swifter


# stemmed
def stemmed_wrapper(term):
    """Return the stem of *term* as produced by the module-level Porter stemmer ``ps``."""
    stem = ps.stem(term)
    return stem

# Vocabulary table: one key per distinct token found in the normalized tweets.
# Each key starts with a ' ' placeholder and is filled with its stem below,
# so every distinct token is stemmed only once.
term_dict = {}

for document in tweet['Text Normalization']:
    for term in document:
        term_dict.setdefault(term, ' ')

print(len(term_dict))
print("------------------------")

# Replace each placeholder with the real stem and log the token/stem pair.
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(f"{term} : {term_dict[term]}")

print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    """Return the list of stems for *document* by looking each token up in ``term_dict``.

    Raises ``KeyError`` if a token has no entry in ``term_dict``.
    """
    return list(map(term_dict.__getitem__, document))

# Replace each normalized token by its precomputed stem from term_dict
# (run through the swifter accessor) and store the result in 'Text Stemming'.
tweet['Text Stemming'] = tweet['Text Normalization'].swifter.apply(get_stemmed_term)
print(tweet['Text Stemming'])

# Filtering

In [None]:
# ------ Filtering ---------
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get the English stop-word list
list_stopwords = stopwords.words('english')


# ---------------------------- manually add stopword  -----------------------------------
# append additional stopwords
list_stopwords.extend([
    'the', 'Aspiyuaja', 'ewopang', 'ombabikinoten', 'wkwkwk', 'jgdqcxnjo', 'rtqwsxznu',
    'pembslap', 'mnjdi', 'rt', '&amp',
    'allah', 'brb', 'btw', 'cod', 'cmiiw', 'fyi',
    'gg', 'ggwp', 'idk', 'ikr', 'lol', 'ootd', 'lmao', 'oot',
    'pap', 'otw', 'tfl', 'vc', 'ygy',
])

# ----------------------- add stopword from txt file ------------------------------------
# read txt stopword using pandas
txt_stopword = pd.read_csv("stopword inggris.txt", names= ["stopwords"], header = None)

# only the first row of the file is used: split it on single spaces and append
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert list to a set for fast membership tests.
# The filter is applied to tweet['Text Stemming'], i.e. to tokens that were
# already stemmed by `ps`, so the raw stop words alone miss forms such as
# "thi" (from "this") or "becaus" (from "because"). Add the stemmed form of
# every stop word, produced by the same stemmer, so those are removed too.
list_stopwords = set(list_stopwords).union(ps.stem(word) for word in list_stopwords)


# remove stopwords from the token list
def stopwords_removal(words):
    """Return the tokens of *words*, in order, that are not in ``list_stopwords``."""
    kept = []
    for word in words:
        if word in list_stopwords:
            continue
        kept.append(word)
    return kept

# Drop stop words from each token list and store the result in 'Text Filtering'.
# NOTE(review): the input is the stemmed column, so tokens are compared with
# list_stopwords in their stemmed form.
tweet['Text Filtering'] = tweet['Text Stemming'].apply(stopwords_removal) 


# Show the first rows of the filtered column.
print(tweet['Text Filtering'].head())

# View & Save Results

In [None]:
# Preview the DataFrame with all the columns added by the steps above.
tweet.head()

# Write every column to CSV without the row index.
# NOTE(review): the output file name has no ".csv" extension — confirm that is intended.
tweet.to_csv("preprocessing results", index=False)