## Import Libraries

In [4]:
# core
import pandas as pd
import numpy as np
from textblob import Word

# preprocessing
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
from nltk.tokenize import sent_tokenize

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# model building
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold

# model evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

# miscellaneous
from numpy import array
from scipy.sparse import csr_matrix, coo_matrix, hstack
from numpy import mean
from nltk.corpus import words
import time
import csv
import os
from sklearn.preprocessing import StandardScaler

## Prepare Contraction and Slang Words

In [6]:
# Set the paths to the contraction and slang-word lookup tables here.
# Each CSV is read without a header row, its first column becomes the index,
# and the result is turned into a dict (presumably word -> expansion; confirm
# against the CSV contents).
# The `squeeze` keyword of read_csv was deprecated in pandas 1.4 and removed in
# 2.0, so the remaining column is squeezed into a Series explicitly instead.

contraction_words = pd.read_csv('../input/contraction/contraction_words.csv', header=None, index_col=0).squeeze('columns').to_dict()
slang_words = pd.read_csv('../input/slangs/slang_words.csv', header=None, index_col=0).squeeze('columns').to_dict()

## Example Data

In [None]:
# Set the path to the reviews CSV here.
# The file is loaded into a DataFrame; a later cell reads its `review_body` column.

example_data = pd.read_csv('../input/reviews/example-data.csv')

## Custom Function

In [None]:
def data_labeling(helpful_vote, total_vote, min_votes=10, min_ratio=0.6):
    """Label a review as helpful ('bermanfaat') or not helpful ('tidak bermanfaat').

    A review is helpful only when it received at least ``min_votes`` votes in
    total and the share of helpful votes is strictly greater than ``min_ratio``.

    Args:
        helpful_vote: Number of helpful votes the review received.
        total_vote: Total number of votes the review received.
        min_votes: Minimum total votes required before the ratio is considered.
        min_ratio: Helpful-vote ratio that must be exceeded (strictly).

    Returns:
        'bermanfaat' or 'tidak bermanfaat'.
    """
    # The `<= 0` guard keeps the division safe if min_votes is lowered to 0.
    if total_vote < min_votes or total_vote <= 0:
        return 'tidak bermanfaat'
    if helpful_vote / total_vote > min_ratio:
        return 'bermanfaat'
    return 'tidak bermanfaat'

def second_cleansing(text):
    """Normalize a raw review into lowercase, letters-only text.

    Steps, in order: stringify and lowercase, blank out URLs and HTML
    tags/entities, expand contractions and then slang via ``expand_words``,
    replace every character that is not a-z or whitespace with a space, and
    collapse runs of two or more whitespace characters into one space.
    """
    cleaned = str(text).lower()

    # URLs first, then HTML tags and character entities.
    noise_patterns = (
        r"http\S+",
        r"www\S+",
        r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
    )
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    # Expansion must run before punctuation is stripped, otherwise apostrophes are gone.
    for mapping in (contraction_words, slang_words):
        cleaned = expand_words(cleaned, mapping)

    letters_only = re.sub(r'[^a-z\s]', ' ', cleaned)
    return re.sub(r'\s\s+', ' ', letters_only)

def expand_words(sentence, expansion_word):
    """Replace every whitespace-separated token that is a key of ``expansion_word``.

    Args:
        sentence: Text to process; it is split on whitespace.
        expansion_word: Dict mapping a token to its replacement text.

    Returns:
        The tokens joined by single spaces, with matching tokens replaced.
        Tokens must match a key exactly; no case or punctuation handling is done.
    """
    # A direct dict lookup replaces comparing each token against every key.
    return ' '.join(expansion_word.get(token, token) for token in sentence.split())

def stopword(sentence):
    """Remove English stop words from a whitespace-separated sentence.

    Args:
        sentence: Text to filter; it is split on whitespace.

    Returns:
        The remaining tokens joined by single spaces.
    """
    tokens = sentence.split()
    if not tokens:
        # No tokens: nothing to filter, and the corpus is not touched.
        return ''
    # Load the stop-word list once per call (not once per token) and use a
    # set so each membership test is a hash lookup.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(token for token in tokens if token not in english_stopwords)

# Shared WordNetLemmatizer instance, created once and used by lemmatizer() below.
wl = WordNetLemmatizer()
def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN

def lemmatizer(sentence):
    """Lemmatize every token of ``sentence`` using its part-of-speech tag.

    The sentence is tokenized with ``word_tokenize``, tagged with
    ``nltk.pos_tag``, and each token is lemmatized with the WordNet POS
    returned by ``get_wordnet_pos``. The lemmas are joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return ' '.join(lemmas)

# --------------- spelling correction ---------------
word_list = words.words()
# Set copy of word_list so word_check() does a hash lookup per token instead of
# scanning the whole list. word_list itself is kept as a list for other users.
_word_set = set(word_list)

def word_check(sentence):
    """Spell-correct the tokens of ``sentence`` that are not known words.

    Args:
        sentence: Text to process; it is split on whitespace.

    Returns:
        The tokens joined by single spaces. Tokens found in the word list are
        kept unchanged; every other token is replaced by the result of
        ``spelling_check``.
    """
    return ' '.join(
        token if token in _word_set else spelling_check(token)
        for token in sentence.split()
    )

def spelling_check(word):
    """Return a spelling suggestion for ``word``.

    The first entry returned by ``Word.spellcheck()`` is used. If its
    confidence is non-zero, its suggested word is returned; if the confidence
    is exactly 0, the word is handed to ``spelling_corrector`` instead.
    """
    candidate = Word(word)
    top_suggestion = candidate.spellcheck()[0]  # (suggested word, confidence)
    if top_suggestion[1] != 0:
        return top_suggestion[0]
    # Zero confidence: fall back to the elongated-word handling.
    return spelling_corrector(candidate)
        
def spelling_corrector(word):
    """Try to correct a word that contains elongated character runs.

    A run is the same character repeated three or more times. While the word
    still contains one, the first run is collapsed to a single character and
    the result is spell-checked; the first suggestion with non-zero confidence
    is returned. If every run is collapsed without a confident suggestion, the
    fully collapsed ``Word`` is returned. A word without any run is returned
    unchanged.
    """
    run_pattern = r'(.)\1{2,}'
    run = re.search(run_pattern, word)
    if run is None:
        return word

    current = word
    while run is not None:
        chars = list(current)
        # Drop all but the last character of the run.
        del chars[run.start():run.end() - 1]
        current = "".join(chars)

        collapsed = Word(current)
        top_suggestion = collapsed.spellcheck()[0]  # (suggested word, confidence)
        if top_suggestion[1] != 0:
            return top_suggestion[0]

        run = re.search(run_pattern, current)
    return collapsed
# --------------- spelling correction ---------------

def final_preprocess(sentence):
    """Run the full preprocessing pipeline on one review.

    Order: cleansing -> stop-word removal -> lemmatization -> spelling check.
    """
    cleaned = second_cleansing(sentence)
    without_stopwords = stopword(cleaned)
    lemmatized = lemmatizer(without_stopwords)
    return word_check(lemmatized)

## Data Preprocessing

In [None]:
# one review
# Demo: run second_cleansing on a single raw review that contains HTML
# entities, digits, a contraction and a URL. The return value is not stored.

second_cleansing(""" Very good, inexpensive brush! Bought 5 for my wife &amp; I. 4 lasted 20 months. On the last one &amp; ready to re-order. Can't beat the price https://www.youtube.com/watch?v=nVHP49g5IPQ. """)

In [1]:
# multiple review
# Demo: build a small DataFrame of raw reviews in a single 'teks' column and
# apply successive cleaning steps to it.
# NOTE(review): lowercase, remove_html and remove_noise are not defined in the
# visible part of this file — confirm they exist elsewhere before running this cell.

multiple_review = pd.DataFrame({'teks': [
    """ Very good, inexpensive brush! Bought 5 for my wife &amp; I. 4 lasted 20 months. On the last one &amp; ready to re-order. Can't beat the price. """, 
    """ AMMMAAAZZZIIINNNGGGGG Don't think twice just buy it. It's amazing how even after the first use what a difference it makes. I love this stuff :) """,
    """ Who pays 4 dollars more for a $20 gift card?<br />What store doesn't sell gift cards that the extra 4 dollars sounds like a good idea? """,
    """ Fun game, fast delivery.<br />No problems or complaints.  Nice aqnd fast delivery.  Game is in excellent condition.  Brand New I believe. """,
]})


# Each result applies one more step than the previous one, always starting from the raw text.
satu = multiple_review.teks.apply(lambda x: lowercase(x))
dua = multiple_review.teks.apply(lambda x: remove_html(lowercase(x)) )
tiga = multiple_review.teks.apply(lambda x: remove_noise(remove_html(lowercase(x))))

print(satu)
print(dua)
print(tiga)

In [5]:
# use dataset
# Run the full preprocessing pipeline on every row's review_body and store the
# result in the 'clean_review' column.

example_data.loc[:, 'clean_review'] = example_data.apply(
    lambda row: final_preprocess(row.review_body),
    axis=1,
)