## Import Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Data Loading

In [2]:
dataset = pd.read_excel("data/dataset.xlsx")

## Data Preparation

### Data Assessing

#### Pengecekan Review Duplikat

In [3]:
print(f"Jumlah Duplikasi Data: {dataset.duplicated().sum()}")

Jumlah Duplikasi Data: 10


### Data Cleaning

#### Penghapusan Data Duplikat

In [4]:
dataset.drop_duplicates(inplace=True)

In [5]:
print(f"Jumlah Duplikasi Data: {dataset.duplicated().sum()}")

Jumlah Duplikasi Data: 0


### Pemisahan Review Menurut Aspek

#### Attractions

In [6]:
attractions = dataset[["attractions_text", "attractions"]]
attractions.columns = ["text", "label"]

mask_none = attractions["label"].str.lower() == "none"
attractions.loc[mask_none, "text"] = dataset.loc[mask_none, "review"]
                                                 
attractions["label"] = attractions["label"].str.replace("(?i)^netral$", "neutral", regex=True)
attractions = attractions.dropna()
attractions

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attractions.loc[mask_none, "text"] = dataset.loc[mask_none, "review"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attractions["label"] = attractions["label"].str.replace("(?i)^netral$", "neutral", regex=True)


Unnamed: 0,text,label
0,Suka Bali - area keluarga yang luar biasa_x000...,none
1,Anda dapat berlari dan menonton matahari terbi...,positive
2,Lokasi paling pas liat sunrise..harus datang s...,positive
3,"Tempatnya bagus, selalu ramai oleh pengunjung....",positive
4,pantai sanur banyak menyimpan misteri dan sang...,positive
...,...,...
3616,Pantai losari adalah salah itu icon di kota ma...,none
3617,Salah satu tempat pilihan bila berkunjung Ke M...,none
3618,Salah satu objek wisata yang ada di kota makas...,none
3619,"Lumayan bersih, asik untuk dikelilingi saat ma...",positive


#### Amenities

In [7]:
amenities = dataset[["amenities_text", "amenities"]]
amenities.columns = ["text", "label"]

mask_none = amenities["label"].str.lower() == "none"
amenities.loc[mask_none, "text"] = dataset.loc[mask_none, "review"]

amenities["label"] = amenities["label"].str.replace("(?i)^netral$", "neutral", regex=True)
amenities = amenities.dropna()
amenities

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amenities.loc[mask_none, "text"] = dataset.loc[mask_none, "review"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amenities["label"] = amenities["label"].str.replace("(?i)^netral$", "neutral", regex=True)


Unnamed: 0,text,label
0,Suka Bali - area keluarga yang luar biasa_x000...,none
1,Setelah Anda berjalan anda dapat menikmati hid...,neutral
2,Lokasi paling pas liat sunrise..harus datang s...,none
3,"Tempatnya bagus, selalu ramai oleh pengunjung....",none
4,pantai sanur banyak menyimpan misteri dan sang...,none
...,...,...
3616,Di pantai losari banyak terdapat patung patung...,positive
3617,Salah satu tempat pilihan bila berkunjung Ke M...,none
3618,Salah satu objek wisata yang ada di kota makas...,none
3619,"Lumayan bersih, asik untuk dikelilingi saat ma...",positive


#### Access

In [8]:
access = dataset[["access_text", "access"]]
access.columns = ["text", "label"]

mask_none = access["label"].str.lower() == "none"
access.loc[mask_none, "text"] = dataset.loc[mask_none, "review"]

access["label"] = access["label"].str.replace("(?i)^netral$", "neutral", regex=True)
access = access.dropna()
access

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  access.loc[mask_none, "text"] = dataset.loc[mask_none, "review"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  access["label"] = access["label"].str.replace("(?i)^netral$", "neutral", regex=True)


Unnamed: 0,text,label
0,Suka Bali - area keluarga yang luar biasa_x000...,none
1,Saya selalu berjalan di Pantai Sanur pada hari...,none
2,Lokasi paling pas liat sunrise..harus datang s...,none
3,"Tempatnya bagus, selalu ramai oleh pengunjung....",none
4,pantai sanur banyak menyimpan misteri dan sang...,none
...,...,...
3616,Di pantai losari banyak terdapat patung patung...,positive
3617,Salah satu tempat pilihan bila berkunjung Ke M...,none
3618,Salah satu objek wisata yang ada di kota makas...,none
3619,"Lumayan bersih, asik untuk dikelilingi saat ma...",none


#### Price

In [9]:
price = dataset[["price_text", "price"]]
price.columns = ["text", "label"]

mask_none = price["label"].str.lower() == "none"
price.loc[mask_none, "text"] = dataset.loc[mask_none, "review"]

price["label"] = price["label"].str.replace("(?i)^netral$", "neutral", regex=True)
price = price.dropna()
price

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price.loc[mask_none, "text"] = dataset.loc[mask_none, "review"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price["label"] = price["label"].str.replace("(?i)^netral$", "neutral", regex=True)


Unnamed: 0,text,label
0,Suka Bali - area keluarga yang luar biasa_x000...,none
1,Saya selalu berjalan di Pantai Sanur pada hari...,none
2,Lokasi paling pas liat sunrise..harus datang s...,positive
3,"Tempatnya bagus, selalu ramai oleh pengunjung....",none
4,pantai sanur banyak menyimpan misteri dan sang...,none
...,...,...
3616,Pantai losari adalah salah itu icon di kota ma...,none
3617,Salah satu tempat pilihan bila berkunjung Ke M...,none
3618,Salah satu objek wisata yang ada di kota makas...,none
3619,"Lumayan bersih, asik untuk dikelilingi saat ma...",positive


#### No Aspect

In [10]:
no_aspect = dataset[["no_aspect_text", "no_aspect"]]
no_aspect.columns = ["text", "label"]

mask_none = no_aspect["label"].str.lower() == "none"
no_aspect.loc[mask_none, "text"] = dataset.loc[mask_none, "review"]

no_aspect["label"] = no_aspect["label"].str.replace("(?i)^netral$", "neutral", regex=True)
no_aspect = no_aspect.dropna()
no_aspect

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_aspect.loc[mask_none, "text"] = dataset.loc[mask_none, "review"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_aspect["label"] = no_aspect["label"].str.replace("(?i)^netral$", "neutral", regex=True)


Unnamed: 0,text,label
0,Suka Bali - area keluarga yang luar biasa menc...,neutral
1,Saya selalu berjalan di Pantai Sanur pada hari...,positive
2,Lokasi paling pas liat sunrise..harus datang s...,none
3,"Tempatnya bagus, selalu ramai oleh pengunjung....",none
4,pantai sanur banyak menyimpan misteri dan sang...,none
...,...,...
3616,Pantai losari adalah salah itu icon di kota ma...,none
3617,Salah satu tempat pilihan bila berkunjung Ke M...,positive
3618,Salah satu objek wisata yang ada di kota makas...,none
3619,"Lumayan bersih, asik untuk dikelilingi saat ma...",none


### Text Preprocessing

In [11]:
def cleaning_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka

    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
    return text

def casefolding_text(text): # Mengubah semua karakter dalam teks menjadi huruf kecil
    text = text.lower()
    return text

def tokenizing_text(text): # Memecah atau membagi string, teks menjadi daftar token
    text = word_tokenize(text)
    return text

def filtering_text(text): # Menghapus stopwords dalam teks
    listStopwords = set(stopwords.words('indonesian')) # Menggunakan stopwords bahasa Indonesia
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def to_sentence(list_words): # Mengubah daftar kata menjadi kalimat
    sentence = ' '.join(word for word in list_words)
    return sentence

In [12]:
def clean_dataframe(clean_df):
    clean_df['text_clean'] = clean_df['text'].apply(cleaning_text)

    # Mengubah huruf dalam teks menjadi huruf kecil
    clean_df['text_casefolding'] = clean_df['text_clean'].apply(casefolding_text)

    # Memecah teks menjadi token (kata-kata)
    clean_df['text_tokenizing'] = clean_df['text_casefolding'].apply(tokenizing_text)

    # Menghapus kata-kata stop (kata-kata umum)
    clean_df['text_stopword'] = clean_df['text_tokenizing'].apply(filtering_text)

    # Menggabungkan token-token menjadi kalimat
    clean_df['final_text'] = clean_df['text_stopword'].apply(to_sentence)

    return clean_df

In [13]:
clean_attractions = clean_dataframe(attractions.copy())
clean_amenities = clean_dataframe(amenities.copy())
clean_access = clean_dataframe(access.copy())
clean_price = clean_dataframe(price.copy())
clean_no_aspect = clean_dataframe(no_aspect.copy())

In [14]:
clean_attractions.to_excel("data/clean_attractions.xlsx", index=False)
clean_amenities.to_excel("data/clean_amenities.xlsx", index=False)
clean_access.to_excel("data/clean_access.xlsx", index=False)
clean_price.to_excel("data/clean_price.xlsx", index=False)
clean_no_aspect.to_excel("data/clean_no_aspect.xlsx", index=False)