## Read Dataset Preprocessing

In [19]:
import pandas as pd

def read_data(file_dir):
    df = pd.read_csv(file_dir)
    return df

In [20]:
df = read_data("dataset.csv")
df.head()

Unnamed: 0,objek wisata,ulasan,atraksi,amenitas,aksesibilitas
0,alun alun,"malem kelihatan kumuh & agak horror, karena b...",negatif,negatif,
1,alun alun,Pertama kali kesana itu cuaca kurang mendukung...,positif,positif,
2,alun alun,"Taman alun-alun yang bagus dan menyenangkan, c...",positif,,positif
3,alun alun,Merupakan taman kota yang cocok untuk mengajak...,positif,negatif,
4,alun alun,Alun-alun dengan banyak fasilitas yang bisa di...,,positif,


## Split Data Train & Data Test

In [21]:
import sklearn
from sklearn.model_selection import train_test_split

def split_data(df, test_size, random_state):
    col_name = df.columns[1]
    data = df
    data_train, data_test = train_test_split(data, test_size=test_size)
    return data_train, data_test

In [22]:
data_train, data_test = split_data(df, test_size= 0.2, random_state=42)

In [23]:
data_train.to_csv("data_train.csv")
data_test.to_csv("data_test.csv")

In [24]:
data_train = read_data("data_train.csv")
data_train.head()

Unnamed: 0.1,Unnamed: 0,objek wisata,ulasan,atraksi,amenitas,aksesibilitas
0,1997,waduk gondang,Bagus,,,
1,2733,wego,Karna sepi jadi kurang terurus,,negatif,
2,2773,wego,Cocok utk wisata anak2... Berwisata sambil bel...,positif,,
3,346,alun alun,Cocok untuk mengajak anak-anak bermain,positif,,
4,2211,wbl,salah satu taman hiburan di Kabupaten Lamongan...,positif,,


In [25]:
data_test = read_data("data_test.csv")
data_test.head()

Unnamed: 0.1,Unnamed: 0,objek wisata,ulasan,atraksi,amenitas,aksesibilitas
0,2889,gunung mas mantup,Tidak adanya tempat makan dan minum di sekitar...,,negatif,
1,2074,waduk gondang,"Akses jalan mulai bagus,perlu diperhatikan keb...",,negatif,positif
2,2852,wbl,Kami senang dengan kondisi jalan yang tidak ru...,,,positif
3,2837,wego,Papan petunjuk arah yang tidak jelas dan minim...,,,negatif
4,170,alun alun,Bermain bersama anak2.. sayangnya bianglala ti...,negatif,,


In [26]:
data_train.head()

Unnamed: 0.1,Unnamed: 0,objek wisata,ulasan,atraksi,amenitas,aksesibilitas
0,1997,waduk gondang,Bagus,,,
1,2733,wego,Karna sepi jadi kurang terurus,,negatif,
2,2773,wego,Cocok utk wisata anak2... Berwisata sambil bel...,positif,,
3,346,alun alun,Cocok untuk mengajak anak-anak bermain,positif,,
4,2211,wbl,salah satu taman hiburan di Kabupaten Lamongan...,positif,,


In [27]:
data_test.head()

Unnamed: 0.1,Unnamed: 0,objek wisata,ulasan,atraksi,amenitas,aksesibilitas
0,2889,gunung mas mantup,Tidak adanya tempat makan dan minum di sekitar...,,negatif,
1,2074,waduk gondang,"Akses jalan mulai bagus,perlu diperhatikan keb...",,negatif,positif
2,2852,wbl,Kami senang dengan kondisi jalan yang tidak ru...,,,positif
3,2837,wego,Papan petunjuk arah yang tidak jelas dan minim...,,,negatif
4,170,alun alun,Bermain bersama anak2.. sayangnya bianglala ti...,negatif,,


In [28]:
print(' len data train review_text  : ',len(data_train["ulasan"]))
print(' data train atraksi  : \n', data_train["atraksi"].value_counts())
print(' data train amenitas  : \n',data_train["amenitas"].value_counts())
print(' data train aksesibilitas  : \n',data_train["aksesibilitas"].value_counts())

 len data train review_text  :  2320
 data train atraksi  : 
 positif    1237
negatif     135
Name: atraksi, dtype: int64
 data train amenitas  : 
 positif    600
negatif    329
Name: amenitas, dtype: int64
 data train aksesibilitas  : 
 positif    164
negatif    111
Name: aksesibilitas, dtype: int64


In [29]:
print(' len data test review_text  : ',len(data_test["ulasan"]))
print(' data test atraksi  : \n', data_test["atraksi"].value_counts())
print(' data test amenitas  : \n',data_test["amenitas"].value_counts())
print(' data test aksesibilitas  : \n',data_test["aksesibilitas"].value_counts())

 len data test review_text  :  580
 data test atraksi  : 
 positif    323
negatif     30
Name: atraksi, dtype: int64
 data test amenitas  : 
 positif    126
negatif     70
Name: amenitas, dtype: int64
 data test aksesibilitas  : 
 positif    38
negatif    30
Name: aksesibilitas, dtype: int64


## Pemisahan Data dan Convert Label

In [30]:
import numpy

def convert_sentimen_label(df, nama_kolom):
    index_nan = list()
    df2 = df[['ulasan', nama_kolom]].copy()
    df2[nama_kolom].replace(str("positif"), 1, inplace=True)
    df2[nama_kolom].replace(str("negatif"), 0, inplace=True)
    df2[nama_kolom].replace("", float("NaN"), inplace=True)
    df2.dropna(subset=[nama_kolom], inplace=True)
    df2[nama_kolom] = df2[nama_kolom].astype(numpy.int64)
    
    return df2

def convert_aspek_label(df, nama_kolom):
    index_nan = list()
    df2 = df[['ulasan', nama_kolom]].copy()
    df2[nama_kolom].replace(float("NaN"), 0, inplace=True)
    df2[nama_kolom].replace(str("positif"), 1, inplace=True)
    df2[nama_kolom].replace(str("negatif"), 1, inplace=True)
    df2[nama_kolom] = df2[nama_kolom].astype(numpy.int64)
    
    return df2

In [31]:
#Konversi Label

nama_kolom1 = 'atraksi'
nama_kolom2 = 'amenitas'
nama_kolom3 = 'aksesibilitas'

train_a1 = convert_aspek_label(data_train, nama_kolom1)
train_s1 = convert_sentimen_label(data_train, nama_kolom1)
train_a2 = convert_aspek_label(data_train, nama_kolom2)
train_s2 = convert_sentimen_label(data_train, nama_kolom2)
train_a3 = convert_aspek_label(data_train, nama_kolom3)
train_s3 = convert_sentimen_label(data_train, nama_kolom3)

test_a1 = convert_aspek_label(data_test, nama_kolom1)
test_s1 = convert_sentimen_label(data_test, nama_kolom1)
test_a2 = convert_aspek_label(data_test, nama_kolom2)
test_s2 = convert_sentimen_label(data_test, nama_kolom2)
test_a3 = convert_aspek_label(data_test, nama_kolom3)
test_s3 = convert_sentimen_label(data_test, nama_kolom3)

In [32]:
x_train_a1, x_test_a1, y_train_a1, y_test_a1 = train_a1["ulasan"], test_a1["ulasan"], train_a1["atraksi"], test_a1["atraksi"]
x_train_a2, x_test_a2, y_train_a2, y_test_a2 = train_a2["ulasan"], test_a2["ulasan"], train_a2["amenitas"], test_a2["amenitas"]
x_train_a3, x_test_a3, y_train_a3, y_test_a3 = train_a3["ulasan"], test_a3["ulasan"], train_a3["aksesibilitas"], test_a3["aksesibilitas"]
x_train_s1, x_test_s1, y_train_s1, y_test_s1 = train_s1["ulasan"], test_s1["ulasan"], train_s1["atraksi"], test_s1["atraksi"]
x_train_s2, x_test_s2, y_train_s2, y_test_s2 = train_s2["ulasan"], test_s2["ulasan"], train_s2["amenitas"], test_s2["amenitas"]
x_train_s3, x_test_s3, y_train_s3, y_test_s3 = train_s3["ulasan"], test_s3["ulasan"], train_s3["aksesibilitas"], test_s3["aksesibilitas"]

##  Jumlah kemunculan setiap kelas pada data train sebelum ROS

In [35]:
print(' len data train aspek atraksi sebelum ROS  : ',len(train_a1["ulasan"]))
print('y_train_a1 : \n', y_train_a1.value_counts())
print(' len data train sentimen atraksi sebelum ROS  : ',len(train_s1["ulasan"]))
print('y_train_s1 : \n', y_train_s1.value_counts())
print(' len data train aspek amenitas sebelum ROS  : ',len(train_a2["ulasan"]))
print('y_train_a2 : \n', y_train_a2.value_counts())
print(' len data train sentimen amenitas sebelum ROS  : ',len(train_s2["ulasan"]))
print('y_train_s2 : \n', y_train_s2.value_counts())
print(' len data train aspek aksesibilitas sebelum ROS  : ',len(train_a3["ulasan"]))
print('y_train_a3 : \n', y_train_a3.value_counts())
print(' len data train sentimen amenitas sebelum ROS  : ',len(train_s3["ulasan"]))
print('y_train_s3 : \n', y_train_s3.value_counts())

 len data train aspek atraksi sebelum ROS  :  2320
y_train_a1 : 
 1    1372
0     948
Name: atraksi, dtype: int64
 len data train sentimen atraksi sebelum ROS  :  1372
y_train_s1 : 
 1    1237
0     135
Name: atraksi, dtype: int64
 len data train aspek amenitas sebelum ROS  :  2320
y_train_a2 : 
 0    1391
1     929
Name: amenitas, dtype: int64
 len data train sentimen amenitas sebelum ROS  :  929
y_train_s2 : 
 1    600
0    329
Name: amenitas, dtype: int64
 len data train aspek aksesibilitas sebelum ROS  :  2320
y_train_a3 : 
 0    2045
1     275
Name: aksesibilitas, dtype: int64
 len data train sentimen amenitas sebelum ROS  :  275
y_train_s3 : 
 1    164
0    111
Name: aksesibilitas, dtype: int64


In [36]:
print(' len data train aspek atraksi sebelum ROS  : ',len(test_a1["atraksi"]))
print('y_test_a1 : \n', y_test_a1.value_counts())
print(' len data train sentimen atraksi sebelum ROS  : ',len(test_s1["atraksi"]))
print('y_test_s1 : \n', y_test_s1.value_counts())
print(' len data train aspek amenitas sebelum ROS  : ',len(test_a2["amenitas"]))
print('y_test_a2 : \n', y_test_a2.value_counts())
print(' len data train sentimen amenitas sebelum ROS  : ',len(test_s2["amenitas"]))
print('y_test_s2 : \n', y_test_s2.value_counts())
print(' len data train aspek aksesibilitas sebelum ROS  : ',len(test_a3["aksesibilitas"]))
print('y_test_a3 : \n', y_test_a3.value_counts())
print(' len data train sentimen aksesibilitas sebelum ROS  : ',len(test_s3["aksesibilitas"]))
print('y_test_s3 : \n', y_test_s3.value_counts())

 len data train aspek atraksi sebelum ROS  :  580
y_test_a1 : 
 1    353
0    227
Name: atraksi, dtype: int64
 len data train sentimen atraksi sebelum ROS  :  353
y_test_s1 : 
 1    323
0     30
Name: atraksi, dtype: int64
 len data train aspek amenitas sebelum ROS  :  580
y_test_a2 : 
 0    384
1    196
Name: amenitas, dtype: int64
 len data train sentimen amenitas sebelum ROS  :  196
y_test_s2 : 
 1    126
0     70
Name: amenitas, dtype: int64
 len data train aspek aksesibilitas sebelum ROS  :  580
y_test_a3 : 
 0    512
1     68
Name: aksesibilitas, dtype: int64
 len data train sentimen aksesibilitas sebelum ROS  :  68
y_test_s3 : 
 1    38
0    30
Name: aksesibilitas, dtype: int64


## Simpan Data Train Sebelum ROS

In [37]:
#Dokumentasi Hasil split data
import pickle

def save_x_train(list_data, f_name):
    pickle.dump(list_data, open("x_train before prepro/" + f_name + ".pickle", "wb"))

save_x_train(x_train_a1, 'x_train_a1')
save_x_train(x_train_a2, 'x_train_a2')
save_x_train(x_train_a3, 'x_train_a3')

save_x_train(x_train_s1, 'x_train_s1')
save_x_train(x_train_s2, 'x_train_s2')
save_x_train(x_train_s3, 'x_train_s3')

#--------------------------------------

def save_y_train(list_data, f_name):
    pickle.dump(list_data, open("y_train/" + f_name + ".pickle", "wb"))

save_y_train(y_train_a1, 'y_train_a1')
save_y_train(y_train_a2, 'y_train_a2')
save_y_train(y_train_a3, 'y_train_a3')

save_y_train(y_train_s1, 'y_train_s1')
save_y_train(y_train_s2, 'y_train_s2')
save_y_train(y_train_s3, 'y_train_s3')

In [38]:
#Dokumentasi Hasil split data
import pickle

def save_x_test(list_data, f_name):
    pickle.dump(list_data, open("x_test before prepro/" + f_name + ".pickle", "wb"))

save_x_test(x_test_a1, 'x_test_a1')
save_x_test(x_test_a2, 'x_test_a2')
save_x_test(x_test_a3, 'x_test_a3')

save_x_test(x_test_s1, 'x_test_s1')
save_x_test(x_test_s2, 'x_test_s2')
save_x_test(x_test_s3, 'x_test_s3')

#--------------------------------------

def save_y_test(list_data, f_name):
    pickle.dump(list_data, open("y_test/" + f_name + ".pickle", "wb"))

save_y_test(y_test_a1, 'y_test_a1')
save_y_test(y_test_a2, 'y_test_a2')
save_y_test(y_test_a3, 'y_test_a3')

save_y_test(y_test_s1, 'y_test_s1')
save_y_test(y_test_s2, 'y_test_s2')
save_y_test(y_test_s3, 'y_test_s3')