In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3

In [2]:
# Read the whole `tabel_cleansed` table from the local SQLite file into a DataFrame.
# NOTE(review): the connection is never closed in the visible cells — confirm
# whether later cells still need `db` before adding a close().

q_data = 'SELECT * FROM tabel_cleansed'
db = sqlite3.connect('database.db', check_same_thread=False)
data = pd.read_sql_query(sql=q_data, con=db)
data.head()

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [3]:
# Check the labels: number of rows for each distinct value in the `label` column.

label_counts = data['label'].value_counts()
label_counts

label
positive    6383
negative    3412
neutral     1138
Name: count, dtype: int64

## Feature-Label Classification

In [3]:
# Split the data by the three sentiment labels counted above.
#
# Each sentiment is filtered exactly once and both the text and label lists are
# taken from that same subset, so the two lists of a sentiment are always
# aligned row for row. Columns are read with bracket indexing rather than
# attribute access so a column name can never collide with a DataFrame attribute.
# Rows whose label is not one of these three values end up in no group.

positive_rows = data.loc[data['label'] == 'positive']
negative_rows = data.loc[data['label'] == 'negative']
neutral_rows = data.loc[data['label'] == 'neutral']

# Group data text

positif_text = positive_rows['text'].tolist()
negatif_text = negative_rows['text'].tolist()
neutral_text = neutral_rows['text'].tolist()

# Group data label

positif_label = positive_rows['label'].tolist()
negatif_label = negative_rows['label'].tolist()
neutral_label = neutral_rows['label'].tolist()

In [4]:
# Final data check: concatenate the per-sentiment lists (positive, negative,
# neutral — the same order for texts and labels) and report their sizes.

labels = positif_label + negatif_label + neutral_label
total_data = positif_text + negatif_text + neutral_text

sentiment_sizes = tuple(len(group) for group in (positif_text, negatif_text, neutral_text))
print("Positive: %s, Negative: %s, Neutral: %s" % sentiment_sizes)
print("Total data: %s" % len(total_data))
print("Total labels: %s" % len(labels))

Positive: 6383, Negative: 3412, Neutral: 1138
Total data: 10933
Total labels: 10933


## Feature Extraction

In [5]:
# Tokenizer and pad_sequences
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict

# Value handed to the tokenizer as `num_words`.
max_features = 100000

# Fit a lower-casing, space-splitting tokenizer on every text, then persist it
# so the same word index can be reloaded outside this notebook.
tokenizer = Tokenizer(num_words=max_features, split=' ', lower=True)
tokenizer.fit_on_texts(total_data)
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("tokenizer.pickle has created!")

# Apply the tokenizer: turn each text into a sequence of integer word indices.
X = tokenizer.texts_to_sequences(total_data)
vocab_size = len(tokenizer.word_index)  # number of distinct words seen during fitting
maxlen = max(map(len, X))               # length of the longest encoded text

# Apply pad_sequences with its default arguments and persist the result.
# NOTE(review): no explicit maxlen is passed, so the padded width is whatever
# Keras chooses by default — presumably the longest sequence; confirm against the docs.
X = pad_sequences(X)
with open('x_pad_sequences.pickle', 'wb') as handle:
    pickle.dump(X, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("x_pad_sequences.pickle has created!")


tokenizer.pickle has created!
x_pad_sequences.pickle has created!


In [6]:
# Feature extraction for the labels: expand the label list into indicator
# columns (one per distinct label) and keep only the underlying array.
# NOTE(review): column order and element dtype are whatever pandas.get_dummies
# produces by default; the label-to-column mapping is not saved with the
# array — confirm downstream code recovers it consistently.

Y = pd.get_dummies(labels).to_numpy()

with open('y_labels.pickle', 'wb') as handle:
    pickle.dump(Y, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("y_labels.pickle has created!")

y_labels.pickle has created!
