## 2. Feature Engineering

### 2.1 Importing the Libraries

In [20]:
import pickle
import numpy as np 
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import RandomOverSampler

### 2.2 Load the Dataset

In [21]:
# Load the preprocessed train/validation/test splits produced by the previous step.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that this project generated itself.
_PREPROCESSED_DIR = "../dataset/processed/01_after_preprocessing"


def _load_pickle(name):
    """Return the object stored in `<_PREPROCESSED_DIR>/<name>.pkl`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read, even if unpickling raises.
    """
    with open(f"{_PREPROCESSED_DIR}/{name}.pkl", "rb") as fh:
        return pickle.load(fh)


# Training
X_train = _load_pickle("X_train")
y_train = _load_pickle("y_train")

# Validation
X_val = _load_pickle("X_val")
y_val = _load_pickle("y_val")

# Test
X_test = _load_pickle("X_test")
y_test = _load_pickle("y_test")

### 2.3 Define the Parameters

In [22]:
# Directory where the engineered features and fitted tools are written.
SAVE_DIR = "../dataset/processed/02_after_FE"

# `max_features` passed to the TfidfVectorizer.
TFIDF_FEATURES = 5_000

# `num_words` passed to the Keras Tokenizer.
VOCAB_SIZE = 5_000

# `maxlen` passed to pad_sequences for every split.
MAX_LEN = 200

### 2.4 Fit Tools (TF-IDF & Tokenizer)

In [23]:
# Both tools are fitted on the training texts only; the validation and test
# splits are never passed to a fit call in this cell.

# Word-index tokenizer used to build the integer sequences.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)

# TF-IDF vectorizer; fit() returns the fitted estimator itself.
tfidf = TfidfVectorizer(max_features=TFIDF_FEATURES).fit(X_train)

### 2.5 Transform Validation and Test Data

In [24]:
# Apply the already-fitted tools to the validation and test splits.

# TF-IDF features, converted from sparse matrices to dense arrays.
X_val_tfidf = tfidf.transform(X_val).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

# Integer token sequences from the fitted tokenizer.
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Sequences brought to a common length of MAX_LEN.
X_val_pad = pad_sequences(X_val_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

### 2.6 Class Balancing and Final Transformation

#### 2.6.1 Random Oversampling of the Raw Data

In [25]:
# Balance the classes by oversampling the raw training texts (seeded for
# reproducibility) before they are converted to numeric features.
ros = RandomOverSampler(random_state=42)

# Present the texts as a single-column (n, 1) array for the sampler.
X_train_reshaped = np.reshape(np.array(X_train), (-1, 1))
resampled_column, y_train_resampled = ros.fit_resample(X_train_reshaped, y_train)

# Collapse the single column back into a 1-D array of texts.
X_train_text_resampled = resampled_column.flatten()

In [26]:
# Convert the resampled texts into a plain Python list.
# np.asarray(...).ravel() accepts an (n, 1) array, a flat array, or a list, so
# this cell can be re-run safely: the previous `.flatten()` call raised
# AttributeError on a second run because the name had been rebound to a list.
# dtype=object keeps every element as the original Python object.
X_train_text_resampled = np.asarray(X_train_text_resampled, dtype=object).ravel().tolist()

#### 2.6.2 Converting Back to Numeric Features

In [27]:
# Turn the oversampled training texts into the same two representations
# that were built for the validation and test splits.

# TF-IDF (vectorization): sparse result converted to a dense array.
train_tfidf_sparse = tfidf.transform(X_train_text_resampled)
X_train_tfidf_final = train_tfidf_sparse.toarray()

# Sequence (padding): token ids brought to a common length of MAX_LEN.
X_train_seq_final = tokenizer.texts_to_sequences(X_train_text_resampled)
X_train_pad_final = pad_sequences(X_train_seq_final, maxlen=MAX_LEN)

### 2.7 Saving All Datasets and Tools

In [28]:
def _dump_pickle(obj, filename):
    """Pickle `obj` to `SAVE_DIR/filename`.

    The file is opened in a `with` block so it is flushed and closed before
    the next artifact is written, instead of relying on garbage collection.
    """
    with open(os.path.join(SAVE_DIR, filename), "wb") as fh:
        pickle.dump(obj, fh)


os.makedirs(SAVE_DIR, exist_ok=True)

# Output filename -> object to persist.
artifacts = {
    # Fitted tools
    "tfidf_vectorizer.pkl": tfidf,
    "tokenizer.pkl": tokenizer,
    # Training split (oversampled)
    "X_train_tfidf.pkl": X_train_tfidf_final,
    "X_train_pad.pkl": X_train_pad_final,
    "y_train.pkl": y_train_resampled,
    # Test split
    "X_test_tfidf.pkl": X_test_tfidf,
    "X_test_pad.pkl": X_test_pad,
    "y_test.pkl": y_test,
    # Validation split
    "X_val_tfidf.pkl": X_val_tfidf,
    "X_val_pad.pkl": X_val_pad,
    "y_val.pkl": y_val,
}

for filename, obj in artifacts.items():
    _dump_pickle(obj, filename)

print("Done!")

Done!
