# Prepare dataset

### IMDB

In [None]:
from tensorflow.keras.datasets import imdb
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Build a TF-IDF matrix from the Keras IMDb training reviews and pickle both
# the matrix and the fitted vectorizer.

# Vocabulary size, used both for the Keras loader (num_words) and for the
# TF-IDF vectorizer (max_features).
vocab_size = 20000

# Load the IMDb dataset (tokenized); only the training reviews are kept.
print("Loading IMDb dataset from Keras...")
(X_train_tokenized, _), (_, _) = imdb.load_data(num_words=vocab_size)

# Get the word index mapping. Word ranks are shifted by 3 because the loaded
# sequences keep indices 0-3 for the reserved tokens named below.
word_index = imdb.get_word_index()
index_to_word = {rank + 3: word for word, rank in word_index.items()}
index_to_word[0] = "[PAD]"
index_to_word[1] = "[START]"
index_to_word[2] = "[UNK]"
index_to_word[3] = "[UNUSED]"

# Convert tokenized reviews back to text, skipping the reserved indices.
# TfidfVectorizer lowercases and strips the brackets, so "[START]" and "[UNK]"
# would otherwise be counted as the ordinary words "start" and "unk": they
# would occupy vocabulary slots and "start" would be merged with the genuine
# word. Indices missing from the mapping are dropped for the same reason.
reserved_indices = {0, 1, 2, 3}
X_train_raw = [
    " ".join(
        index_to_word[i]
        for i in review
        if i not in reserved_indices and i in index_to_word
    )
    for review in X_train_tokenized
]

# Convert text into TF-IDF feature vectors
vectorizer_X = TfidfVectorizer(max_features=vocab_size, stop_words="english")
X_train = vectorizer_X.fit_transform(X_train_raw)

print("Vectorization complete.")
print("Shape of X_train:", X_train.shape)

# Persist the feature matrix and the fitted vectorizer.
with open("X.pickle", "wb") as f:
    pickle.dump(X_train, f)

with open("vectorizer_X.pickle", "wb") as f:
    pickle.dump(vectorizer_X, f)

### 1 Billion Word
#### The corpus can be downloaded from https://drive.google.com/file/d/1jF4_EKHQCV5zyc_Sd96iKbL_1mE6mm5Z/view?usp=sharing (the cell below reads it from `train_v2.txt` in the working directory)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import sys
import pickle
import re
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

print(sys.version)
# NOTE(review): min_frequency is not used by the code visible in this cell.
min_frequency = 1

# Read the whole corpus into memory and split it on "\n". The context manager
# closes the file even if reading or decoding raises.
with open("train_v2.txt", encoding='utf-8') as f:
    lines = f.read().split("\n")

# One document per line. NOTE(review): if the file ends with a newline, the
# split leaves a final empty string, which becomes an empty document.
sentences = list(lines)
print(len(sentences))

# Upper bound on the vocabulary size (passed as max_features to the vectorizer).
NUM_WORDS = 40000

import functools

# Translation table that deletes every character in string.punctuation;
# built once instead of once per document.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Matches runs of digits so they can be removed in one substitution.
_DIGIT_RUN = re.compile(r'\d+')


@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loaded on first use."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one document before it is tokenized by the vectorizer.

    Steps, in order: lowercase, delete punctuation, delete digit runs, split
    on whitespace, drop English stopwords, and re-join with single spaces.

    Args:
        text: The raw document string.

    Returns:
        The cleaned document as a single space-separated string (empty if no
        words survive).
    """
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove numbers
    text = _DIGIT_RUN.sub('', text)
    # Remove stopwords. The set is cached because this function is called
    # once per document.
    # NOTE(review): punctuation is stripped before this check, so stopword
    # entries that contain an apostrophe can no longer match their stripped
    # form (e.g. "dont") -- confirm whether that is intended.
    english_stopwords = _english_stopwords()
    words = [word for word in text.split() if word not in english_stopwords]
    # Join words back into a processed text
    return ' '.join(words)
    
# Build a binary bag-of-words matrix (1 if the term occurs in the sentence),
# keeping at most NUM_WORDS terms and cleaning each sentence with
# preprocess_text first.
vectorizer_X = CountVectorizer(
    preprocessor=preprocess_text,
    max_features=NUM_WORDS,
    binary=True,
)
X = vectorizer_X.fit_transform(sentences)
current_vocab_size = len(vectorizer_X.vocabulary_)
print(current_vocab_size)
print("vectorising completed")

# Persist the fitted vectorizer and the matrix. Context managers close the
# files even if pickling fails part-way.
# NOTE: the vectorizer holds a reference to preprocess_text, which pickle
# stores by name, so that function must be defined wherever this file is
# loaded. These are also the same filenames the IMDB cell writes to, so
# running both cells overwrites the earlier output.
with open("vectorizer_X.pickle", "wb") as f_vectorizer_X:
    pickle.dump(vectorizer_X, f_vectorizer_X, protocol=4)

with open("X.pickle", "wb") as f_X:
    pickle.dump(X, f_X, protocol=4)