In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from gensim.models import FastText
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense

In [3]:
# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("wordnet")

# Define stopwords and lemmatizer
en_stop = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     An existing connection was forcibly closed by the
[nltk_data]     remote host>
[nltk_data] Downloading package wordnet to C:\Users\eng
[nltk_data]     abdulrhman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# ------------------ LOAD DATA ------------------
df = pd.read_csv(r"C:\Users\eng abdulrhman\Downloads\IMDB Dataset.csv\IMDB Dataset.csv")
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
texts = df['review'].values
labels = df['sentiment'].values

print(texts)
print(labels)

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

In [5]:
# ------------------ TEXT CLEANING ------------------
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove numbers and special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces

    words = text.split()  # Word tokenization
    words = [lemmatizer.lemmatize(word) for word in words if len(word) > 3 and word not in en_stop]
    return words  # Return list of words (FastText expects list format)


In [6]:
# Apply preprocessing
tokenized_texts = [preprocess_text(text) for text in texts]

In [7]:
# ------------------- 3. Train FastText -------------------
embedding_dim = 100
# Step 1: Train FastText Model on IMDB Data
fasttext_model = FastText(vector_size=embedding_dim, window=5, min_count=5, workers=4, sg=1, epochs=10)
fasttext_model.build_vocab(corpus_iterable=tokenized_texts)
fasttext_model.train(corpus_iterable=tokenized_texts, total_examples=len(tokenized_texts), epochs=10)
fasttext_model.save("IMDB_fasttext.model")
print("Model trained and saved successfully.")




Model trained and saved successfully.


In [8]:
# ------------------- 4. Prepare Embedding Matrix -------------------
max_len = 200
def sentence_to_embedding_matrix(tokens):
    matrix = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            matrix.append(fasttext_model.wv[word] if word in fasttext_model.wv else np.zeros(embedding_dim))
        else:
            matrix.append(np.zeros(embedding_dim))  # padding
    return matrix

X = np.array([sentence_to_embedding_matrix(sentence) for sentence in tokenized_texts])
y = np.array(labels)

In [9]:
print(X.shape)
print(y.shape)

(50000, 200, 100)
(50000,)


In [10]:
def cnn_model(kernel_sizes):
    input_layer = Input(shape=(max_len, embedding_dim))
    
    convs = []
    for k in kernel_sizes:
        conv1 = GlobalMaxPooling1D()(Conv1D(100, k, activation='relu')(input_layer))
        conv2 = GlobalMaxPooling1D()(Conv1D(100, k, activation='relu')(input_layer))
        convs.extend([conv1, conv2])  # Add both filters

    merged = Concatenate()(convs)
    dropout = Dropout(0.5)(merged)  # Prevent overfitting
    output = Dense(1, activation='sigmoid')(dropout)

    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [11]:
# ---------------- 5. Train and evaluate models with different kernel sizes ----------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
kernel_sets = [[2, 3, 4], [3, 4, 5], [4, 5, 6]]

best_acc = 0
best_kernels = None

for kernels in kernel_sets:
    print(f"\nTraining with kernel sizes: {kernels}")
    model = cnn_model(kernels)
    model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.1, verbose=0)
    
    loss, acc = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test loss: {loss:.4f} | Test accuracy: {acc:.4f}")
    
    if acc > best_acc:
        best_acc = acc
        best_kernels = kernels

print(f"\nBest kernel set: {best_kernels} with test accuracy: {best_acc:.4f}")


Training with kernel sizes: [2, 3, 4]
Test loss: 0.2497 | Test accuracy: 0.9016

Training with kernel sizes: [3, 4, 5]
Test loss: 0.2582 | Test accuracy: 0.8997

Training with kernel sizes: [4, 5, 6]
Test loss: 0.2628 | Test accuracy: 0.8991

Best kernel set: [2, 3, 4] with test accuracy: 0.9016
