In [1]:
import pandas as pd
import os
import numpy as np

In [3]:

# Load the negative training reviews from dataset/aclImdb/train/neg (relative to the
# current working directory) into `df` with columns "text" and "score".
folder_path = os.path.join(os.getcwd(), "dataset", "aclImdb", "train", "neg")

data = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order (and
# therefore any later seeded shuffle/split of these rows) reproducible across machines.
for filename in sorted(os.listdir(folder_path)):
    stem, extension = os.path.splitext(filename)
    if extension != ".txt":
        # Skip stray non-review files instead of failing on the name parsing below.
        continue
    # The score is the part after the first underscore of the file stem; it is kept
    # as the raw string taken from the filename.
    score = stem.split('_')[1]
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        text = file.read().strip()
    data.append({"text": text, "score": score})

# Explicit columns keep the frame's schema stable even when no review was found.
df = pd.DataFrame(data, columns=["text", "score"])




In [4]:

# Load the positive training reviews from dataset/aclImdb/train/pos (relative to the
# current working directory) into `df2` with columns "text" and "score".
folder_path = os.path.join(os.getcwd(), "dataset", "aclImdb", "train", "pos")

data = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order (and
# therefore any later seeded shuffle/split of these rows) reproducible across machines.
for filename in sorted(os.listdir(folder_path)):
    stem, extension = os.path.splitext(filename)
    if extension != ".txt":
        # Skip stray non-review files instead of failing on the name parsing below.
        continue
    # The score is the part after the first underscore of the file stem; it is kept
    # as the raw string taken from the filename.
    score = stem.split('_')[1]
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        text = file.read().strip()
    data.append({"text": text, "score": score})

# Explicit columns keep the frame's schema stable even when no review was found.
df2 = pd.DataFrame(data, columns=["text", "score"])


In [5]:
# Load the positive test reviews from dataset/aclImdb/test/pos (relative to the
# current working directory) into `df3` with columns "text" and "score".
folder_path = os.path.join(os.getcwd(), "dataset", "aclImdb", "test", "pos")
data = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order (and
# therefore any later seeded shuffle/split of these rows) reproducible across machines.
for filename in sorted(os.listdir(folder_path)):
    stem, extension = os.path.splitext(filename)
    if extension != ".txt":
        # Skip stray non-review files instead of failing on the name parsing below.
        continue
    # The score is the part after the first underscore of the file stem; it is kept
    # as the raw string taken from the filename.
    score = stem.split('_')[1]
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        text = file.read().strip()
    data.append({"text": text, "score": score})
# Explicit columns keep the frame's schema stable even when no review was found.
df3 = pd.DataFrame(data, columns=["text", "score"])


In [6]:
# Load the negative test reviews from dataset/aclImdb/test/neg (relative to the
# current working directory) into `df4` with columns "text" and "score".
folder_path = os.path.join(os.getcwd(), "dataset", "aclImdb", "test", "neg")
data = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order (and
# therefore any later seeded shuffle/split of these rows) reproducible across machines.
for filename in sorted(os.listdir(folder_path)):
    stem, extension = os.path.splitext(filename)
    if extension != ".txt":
        # Skip stray non-review files instead of failing on the name parsing below.
        continue
    # The score is the part after the first underscore of the file stem; it is kept
    # as the raw string taken from the filename.
    score = stem.split('_')[1]
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        text = file.read().strip()
    data.append({"text": text, "score": score})
# Explicit columns keep the frame's schema stable even when no review was found.
df4 = pd.DataFrame(data, columns=["text", "score"])

In [7]:
# Stack the four per-folder frames into one corpus: train (neg + pos), test (pos + neg),
# then everything together.
# ignore_index=True gives every row a unique 0..n-1 label; without it each source frame
# contributes its own 0-based index, so the combined frame carries duplicated labels and
# label-based selection (.loc) returns several rows per label.
# NOTE: `df` must still be the train/neg frame here; a later cell rebinds `df` to the
# combined frame, so re-running this cell after that one would stack the wrong data.
df_fin1 = pd.concat([df, df2], ignore_index=True)
df_fin2 = pd.concat([df3, df4], ignore_index=True)
ds_fin = pd.concat([df_fin1, df_fin2], ignore_index=True)


In [8]:
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [9]:
# The scores were parsed from filenames as strings; convert the column to a numeric dtype.
ds_fin['score'] = pd.to_numeric(ds_fin['score'])

In [21]:
def _score_to_class(p):
    """Return 1 for scores >= 7, 0 for scores <= 4, and None for anything else."""
    if p >= 7:
        return 1
    if p <= 4:
        return 0
    return None


# Derive a binary sentiment label from the numeric score.
ds_fin['class'] = ds_fin['score'].apply(_score_to_class)


In [24]:
# `df` is an alias of the combined frame, not a copy: columns assigned through `df`
# also appear on `ds_fin`, and from here on the name `df` no longer refers to the
# train/neg frame that was loaded under the same name earlier in the notebook.
df = ds_fin

def clean_text(text):
    """Normalise one raw review string.

    Replaces every literal ``<br />`` tag with a space, removes each character that
    is not an ASCII letter, a digit or whitespace, and lower-cases what is left.

    Args:
        text: The raw review text.

    Returns:
        The cleaned, lower-cased text.
    """
    without_breaks = re.sub(r'<br />', ' ', text)
    alnum_and_space = re.sub(r'[^a-zA-Z0-9\s]', '', without_breaks)
    return alnum_and_space.lower()

# Normalise every review, then split the cleaned text into word tokens.
df['cleaned_text'] = df['text'].map(clean_text)

# Fetch the punkt tokenizer data before calling word_tokenize.
nltk.download('punkt')
df['tokenized_text'] = df['cleaned_text'].map(word_tokenize)

# Seeded 50/50 split: token lists are the inputs, the numeric score is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df['tokenized_text'],
    df['score'],
    test_size=0.5,
    random_state=42,
)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ilya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
from gensim.models import Word2Vec

# Fit 100-dimensional word embeddings on the tokenized training reviews only,
# then persist the fitted model next to the notebook.
word2vec_model = Word2Vec(
    sentences=X_train,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)
word2vec_model.save("word2vec_movie_reviews.model")

In [27]:
import torch
from torch.utils.data import Dataset, DataLoader
class MovieReviewDataset(Dataset):
    """Dataset of (embedded review, score) pairs for a sequence regressor.

    Each review is a sequence of tokens. Tokens known to the Word2Vec model are
    replaced by their vectors, unknown tokens are dropped, and the result is
    truncated or zero-padded at the end to exactly ``max_seq_len`` rows.

    Args:
        reviews: Positionally indexable (``.iloc``) collection of token sequences,
            e.g. a pandas Series of token lists.
        scores: Array-like with ``astype``/``values`` (e.g. a pandas Series) holding
            one numeric score per review.
        word2vec_model: Object exposing ``vector_size`` and a ``wv`` mapping that
            supports ``token in wv`` and ``wv[token]``.
        max_seq_len: Number of token vectors kept per review.

    Raises:
        ValueError: If ``reviews`` and ``scores`` differ in length.
    """

    def __init__(self, reviews, scores, word2vec_model, max_seq_len=100):
        if len(reviews) != len(scores):
            raise ValueError(
                f"reviews and scores must have the same length, "
                f"got {len(reviews)} and {len(scores)}"
            )
        self.reviews = reviews
        self.scores = torch.tensor(scores.astype(float).values, dtype=torch.float32)
        self.word2vec_model = word2vec_model
        self.vector_size = word2vec_model.vector_size
        self.max_seq_len = max_seq_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return ``(embedding, score)`` for the review at position ``idx``.

        ``embedding`` is a float32 tensor of shape ``(max_seq_len, vector_size)``.
        """
        tokens = self.reviews.iloc[idx]
        keyed_vectors = self.word2vec_model.wv

        # Fill a preallocated float32 buffer instead of building a list of arrays:
        # converting a list of ndarrays with torch.tensor is slow, and the float64
        # zero padding previously forced an extra dtype conversion.
        padded = np.zeros((self.max_seq_len, self.vector_size), dtype=np.float32)
        filled = 0
        for token in tokens:
            if filled == self.max_seq_len:
                # Everything past max_seq_len would be truncated anyway.
                break
            if token in keyed_vectors:
                padded[filled] = keyed_vectors[token]
                filled += 1

        # `padded` is freshly allocated per call, so sharing its memory is safe.
        embedding = torch.from_numpy(padded)
        return embedding, self.scores[idx]

# Wrap both halves of the split in datasets that share the same Word2Vec model.
train_dataset = MovieReviewDataset(
    reviews=X_train, scores=y_train, word2vec_model=word2vec_model
)
test_dataset = MovieReviewDataset(
    reviews=X_test, scores=y_test, word2vec_model=word2vec_model
)

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)


In [36]:
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    """LSTM regressor that emits one scalar per input sequence.

    Args:
        embedding_dim: Size of each input vector in the sequence.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Run a batch-first input of shape (batch, seq, embedding_dim) through the model.

        Returns:
            A 1-D tensor with one prediction per sequence in the batch.
        """
        sequence_out, _state = self.lstm(x)
        # Only the output at the final time step feeds the linear head.
        final_step = sequence_out[:, -1, :]
        # Flatten the (batch, 1) head output to (batch,).
        return self.fc(final_step).view(-1)


In [41]:
import torch.optim as optim
# Hyperparameters for the LSTM score regressor.
embedding_dim = 100  # must match the Word2Vec vector_size used to embed the reviews
hidden_dim = 128
num_layers = 1
num_epochs = 10
learning_rate = 0.001
# Use the GPU when one is present, otherwise fall back to the CPU; a hard-coded
# 'cuda' device makes model.to(device) fail on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMModel(embedding_dim=embedding_dim, hidden_dim=hidden_dim, num_layers=num_layers)
model = model.to(device)

# The model regresses the raw review score, so it is trained with mean squared error.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for embeddings, labels in train_loader:
        embeddings, labels = embeddings.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')

Epoch [1/10], Loss: 10.6095
Epoch [2/10], Loss: 6.8995
Epoch [3/10], Loss: 6.0778
Epoch [4/10], Loss: 5.5502
Epoch [5/10], Loss: 5.2228
Epoch [6/10], Loss: 4.8632
Epoch [7/10], Loss: 4.5559
Epoch [8/10], Loss: 4.2448
Epoch [9/10], Loss: 3.8620
Epoch [10/10], Loss: 3.5830


In [57]:
# Evaluate on the held-out split: mean absolute error of the predicted score, plus a
# binary accuracy obtained by thresholding both prediction and true score at 5.
criterion2 = nn.L1Loss()
model.eval()
test_loss = 0.0  # running SUM of absolute errors over all samples
correct = 0
total = 0

with torch.no_grad():
    for embeddings, labels in test_loader:
        embeddings, labels = embeddings.to(device), labels.to(device)
        outputs = model(embeddings)
        n_batch = labels.size(0)
        # The model already returns a 1-D (batch,) tensor. An extra squeeze() would turn
        # a single-sample batch into a 0-d tensor that no longer matches the 1-D labels.
        loss = criterion2(outputs, labels)
        # Weight the batch mean by its size so a smaller final batch is not over-counted.
        test_loss += loss.item() * n_batch
        # NOTE(review): 5 is used as the cut-off for both predictions and labels; confirm
        # this is the intended decision boundary for the predicted scores.
        predicted = torch.ge(outputs, 5).float()
        labels_binary = torch.ge(labels, 5).float()
        correct += (predicted == labels_binary).sum().item()
        total += n_batch
accuracy = correct / total

print(f'Test Loss: {test_loss / total:.4f}, Accuracy: {accuracy * 100:.2f}%')


Test Loss: 1.7389, Accuracy: 82.50%
