In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/word2vec-nlp-tutorial/testData.tsv/testData.tsv
/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv/labeledTrainData.tsv
/kaggle/input/glove840b300dtxt/glove.840B.300d.pkl
/kaggle/input/glove840b300dtxt/glove.840B.300d.txt


In [6]:
import logging
import os
import re
import sys
from itertools import chain

import gensim
import pandas as pd
import torch
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split

import pickle

embed_size = 300
max_len = 512

train = pd.read_csv("/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("/kaggle/input/word2vec-nlp-tutorial/testData.tsv/testData.tsv", header=0, delimiter="\t", quoting=3)


def review_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review, "lxml").get_text()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    # if remove_stopwords:
    #     stops = set(stopwords.words("english"))
    #     words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return (words)


def encode_samples(tokenized_samples):
    features = []
    for sample in tokenized_samples:
        feature = []
        for token in sample:
            if token in word_to_idx:
                feature.append(word_to_idx[token])
            else:
                feature.append(0)
        features.append(feature)
    return features


def pad_samples(features, maxlen=max_len, PAD=0):
    padded_features = []
    for feature in features:
        if len(feature) >= maxlen:
            padded_feature = feature[:maxlen]
        else:
            padded_feature = feature
            while len(padded_feature) < maxlen:
                padded_feature.append(PAD)
        padded_features.append(padded_feature)
    return padded_features


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ''.join(sys.argv))

    clean_train_reviews, train_labels = [], []
    for i, review in enumerate(train["review"]):
        clean_train_reviews.append(review_to_wordlist(review, remove_stopwords=False))
        train_labels.append(train["sentiment"][i])

    clean_test_reviews = []
    for review in test["review"]:
        clean_test_reviews.append(review_to_wordlist(review, remove_stopwords=False))

    vocab = set(chain(*clean_train_reviews)) | set(chain(*clean_test_reviews))
    vocab_size = len(vocab)

    train_reviews, val_reviews, train_labels, val_labels = train_test_split(clean_train_reviews, train_labels,
                                                                            test_size=0.2, random_state=0)

    #wvmodel_file = '/kaggle/working/glove.840B.300d.txt'
    wvmodel_file = "/kaggle/input/glove840b300dtxt/glove.840B.300d.txt"
    wvmodel = gensim.models.KeyedVectors.load_word2vec_format(wvmodel_file, binary=False, no_header=True)

    word_to_idx = {word: i + 1 for i, word in enumerate(vocab)}
    word_to_idx['<unk>'] = 0
    idx_to_word = {i + 1: word for i, word in enumerate(vocab)}
    idx_to_word[0] = '<unk>'

    train_features = torch.tensor(pad_samples(encode_samples(train_reviews)))
    val_features = torch.tensor(pad_samples(encode_samples(val_reviews)))
    test_features = torch.tensor(pad_samples(encode_samples(clean_test_reviews)))

    train_labels = torch.tensor(train_labels)
    val_labels = torch.tensor(val_labels)

    weight = torch.zeros(vocab_size + 1, embed_size)
    for i in range(len(wvmodel.index_to_key)):
        try:
            index = word_to_idx[wvmodel.index_to_key[i]]
            # print(i)
        except:
            continue
        weight[index, :] = torch.from_numpy(wvmodel.get_vector(
            idx_to_word[word_to_idx[wvmodel.index_to_key[i]]]))

    pickle_file = os.path.join('imdb_glove.pickle3')
    pickle.dump(
        [train_features, train_labels, val_features, val_labels, test_features, weight, word_to_idx, idx_to_word, vocab],
        open(pickle_file, 'wb'))
    print('data dumped!')


INFO:colab_kernel_launcher.py:running /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py-f/root/.local/share/jupyter/runtime/kernel-a37f9765-44f7-47fe-9121-72b5daa569a2.json
INFO:gensim.models.keyedvectors:loading projection weights from /kaggle/input/glove840b300dtxt/glove.840B.300d.txt
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (2196018, 300) matrix of type float32 from /kaggle/input/glove840b300dtxt/glove.840B.300d.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2026-02-07T10:35:27.566449', 'gensim': '4.4.0', 'python': '3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]', 'platform': 'Linux-6.6.113+-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


data dumped!


In [7]:
import logging
import os
import sys
import pickle
import time

import pandas as pd
import torch
import pandas as pd
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.autograd import Variable
from tqdm import tqdm

from sklearn.metrics import accuracy_score


test = pd.read_csv("/kaggle/input/word2vec-nlp-tutorial/testData.tsv/testData.tsv", header=0, delimiter="\t", quoting=3)


num_epochs = 10
embed_size = 300
num_hiddens = 128
num_layers = 2
bidirectional = True
batch_size = 64
labels = 2
lr = 0.01
device = torch.device('cuda:0')
use_gpu = True


class Attention(nn.Module):
    def __init__(self, num_hiddens, bidirectional, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.bidirectional = bidirectional

        # if bidirectional, then double the hidden dimensionality
        if self.bidirectional:
            self.w_omega = nn.Parameter(torch.Tensor(num_hiddens * 2, num_hiddens * 2))
            self.u_omega = nn.Parameter(torch.Tensor(num_hiddens * 2, 1))
        else:
            self.w_omega = nn.Parameter(torch.Tensor(num_hiddens, num_hiddens))
            self.u_omega = nn.Parameter(torch.Tensor(num_hiddens, 1))

        nn.init.uniform_(self.w_omega, -0.1, 0.1)
        nn.init.uniform_(self.u_omega, -0.1, 0.1)

    def forward(self, inputs):
        x = inputs
        u = torch.tanh(torch.matmul(x, self.w_omega))
        att = torch.matmul(u, self.u_omega)

        att_score = F.softmax(att, dim=1)
        outputs = x * att_score
        return outputs


class SentimentNet(nn.Module):
    def __init__(self, embed_size, num_hiddens, num_layers, bidirectional, weight, labels, use_gpu, **kwargs):
        super(SentimentNet, self).__init__(**kwargs)
        self.embed_size = embed_size
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.use_gpu = use_gpu
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False
        self.encoder = nn.LSTM(input_size=self.embed_size, hidden_size=self.num_hiddens,
                               num_layers=self.num_layers, bidirectional=self.bidirectional,
                               dropout=0)
        self.attention = Attention(num_hiddens=self.num_hiddens, bidirectional=self.bidirectional)
        if self.bidirectional:
            self.decoder = nn.Linear(num_hiddens * 4, labels)
        else:
            self.decoder = nn.Linear(num_hiddens * 2, labels)

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        states, hidden = self.encoder(embeddings.permute(1, 0, 2))
        attention = self.attention(states)
        encoding = torch.cat([attention[0], attention[-1]], dim=1)
        outputs = self.decoder(encoding)
        # print(outputs)
        return outputs


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info(r"running %s" % ''.join(sys.argv))

    logging.info('loading data...')
    pickle_file = os.path.join('/kaggle/working/imdb_glove.pickle3')
    [train_features, train_labels, val_features, val_labels, test_features, weight, word_to_idx, idx_to_word,
            vocab] = pickle.load(open(pickle_file, 'rb'))
    logging.info('data loaded!')

    net = SentimentNet(embed_size=embed_size, num_hiddens=num_hiddens, num_layers=num_layers,
                       bidirectional=bidirectional, weight=weight,
                       labels=labels, use_gpu=use_gpu)
    net.to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=lr)

    train_set = torch.utils.data.TensorDataset(train_features, train_labels)
    val_set = torch.utils.data.TensorDataset(val_features, val_labels)
    test_set = torch.utils.data.TensorDataset(test_features, )

    train_iter = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_iter = torch.utils.data.DataLoader(val_set, batch_size=batch_size, shuffle=False)
    test_iter = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

    for epoch in range(num_epochs):
        start = time.time()
        train_loss, val_losses = 0, 0
        train_acc, val_acc = 0, 0
        n, m = 0, 0
        with tqdm(total=len(train_iter), desc='Epoch %d' % epoch) as pbar:
            for feature, label in train_iter:
                n += 1
                net.zero_grad()
                feature = Variable(feature.cuda())
                label = Variable(label.cuda())
                score = net(feature)
                loss = loss_function(score, label)
                loss.backward()
                optimizer.step()
                train_acc += accuracy_score(torch.argmax(score.cpu().data,
                                                         dim=1), label.cpu())
                train_loss += loss

                pbar.set_postfix({'epoch': '%d' % (epoch),
                                  'train loss': '%.4f' % (train_loss.data / n),
                                  'train acc': '%.2f' % (train_acc / n)
                                  })
                pbar.update(1)

            with torch.no_grad():
                for val_feature, val_label in val_iter:
                    m += 1
                    val_feature = val_feature.cuda()
                    val_label = val_label.cuda()
                    val_score = net(val_feature)
                    val_loss = loss_function(val_score, val_label)
                    val_acc += accuracy_score(torch.argmax(val_score.cpu().data, dim=1), val_label.cpu())
                    val_losses += val_loss
            end = time.time()
            runtime = end - start
            pbar.set_postfix({'epoch': '%d' % (epoch),
                              'train loss': '%.4f' % (train_loss.data / n),
                              'train acc': '%.2f' % (train_acc / n),
                              'val loss': '%.4f' % (val_losses.data / m),
                              'val acc': '%.2f' % (val_acc / m),
                              'time': '%.2f' % (runtime)
                              })

            # tqdm.write('{epoch: %d, train loss: %.4f, train acc: %.2f, val loss: %.4f, val acc: %.2f, time: %.2f}' %
            #       (epoch, train_loss.data / n, train_acc / n, val_losses.data / m, val_acc / m, runtime))

    test_pred = []
    with torch.no_grad():
        with tqdm(total=len(test_iter), desc='Prediction') as pbar:
            for test_feature, in test_iter:
                test_feature = test_feature.cuda()
                test_score = net(test_feature)
                # test_pred.extent
                test_pred.extend(torch.argmax(test_score.cpu().data, dim=1).numpy().tolist())

                pbar.update(1)

    result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
    result_output.to_csv("attention_lstm.csv", index=False, quoting=3)
    logging.info('result saved!')

INFO:colab_kernel_launcher.py:running /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py-f/root/.local/share/jupyter/runtime/kernel-a37f9765-44f7-47fe-9121-72b5daa569a2.json
INFO:root:loading data...
INFO:root:data loaded!
Epoch 0: 100%|██████████| 313/313 [00:25<00:00, 12.32it/s, epoch=0, train loss=0.6610, train acc=0.57, val loss=0.6163, val acc=0.72, time=25.40]
Epoch 1: 100%|██████████| 313/313 [00:26<00:00, 11.87it/s, epoch=1, train loss=0.4639, train acc=0.80, val loss=0.4857, val acc=0.78, time=26.38]
Epoch 2: 100%|██████████| 313/313 [00:27<00:00, 11.39it/s, epoch=2, train loss=0.3985, train acc=0.83, val loss=0.4195, val acc=0.83, time=27.48]
Epoch 3: 100%|██████████| 313/313 [00:27<00:00, 11.36it/s, epoch=3, train loss=0.3760, train acc=0.85, val loss=0.3643, val acc=0.86, time=27.55]
Epoch 4: 100%|██████████| 313/313 [00:27<00:00, 11.36it/s, epoch=4, train loss=0.3447, train acc=0.86, val loss=0.3635, val acc=0.86, time=27.56]
Epoch 5: 100%|██████████| 313/313

In [9]:
import pandas as pd
result_df = pd.read_csv("attention_lstm.csv")
print("测试集预测结果（前10条）：")
display(result_df.head(10)) 
# 统计正负情感数量
count_1 = len(result_df[result_df["sentiment"] == 1])
count_0 = len(result_df[result_df["sentiment"] == 0])
print(f"\n✅ 正面情感（1）：{count_1} 条")
print(f"❌ 负面情感（0）：{count_0} 条")
print(f"总计：{count_1 + count_0} 条")

测试集预测结果（前10条）：


Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,1
5,2913_8,0
6,4396_1,0
7,395_2,0
8,10616_1,0
9,9074_9,1



✅ 正面情感（1）：11297 条
❌ 负面情感（0）：13703 条
总计：25000 条
