# IMDb 文本情感分析

## RNN分析——深度学习初体验

In [27]:
#测试cuda是否可用
import torch
if torch.cuda.is_available():
    print("CUDA is available.")
else:
    print("CUDA is not available.")

CUDA is available.


In [28]:
import warnings
warnings.filterwarnings("ignore")
#导入数据
import pandas as pd
data = pd.read_csv('E:/本科/数据挖掘与商务分析/hw/final/IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [30]:
#将数据集分为训练集和测试集
from sklearn.model_selection import train_test_split
X = data['review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
train_dataset = [(x, y) for x, y in zip(X_train, y_train)]
train_dataset = [(x, y) for x, y in zip(X_train, y_train)]

In [31]:
import re
from collections import Counter, OrderedDict

token_counts = Counter()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized


for review, _ in train_dataset:
    tokens = tokenizer(review)
    token_counts.update(tokens)
 
    
print('Vocab-size:', len(token_counts))

Vocab-size: 94953


In [32]:
#利用torchtext的vocab将token转换为整数
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True) 
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 472]


In [35]:
#定义转换函数
import torch.nn as nn
device = torch.device('cpu' )
review_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
sentiment_pipeline = lambda x: 1 if x == 'positive' else 0

def collate_batch(batch):
    review_list, sentiment_list, lengths = [], [], []
    for _review, _sentiment,in batch:
        processed_text =torch.tensor(review_pipeline(_review), dtype=torch.int64)
        review_list.append(processed_text)
        sentiment_list.append(sentiment_pipeline(_sentiment))
        lengths.append(processed_text.size(0))
    sentiment_list = torch.tensor(sentiment_list)
    lengths = torch.tensor(lengths)
    padded_review_list = nn.utils.rnn.pad_sequence(review_list, batch_first=True)
    return padded_review_list.to(device), sentiment_list.to(device), lengths.to(device)

In [36]:
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
review_batch, sentiment_batch, length_batch = next(iter(dataloader))
print(review_batch)
print(sentiment_batch)
print(length_batch)
print(review_batch.shape)

tensor([[  11,   20,   47,  ...,    0,    0,    0],
        [3721, 1203,  186,  ...,    0,    0,    0],
        [   2, 2787,   17,  ...,    0,    0,    0],
        [  53,   11,   20,  ...,   96,  127,  253]])
tensor([0, 1, 1, 1])
tensor([183,  63, 170, 650])
torch.Size([4, 650])
