# IMDb 文本情感分析

## 数据预处理

In [48]:
#载入IMDb数据
import pandas as pd
data = pd.read_csv('E:/本科/数据挖掘与商务分析/hw/final/IMDB Dataset.csv')
print(data.shape)

#将sentiment列中的positive和negative替换为1和0
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})
data.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## RNN分析——深度学习初体验

In [49]:
#将样本划分为训练集和测试集
from sklearn.model_selection import train_test_split
X = data['review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape, X_test.shape)

(40000,) (10000,)


In [50]:
#将评论分为token
import nltk
from collections import Counter, OrderedDict

#nltk.download('punkt')
#nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
X_train = X_train.apply(word_tokenize)
X_test = X_test.apply(word_tokenize)
print(X_train.head())

tokencounts = Counter()
for line in X_train:
    tokencounts.update(line)

print('vocab size:', len(tokencounts))

18165    [This, film, has, been, compared, to, the, hil...
36059    [Reasonably, effective, horror/science-fiction...
13242    [The, inspiration, for, the, ``, Naked, Gun, '...
32985    [When, this, film, was, originally, released, ...
41133    [I, happened, upon, this, by, chance, ., I, wa...
Name: review, dtype: object
vocab size: 173108


In [41]:
#利用torchtext的vocab将token转换为整数
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(tokencounts.items(), key=lambda x: x[1], reverse=True) 
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[19, 9, 46, 496]


In [64]:
#将X_train,y_train,X_test,y_test转换为pytorch的Dataset
import torch
from torch.utils.data import Dataset, DataLoader
class IMDBDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return torch.tensor([vocab[token] for token in self.X.iloc[idx]], dtype=torch.long), torch.tensor(self.y.iloc[idx], dtype=torch.float32)

train_dataset = IMDBDataset(X_train, y_train)
test_dataset = IMDBDataset(X_test, y_test)

In [None]:
#定义转换函数
import torch
import torch.nn as nn
device = torch.device('cuda' )
review_pipeline = lambda x: [vocab[token] for token in word_tokenize(x)]
sentiment_pipeline = lambda x: int(x)

def collate_batch(batch):
    sentiment_list, review_list, lengths = [], [], []
    for _sentiment, _review in batch:
        sentiment_list.append(sentiment_pipeline(_sentiment))
        processed_text = torch.tensor(review_pipeline(_review), dtype=torch.int64)
        review_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_review_list = nn.utils.rnn.pad_sequence(review_list, batch_first=True)
    return padded_review_list.to(device), label_list.to(device), lengths.to(device)

In [66]:
## Take a small batch

dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
review_batch, sentiment_batch, length_batch = next(iter(dataloader))
print(review_batch)
print(sentiment_batch)
print(length_batch)
print(review_batch.shape)

ValueError: only one element tensors can be converted to Python scalars