### 作業目的: 熟練Pytorch Dataset與DataLoader進行資料讀取

本次作業主要會使用[IMDB](http://ai.stanford.edu/~amaas/data/sentiment/)資料集，利用PyTorch的Dataset與DataLoader進行客製化資料讀取。
下載後的資料有分成train與test，因為這份作業目的在讀取資料，所以我們取用train部分來進行練習。
(請同學先行至IMDB下載資料)

### 載入套件

In [1]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re

# Fetch every NLTK resource this notebook relies on. Packages that are already
# installed are reported as up-to-date and are not downloaded again.
nltk.download('stopwords')  # stop-word lists used to filter the vocabulary
nltk.download('punkt')  # tokenizer models needed by nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model needed by nltk.pos_tag
nltk.download('wordnet')  # WordNet corpus needed by WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def get_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing *word*.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) is inspected:
    ``J`` -> adjective, ``N`` -> noun, ``V`` -> verb, ``R`` -> adverb.
    Any other first letter falls back to noun.
    """
    # pos_tag returns one (token, tag) pair per input token; we passed one token.
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

### 探索資料與資料前處理
在train資料中，有分成pos(positive)與neg(negative)，分別為正評價與負評價，此評價即為label。

In [3]:
# Build the vocabulary: every distinct lemma that appears in any training review.
import os

# os.path.join keeps the location portable across operating systems.
path = os.path.join('.', 'aclImdb', 'train')
folders = ['pos', 'neg']

lemmatizer = WordNetLemmatizer()
# Accumulate into a single set; rebuilding a list/set of the whole vocabulary
# for every file makes the loop needlessly quadratic.
vocab_set = set()

for fold in folders:
    folder = os.path.join(path, fold)
    for txt in os.listdir(folder):
        # Only the read needs the file handle, so close it before processing.
        with open(os.path.join(folder, txt), mode='r', encoding='utf-8') as f:
            line = f.read()

        # Drop apostrophes/hyphens (joining the surrounding letters), lowercase,
        # then turn every remaining non a-z character into a space.
        line = re.sub('[\'-]', '', line).lower()
        line = re.sub('[^a-z]', ' ', line)
        vocab_set.update(
            lemmatizer.lemmatize(word, get_pos(word))
            for word in nltk.word_tokenize(line)
        )

vocab = list(vocab_set)

# Remove NLTK English stop words: they carry little information and may also
# hurt model training.
print(f"vocab length before removing stopwords: {len(vocab)}")

vocab = list(vocab_set - set(stopwords.words('english')))

print(f"vocab length after removing stopwords: {len(vocab)}")


vocab length before removing stopwords: 78602
vocab length after removing stopwords: 78468


In [4]:
# Turn the vocabulary into a word -> index dictionary.
vocab = sorted(vocab)

# dict.fromkeys keeps the first occurrence of each word in order, so indices are
# consecutive integers starting at 0, assigned in sorted order.
vocab_dic = {
    word: position
    for position, word in enumerate(dict.fromkeys(vocab))
}

vocab_dic

{'aa': 0,
 'aaa': 1,
 'aaaaaaah': 2,
 'aaaaah': 3,
 'aaaaatchkah': 4,
 'aaaahhhhhhh': 5,
 'aaaand': 6,
 'aaaarrgh': 7,
 'aaah': 8,
 'aaand': 9,
 'aaargh': 10,
 'aaaugh': 11,
 'aachen': 12,
 'aada': 13,
 'aadha': 14,
 'aag': 15,
 'aage': 16,
 'aaghh': 17,
 'aah': 18,
 'aahhh': 19,
 'aaip': 20,
 'aaja': 21,
 'aakash': 22,
 'aaker': 23,
 'aakrosh': 24,
 'aaliyah': 25,
 'aames': 26,
 'aamess': 27,
 'aamir': 28,
 'aan': 29,
 'aankh': 30,
 'aankhen': 31,
 'aap': 32,
 'aapke': 33,
 'aapkey': 34,
 'aardman': 35,
 'aardmans': 36,
 'aardvark': 37,
 'aargh': 38,
 'aaron': 39,
 'aarp': 40,
 'aarrrgh': 41,
 'aasize': 42,
 'aatish': 43,
 'aauugghh': 44,
 'aavjo': 45,
 'aaww': 46,
 'ab': 47,
 'aba': 48,
 'aback': 49,
 'abahy': 50,
 'abanazer': 51,
 'abandon': 52,
 'abandonment': 53,
 'abanks': 54,
 'abash': 55,
 'abashidze': 56,
 'abate': 57,
 'abatement': 58,
 'abattoir': 59,
 'abba': 60,
 'abbad': 61,
 'abbas': 62,
 'abbasi': 63,
 'abbasmustan': 64,
 'abbey': 65,
 'abbie': 66,
 'abbot': 67,
 'abbot

In [5]:
# Pack the data into (x, y) pairs, where x is the path of a review file and y is
# the label: positive (1) or negative (0).
# x is kept as a file path so that the reviews are not all loaded at once (a
# lazy-loading exercise). If memory allows, loading everything up front would
# cut I/O time during training.

folders = ['neg', 'pos']
review_pairs = []

# The position of each folder in `folders` doubles as its label.
for label, fold in enumerate(folders):
    folder = os.path.join(path, fold)
    review_pairs.extend(
        (os.path.join(folder, txt), label) for txt in os.listdir(folder)
    )

print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('.\\aclImdb\\train\\neg\\0_3.txt', 0), ('.\\aclImdb\\train\\neg\\10000_4.txt', 0)]
Total reviews: 25000


### 建立Dataset與DataLoader讀取資料
這裡我們會需要兩個helper functions，其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是生成詞向量BoW的函式
(generate_bow)

In [6]:
def load_review(review_path):
    """Read one review file and return its cleaned, lemmatized vocabulary.

    Parameters
    ----------
    review_path: str
        path of the UTF-8 text file holding the review

    Returns
    -------
    list
        the distinct lemmas found in the review with NLTK English stop words
        removed; the list is built from a set, so its order is unspecified
    """
    with open(review_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # Drop apostrophes/hyphens, lowercase, then blank out every character that
    # is not a-z so that only alphabetic tokens remain.
    cleaned = re.sub('[^a-z]', ' ', re.sub('[\'-]', '', raw_text).lower())

    lemmas = {
        lemmatizer.lemmatize(token, get_pos(token))
        for token in nltk.word_tokenize(cleaned)
    }
    return list(lemmas - set(stopwords.words('english')))

In [7]:
def generate_bow(review, vocab_dic):
    """Build a bag-of-words count vector for one review.

    Parameters
    ----------
    review: iterable of str
        the words of the review
    vocab_dic: dict
        word -> index mapping; indices are assumed to lie in
        ``range(len(vocab_dic))`` because the vector has that length

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(vocab_dic)`` where position ``i`` holds how
        many times the word mapped to ``i`` occurs in ``review``; words missing
        from ``vocab_dic`` are ignored
    """
    bag_vector = np.zeros(len(vocab_dic))
    for word in review:
        index = vocab_dic.get(word)
        # Compare against None explicitly: index 0 is a valid position but is
        # falsy, so a plain truthiness test would never count that word.
        if index is not None:
            bag_vector[index] += 1

    return bag_vector

In [10]:
class dataset(Dataset):
    """Custom dataset that lazily loads a review and its label per index.

    Parameters
    ----------
    data_pairs: list
        (review file path, label) pairs, one per sample
    vocab: dict
        word -> index mapping handed to ``generate_bow``
    """

    def __init__(self, data_pairs, vocab):
        self.vocab = vocab
        self.data_pairs = data_pairs

    def __len__(self):
        # One sample per (path, label) pair.
        return len(self.data_pairs)

    def __getitem__(self, idx):
        # The review is read from disk and vectorized only when requested.
        review_path = self.data_pairs[idx][0]
        bow = generate_bow(load_review(review_path), self.vocab)
        x = torch.tensor(bow)
        y = torch.tensor(self.data_pairs[idx][1])
        return x, y
        

In [11]:
# Build the custom dataset from the (path, label) pairs and the word -> index
# dictionary, then fetch sample 10 to inspect the (x, y) it returns.
custom_dst = dataset(review_pairs, vocab_dic)
custom_dst[10]

(tensor([0., 0., 0.,  ..., 0., 0., 0.], dtype=torch.float64), tensor(0))

In [12]:
# Build a DataLoader that serves shuffled batches of 5 samples, then pull a
# single batch to inspect it.
custom_dataloader = DataLoader(dataset=custom_dst, batch_size=5, shuffle=True)
next(iter(custom_dataloader))

[tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64),
 tensor([1, 0, 1, 1, 1])]

In [15]:
# Print the first five batches served by the DataLoader, then stop.
count = 0
for data in custom_dataloader:
    if count != 5:
        # Each batch is an (x, y) pair, unpacked straight into the format call.
        print('x: {}, y:{}'.format(*data))
        count += 1
    else:
        break

x: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64), y:tensor([0, 1, 1, 0, 0])
x: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64), y:tensor([0, 1, 0, 0, 0])
x: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64), y:tensor([1, 1, 0, 0, 1])
x: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64), y:tensor([0, 1, 0,