### 作業目的: 熟練自定義collate_fn與sampler進行資料讀取

本次作業主要會使用[IMDB](http://ai.stanford.edu/~amaas/data/sentiment/)資料集，利用Pytorch的Dataset與DataLoader進行
客製化資料讀取。
下載後的資料有分成train與test，因為這份作業目的在讀取資料，所以我們取用train部分來進行練習。
(請同學先行至IMDB下載資料)

### 載入套件

In [1]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# NOTE(review): get_pos() below calls nltk.pos_tag and preprocessing() uses
# WordNetLemmatizer; those presumably need the POS-tagger and 'wordnet'
# resources as well — confirm they are already installed on this machine.
nltk.download('stopwords') # download the stopwords corpus
nltk.download('punkt') # download the corpus that word_tokenize needs

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def get_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) is inspected:
    'J' -> adjective, 'N' -> noun, 'V' -> verb, 'R' -> adverb.
    Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'R':
        return wordnet.ADV
    # 'N' and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

def preprocessing(sentence):
    """Turn raw review text into a list of lemmatized, lower-case tokens.

    Steps: strip apostrophes and hyphens, lower-case, blank out everything
    that is not a-z, tokenize with NLTK, then lemmatize each token using the
    part of speech reported by `get_pos`.

    Relies on a module-level `lemmatizer` object being defined before the
    first call.
    """
    # Apostrophes and hyphens are deleted outright (no space is inserted),
    # then the text is lower-cased.
    lowered = re.sub(r'[\'-]', '', sentence).lower()
    # Every remaining character outside a-z becomes a single space.
    letters_only = re.sub(r'[^a-z]', ' ', lowered)
    tokens = nltk.word_tokenize(letters_only)

    return [lemmatizer.lemmatize(token, get_pos(token)) for token in tokens]


### 探索資料與資料前處理
這份作業我們使用train資料中的pos與neg


In [3]:
# Build the vocabulary: the set of every token that appears in any review.
import os
from tqdm import tqdm_notebook
path = r'.\aclImdb\train'
folders = ['neg', 'pos']

lemmatizer = WordNetLemmatizer()
vocabs = set()

for folder in folders:
    folder_path = os.path.join(path, folder)
    for txt in tqdm_notebook(os.listdir(folder_path)):
        with open(os.path.join(folder_path, txt), mode='r',encoding='utf-8') as f:
            content = f.read()
        # update() grows the set in place; rebinding vocabs to vocabs.union(...)
        # would copy the whole accumulated vocabulary once per file.
        vocabs.update(preprocessing(content))


HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




In [4]:
# Remove NLTK English stopwords: they carry little information and may hurt
# model training.
print(f"vocab length before removing stopwords: {len(vocabs)}")

vocabs = vocabs - set(stopwords.words('english'))

print(f"vocab length after removing stopwords: {len(vocabs)}")

# Convert the vocabulary into a word -> index dictionary.
# * Sorting makes the mapping reproducible; iterating the set directly yields
#   an order that can change from one interpreter run to the next.
# * Indices start at 1 so that 0 stays free for the padding value that
#   collate_fn appends; an embedding table therefore needs
#   len(vocab_dict) + 1 rows.
vocabs = sorted(vocabs)
vocab_dict = {word: index for index, word in enumerate(vocabs, start=1)}


vocab length before removing stopwords: 78602
vocab length after removing stopwords: 78468


In [5]:
# Pack the data as (x, y) pairs, where x is the path of a review file and y is
# 1 for a positive review or 0 for a negative one (the position of the folder
# in `folders`).
# x is kept as a path so that reviews are not all read into memory up front.
# If memory allows, loading everything once would cut I/O time during training.

review_pairs = []
for label, folder in enumerate(folders):
    folder_path = os.path.join(path, folder)
    review_pairs.extend(
        (os.path.join(folder_path, name), label)
        for name in os.listdir(folder_path)
    )

print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('.\\aclImdb\\train\\neg\\0_3.txt', 0), ('.\\aclImdb\\train\\neg\\10000_4.txt', 0)]
Total reviews: 25000


### 建立Dataset, DataLoader, Sampler與Collate_fn讀取資料
這裡我們會需要兩個helper functions，其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是生成詞向量函式
(generate_vec)，注意這裡我們用來產生詞向量的方法是單純將文字tokenize(為了使產生的文本長度不同，而不使用BoW)

In [6]:
# English stopwords, loaded on first use and then reused for every review.
_STOPWORD_CACHE = None


def load_review(review_path):
    """Read one review file and return its unique non-stopword tokens.

    Parameters
    ----------
    review_path: str
        path of a UTF-8 encoded review text file

    Returns
    -------
    list
        each distinct token produced by `preprocessing` that is not an NLTK
        English stopword, in order of first appearance in the review
    """
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        # Reading the stopword list once avoids re-loading it for every sample.
        _STOPWORD_CACHE = frozenset(stopwords.words('english'))

    with open(review_path, mode='r', encoding='utf-8') as f:
        sentence = f.read()

    tokens = preprocessing(sentence)

    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # result is deterministic (a plain set difference has arbitrary order).
    return [word for word in dict.fromkeys(tokens) if word not in _STOPWORD_CACHE]
    

def generate_vec(review, vocab_dict):
    """Encode a token list as a list of vocabulary indices.

    Parameters
    ----------
    review: iterable
        tokens of one review
    vocab_dict: dict
        token -> index mapping

    Returns
    -------
    list
        index of every token found in `vocab_dict`, in input order; tokens
        missing from the mapping are skipped
    """
    return [vocab_dict[token] for token in review if token in vocab_dict]

In [15]:
#建立客製化dataset

class dataset(Dataset):
    '''Dataset that reads and encodes one review file per requested index.

    Parameters
    ----------
    data_pairs: list
        (review file path, label) pairs
    vocab_dict: dict
        word -> index mapping used to encode the review tokens
    '''
    def __init__(self, data_pairs, vocab_dict):
        self.vocab = vocab_dict
        self.data_pairs = data_pairs

    def __len__(self):
        # One sample per (path, label) pair.
        return len(self.data_pairs)

    def __getitem__(self, idx):
        pair = self.data_pairs[idx]
        # The file is only read here, when the sample is actually requested.
        tokens = load_review(pair[0])

        return generate_vec(tokens, self.vocab), pair[1]
            

# Custom collate_fn: right-pad variable-length samples with 0 so that every
# sample in the batch has the same length.
def collate_fn(batch):
    """Collate (indices, label) samples into padded tensors.

    Parameters
    ----------
    batch: list
        samples of the form (list of int indices, label)

    Returns
    -------
    tuple
        (data, label, length): `data` holds each index list right-padded with
        0 up to the longest sample in the batch, `label` holds the labels and
        `length` holds the original, unpadded length of each sample
    """
    lengths = [len(sample[0]) for sample in batch]
    labels = [sample[1] for sample in batch]

    # default=0 keeps an empty batch from raising on max([]).
    widest = max(lengths, default=0)

    padded = [
        sample[0] + [0] * (widest - size)
        for sample, size in zip(batch, lengths)
    ]

    # The explicit integer dtype matters when every sequence is empty:
    # torch.tensor would otherwise infer a float tensor from the empty lists.
    return (
        torch.tensor(padded, dtype=torch.long),
        torch.tensor(labels),
        torch.tensor(lengths, dtype=torch.long),
    )

In [16]:
# Build the dataset and a DataLoader that draws sample indices in random order
# (shuffle=True), then fetch one batch to inspect the collated output.
custom_dst = dataset(data_pairs=review_pairs, vocab_dict=vocab_dict)
custom_dataloader = DataLoader(
    custom_dst,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)
next(iter(custom_dataloader))

(tensor([[72473, 37316, 15350, 10618, 26508, 27765, 35492, 52941, 39033,  4550,
          23411, 71684, 72556, 38173, 20877, 57344,  3525, 28037, 66918, 19344,
          23196, 53982, 26205, 75119, 48560, 75298,  8146,  2468, 23245, 19006,
          15368, 26483, 55965, 75994, 52590, 54556, 64647, 15297, 78207, 77343,
          31341, 19323,  4694, 20161, 52875,  2297, 17837, 74949,  3468, 75930,
          58316, 72153, 57295, 22199, 33183,  8473, 71861, 54143, 26273, 60072,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   