### 作業目的: 熟練以Torchtext進行文本資料讀取

本次作業主要會使用[polarity](http://www.cs.cornell.edu/people/pabo/movie-review-data/)的電影評論來進行使用torchtext資料讀取，學員可以在附件的polarity.tsv看到所使用的資料。

Hint: 這次作業同學可以嘗試使用[torchtext.data.TabularDataset](https://torchtext.readthedocs.io/en/latest/data.html#tabulardataset)，可以更簡易讀取資料

### 載入套件

In [1]:
import torch
import pandas as pd
import numpy as np
import random
from torchtext import datasets
from torchtext.legacy import data
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
import nltk
import re

lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')


In [2]:
# 探索資料
# 可以發現資料為文本與類別，而類別即為正評與負評
input_data = pd.read_csv('./polarity.tsv', delimiter='\t', header=None, names=['text', 'label'])
input_data

Unnamed: 0,text,label
0,films adapted from comic books have had plenty...,1
1,every now and then a movie comes along from a ...,1
2,you've got mail works alot better than it dese...,1
3,jaws is a rare film that grabs your attentio...,1
4,moviemaking is a lot like being the general ma...,1
...,...,...
1995,"if anything , "" stigmata "" should be taken as ...",0
1996,"john boorman's "" zardoz "" is a goofy cinematic...",0
1997,the kids in the hall are an acquired taste .it...,0
1998,there was a time when john carpenter was a gre...,0


In [3]:
len(input_data)

2000

In [4]:
# 自定義 preprocessing 所需函式
def get_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = { 'J': wordnet.ADJ,
                 'V': wordnet.VERB,
                 'N': wordnet.NOUN,
                 'R': wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def preProcess(words):
    
    newWords = []
    for w in words:
        if w not in stop and len(w)>2:
            w = lemmatizer.lemmatize(w, pos=get_pos(w))
            newWords.append(w)
            
    
    sentence = ' '.join(newWords)
    sentence = re.sub('[\'-]', '', sentence)
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    
    return sentence.split()

### 建立Pipeline生成資料

In [5]:
# 建立Field與Dataset

textField = data.Field(sequential=True, dtype=torch.float64, lower=True, tokenize='spacy', preprocessing=preProcess)

labelField = data.Field(sequential=False)


In [6]:
# 取的examples並打亂順序
examples = []

for i in range(len(input_data)):
    examples.append(data.Example.fromlist(data=[input_data.loc[i, 'text'], input_data.loc[i, 'label']], 
                                          fields = [('text', textField), ('label', labelField)]))

random.shuffle(examples)

# 以8:2的比例切分examples
trainRatio = 0.8

train_ex = examples[:int(len(examples)*trainRatio)]
test_ex = examples[int(len(examples)*trainRatio):]

# 建立training與testing dataset
trainData = data.Dataset(examples=train_ex, fields={'text':textField, 'label':labelField})
testData = data.Dataset(examples=test_ex, fields={'text':textField, 'label':labelField})

trainData[0].label, trainData[0].text

(1,
 ['capsule',
  'rock',
  'roll',
  'fable',
  'indeed',
  'like',
  'hubert',
  'selby',
  'novel',
  'filter',
  'equal',
  'part',
  'damon',
  'runyan',
  'bruce',
  'springsteen',
  'ton',
  'fun',
  'streets',
  'fire',
  'bill',
  'rock',
  'roll',
  'fable',
  'description',
  'perfect',
  'this',
  'stylish',
  'great',
  'look',
  'breezily',
  'enjoyable',
  'movie',
  'feel',
  'like',
  'hubert',
  'selby',
  'novel',
  'damon',
  'runyan',
  'rewrite',
  'irected',
  'bruce',
  'springsteen',
  'noir',
  'one',
  'truly',
  'american',
  'movie',
  'genre',
  'aside',
  'hollywood',
  'musical',
  'western',
  'this',
  'post',
  'wwii',
  'brooklyn',
  'noir',
  'lack',
  'well',
  'label',
  'generous',
  'injection',
  'rock',
  'roll',
  'around',
  'confrontational',
  'attitude',
  'it',
  'work',
  'the',
  'plot',
  'simplicity',
  'rocker',
  'ellen',
  'aim',
  'diane',
  'lane',
  'kidnap',
  'vicious',
  'street',
  'gang',
  'lead',
  'bloodthirsty',
  'fe

In [33]:
# # 取的examples並打亂順序
# examples = []

# for i in range(len(input_data)):

# # 以8:2的比例切分examples
# ### <your code> ###
# test_ex = examples[int(len(examples)*0.8):]

# # 建立training與testing dataset
# ### <your code> ###
# test_data = data.Dataset(examples=test_ex, fields={'text':text_field, 'label':label_field})

# train_data[0].label, train_data[0].text

('0',
 [' ',
  'snake',
  'eyes',
  ' ',
  'is',
  'the',
  'most',
  'aggravating',
  'kind',
  'of',
  'movie',
  ':',
  'the',
  'kind',
  'that',
  'shows',
  'so',
  'much',
  'potential',
  'then',
  'becomes',
  'unbelievably',
  'disappointing',
  '.it',
  "'s",
  'not',
  'just',
  'because',
  'this',
  'is',
  'a',
  'brian',
  'depalma',
  'film',
  ',',
  'and',
  'since',
  'he',
  "'s",
  'a',
  'great',
  'director',
  'and',
  'one',
  'who',
  "'s",
  'films',
  'are',
  'always',
  'greeted',
  'with',
  'at',
  'least',
  'some',
  'fanfare',
  '.and',
  'it',
  "'s",
  'not',
  'even',
  'because',
  'this',
  'was',
  'a',
  'film',
  'starring',
  'nicolas',
  'cage',
  'and',
  'since',
  'he',
  'gives',
  'a',
  'brauvara',
  'performance',
  ',',
  'this',
  'film',
  'is',
  'hardly',
  'worth',
  'his',
  'talents',
  '.it',
  "'s",
  'worse',
  'than',
  'that',
  '.it',
  "'s",
  'aggravating',
  'for',
  'the',
  'sole',
  'reason',
  'that',
  'its',
  

In [7]:
# 建立字典
textField.build_vocab(trainData)
labelField.build_vocab(trainData)

print(f"Vocabularies of index 0-5: {textField.vocab.itos[:10]} \n")
print(f"words to index {textField.vocab.stoi}")

Vocabularies of index 0-5: ['<unk>', '<pad>', 'film', 'the', 'movie', 'nt', 'one', 'make', 'like', 'character'] 





In [39]:
# 建立字典
### <your code> ###

print(f"Vocabularies of index 0-5: {text_field.vocab.itos[:10]} \n")
print(f"words to index {text_field.vocab.stoi}")

Vocabularies of index 0-5: ['<unk>', '<pad>', ',', 'the', 'a', 'and', 'of', 'to', 'is', 'in'] 



In [14]:
# create iterator for training and testing data
# train_iter, test_iter = data.Iterator(trainData, batch_size=2, sort_key=lambda ex: len(ex.text)), \
#                         data.Iterator(testData, batch_size=2, sort_key=lambda ex: len(ex.text))

train_iter, test_iter = data.Iterator.splits(datasets=(trainData, testData), batch_sizes=(3,2), sort_key=lambda ex:len(ex.text))

In [15]:
for train_batch in train_iter:
    print(train_batch.text, train_batch.text.shape)
    print(train_batch.label, train_batch.label.shape)
    break

tensor([[5.2500e+02, 6.7000e+01, 2.8000e+01],
        [3.1330e+03, 3.6500e+02, 3.9000e+01],
        [3.8300e+02, 1.2000e+01, 1.5500e+02],
        ...,
        [2.5420e+03, 1.0000e+00, 1.0000e+00],
        [2.4500e+02, 1.0000e+00, 1.0000e+00],
        [5.0000e+00, 1.0000e+00, 1.0000e+00]], dtype=torch.float64) torch.Size([489, 3])
tensor([1, 2, 2]) torch.Size([3])


In [16]:
for test_batch in test_iter:
    print(test_batch.text, test_batch.text.shape)
    print(test_batch.label, test_batch.label.shape)
    break

tensor([[0.0000e+00, 1.6040e+03],
        [6.0000e+00, 3.9790e+03],
        [1.1938e+04, 5.9150e+03],
        [2.0000e+00, 3.9130e+03],
        [1.3600e+02, 8.4770e+03],
        [1.1000e+01, 5.8900e+03],
        [1.2000e+01, 7.0000e+00],
        [0.0000e+00, 1.4980e+03],
        [0.0000e+00, 2.1080e+03],
        [4.1000e+01, 1.0040e+03],
        [5.0730e+03, 1.1000e+02],
        [8.0000e+01, 9.3210e+03],
        [4.2840e+03, 3.0600e+02],
        [7.2300e+02, 5.6800e+03],
        [9.1800e+02, 2.8080e+03],
        [6.6000e+01, 1.1560e+03],
        [2.5000e+01, 1.0060e+03],
        [1.5494e+04, 6.2000e+01],
        [1.5204e+04, 1.2100e+02],
        [2.7770e+03, 3.6800e+02],
        [9.1200e+02, 4.7000e+01],
        [4.2400e+02, 3.0000e+00],
        [3.6000e+01, 1.8000e+01],
        [8.0000e+01, 2.0960e+03],
        [2.3300e+02, 1.8270e+03],
        [2.0000e+01, 8.0700e+02],
        [2.7100e+02, 1.4000e+01],
        [9.0000e+01, 1.9000e+01],
        [3.4000e+01, 3.8000e+01],
        [1.610

In [45]:
# for train_batch in train_iter:
#     print(train_batch.text, train_batch.text.shape)
#     print(train_batch.label, train_batch.label.shape)
#     break

tensor([[6.8000e+01, 3.7260e+03, 1.4420e+03],
        [3.8000e+01, 1.6000e+01, 1.3000e+01],
        [4.0000e+00, 2.8000e+01, 3.4050e+03],
        ...,
        [1.0000e+00, 1.9500e+02, 1.0000e+00],
        [1.0000e+00, 7.2000e+01, 1.0000e+00],
        [1.0000e+00, 1.7000e+01, 1.0000e+00]], dtype=torch.float64) torch.Size([769, 3])
tensor([1, 2, 2]) torch.Size([3])
