#  Data Preprocessing

In [2]:
#解壓縮 tar.gz 檔案

import tarfile
with tarfile.open("aclImdb_v1.tar.gz") as tf:
    tf.extractall()


In [1]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime, timedelta
import os
import re

import tensorflow as tf
from transformers import BertTokenizer, BertModel, TFBertForSequenceClassification

In [2]:
def create_df(name,site):
    path = '/home/zihjie/Test/IMDB_bert_test/aclImdb/'+name+'/'+site
    datalist = []
    for i in os.listdir(path):
        if os.path.splitext(i)[1] == '.txt':     #选取后缀为txt的文件加入datalist
            datalist.append(i)
    
    print('datalist successful')
    print('-'*50)
    
    
    temporarylist = []
    for txt in datalist:
        data_path = os.path.join(path,txt)
        
        with open(data_path, encoding='utf-8') as file:
            words = file.read().strip()
            score = re.findall(r"_(.+?).t",txt)
            temporarylist.append([words,score])
    
    print('temporarylist successful')
    print('-'*50)
    
    
    temporary_df = pd.DataFrame(temporarylist, columns=["text", "score"])
    print("Shape:", temporary_df.shape)
    
    
    
    print('df successful')
    print('-'*50)

    print(temporary_df.head(5))
    print('-'*50)

    
    
    return temporary_df
    
    
    

In [3]:
train_pos_df = create_df('train','pos')
train_neg_df = create_df('train','neg')
test_pos_df = create_df('test','pos')
test_neg_df = create_df('test','neg')

datalist successful
--------------------------------------------------
temporarylist successful
--------------------------------------------------
Shape: (12500, 2)
df successful
--------------------------------------------------
                                                text score
0  It's nice to see a film with real people with ...  [10]
1  I saw this film at the Santa Barbara Film Fest...   [9]
2  As anyone old enough knows, South Africa long ...  [10]
3  If the movies are to be believed, Chinese ghos...   [7]
4  This is a small film , few characters ,theatri...   [9]
--------------------------------------------------
datalist successful
--------------------------------------------------
temporarylist successful
--------------------------------------------------
Shape: (12500, 2)
df successful
--------------------------------------------------
                                                text score
0  Scary Movie 2 was a grave disappointment. Simp...   [2]
1  This movie was

In [4]:
train_pos_df['label'] = pd.Series( np.ones( len(train_pos_df)))
train_neg_df['label'] = pd.Series( np.zeros( len(train_neg_df)))
test_pos_df['label'] = pd.Series( np.ones( len(test_pos_df)))
test_neg_df['label'] = pd.Series( np.zeros( len(test_neg_df)))



In [5]:
#concate pos and neg
train_df = train_pos_df.append(train_neg_df)
test_df = test_pos_df.append(test_neg_df)

In [75]:
# 修改dataframe裡的值'[' 跟 ']'
train_df['score'] = train_df['score'].map(lambda x: str(x)[:-1]).map(lambda x: str(x)[1:])
train_df['text'] = train_df['text'].map(lambda x: str(x)[:-1]).map(lambda x: str(x)[1:])

test_df['score'] = test_df['score'].map(lambda x: str(x)[:-1]).map(lambda x: str(x)[1:])
test_df['text'] = test_df['text'].map(lambda x: str(x)[:-1]).map(lambda x: str(x)[1:])

In [8]:
train_df['score'] = train_df['score'].map(lambda x: str(x)[:-1]).map(lambda x: str(x)[1:])
test_df['score'] = test_df['score'].map(lambda x: str(x)[:-1]).map(lambda x: str(x)[1:])


In [9]:
print(train_df.head(5))

                                                text score  label
0  It's nice to see a film with real people with ...    10    1.0
1  I saw this film at the Santa Barbara Film Fest...     9    1.0
2  As anyone old enough knows, South Africa long ...    10    1.0
3  If the movies are to be believed, Chinese ghos...     7    1.0
4  This is a small film , few characters ,theatri...     9    1.0


In [10]:
#shuffle the train and test dataframe
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    25000 non-null  object 
 1   score   25000 non-null  object 
 2   label   25000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 586.1+ KB


In [12]:
train_df['score'] = train_df['score'].astype(int) 
test_df['score'] = test_df['score'].astype(int) 


In [13]:
train_df['text'] = train_df['text'].astype(str) 
test_df['text'] = test_df['text'].astype(str) 


In [14]:
train_df['label'] = train_df['label'].astype(int) 
test_df['label'] = test_df['label'].astype(int) 


In [15]:
train_df.head(5)

Unnamed: 0,text,score,label
0,"How, in the name of all that's holy, did this ...",1,0
1,I read reviews on this movie and decided to gi...,1,0
2,And that is the only reason I posses this DVD....,2,0
3,"**SPOILERS**KHAMOSH is totally unrealistic, la...",2,0
4,I saw the trailer for this film a few months p...,4,0


# BERT test  

In [16]:
# tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')

## word to token

In [55]:
# Tokenize input
text = "It is slow and even, sweet and moving. One of the best \
unless you like car chases, sex scenes, and violence."
tokens = tokenizer.tokenize(text)      # 每個字切詞成一個list
print(type(tokens))                 # list
np.array(tokens)                    # 轉成numpy

<class 'list'>


array(['It', 'is', 'slow', 'and', 'even', ',', 'sweet', 'and', 'moving',
       '.', 'One', 'of', 'the', 'best', 'unless', 'you', 'like', 'car',
       'chase', '##s', ',', 'sex', 'scenes', ',', 'and', 'violence', '.'],
      dtype='<U8')

## token to id

In [56]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)   # 每個字轉成id
print(type(input_ids))                         # list
print(len(input_ids))
np.array(input_ids)

<class 'list'>
27


array([1135, 1110, 3345, 1105, 1256,  117, 4105, 1105, 2232,  119, 1448,
       1104, 1103, 1436, 4895, 1128, 1176, 1610, 9839, 1116,  117, 2673,
       4429,  117, 1105, 4289,  119])

In [57]:
token_type_ids = tokenizer.create_token_type_ids_from_sequences(input_ids) 
# token_type_ids 必須input還沒加CLS SEP
print(type(token_type_ids))                                # list
print(len(token_type_ids)) 
np.array(token_type_ids)
# 多兩個token是CLS、SEP

<class 'list'>
29


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

## id add CLS and SEP

In [58]:
input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)    # 句子前後加上 CLS SEP 的 id
print(type(input_ids))
print(len(input_ids))
np.array(input_ids)
# 101是CLS，101是SEP

<class 'list'>
29


array([ 101, 1135, 1110, 3345, 1105, 1256,  117, 4105, 1105, 2232,  119,
       1448, 1104, 1103, 1436, 4895, 1128, 1176, 1610, 9839, 1116,  117,
       2673, 4429,  117, 1105, 4289,  119,  102])

## fix input_id's dimension

In [59]:
n = 512 - len(input_ids)
input_ids2 = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0))  
# array右邊append n 個 0  補長度到512
print(len(input_ids2))
input_ids2

512


array([ 101, 1135, 1110, 3345, 1105, 1256,  117, 4105, 1105, 2232,  119,
       1448, 1104, 1103, 1436, 4895, 1128, 1176, 1610, 9839, 1116,  117,
       2673, 4429,  117, 1105, 4289,  119,  102,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

# 把上述處理建立一個function，丟入一串文字試試

In [17]:
# 把維度固定在512維
def input_ids_all(text):
    if len(text)>510:
        text = text[0:510]        
    tokens = tokenizer.tokenize(text)                          # 每個字切詞成一個list
    input_ids = tokenizer.convert_tokens_to_ids(tokens)        # 每個字轉成id
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)    # 句子前後加上 CLS SEP 的 id
    input_ids = np.array(input_ids)                            # list 轉 numpy
    if len(input_ids) < 512:
        n = 512 - len(input_ids)
        input_ids = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0))    
        # array右邊append n 個 0  補長度到512
    return input_ids

In [18]:
# 把維度固定再64維
def input_ids_all_64(text):
    if len(text)>62:
        text = text[0:62]        
    tokens = tokenizer.tokenize(text)                          # 每個字切詞成一個list
    input_ids = tokenizer.convert_tokens_to_ids(tokens)        # 每個字轉成id
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)    # 句子前後加上 CLS SEP 的 id
    input_ids = np.array(input_ids)                            # list 轉 numpy
    if len(input_ids) < 64:
        n = 64 - len(input_ids)
        input_ids = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0))    
        # array右邊append n 個 0  補長度到512
    return input_ids

In [22]:
# 把維度固定再128維
def input_ids_all_128(text):
    if len(text)>126:
        text = text[0:126]        
    tokens = tokenizer.tokenize(text)                          # 每個字切詞成一個list
    input_ids = tokenizer.convert_tokens_to_ids(tokens)        # 每個字轉成id
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)    # 句子前後加上 CLS SEP 的 id
    input_ids = np.array(input_ids)                            # list 轉 numpy
    if len(input_ids) < 128:
        n = 128 - len(input_ids)
        input_ids = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0))    
        # array右邊append n 個 0  補長度到512
    return input_ids

In [41]:
# 把維度固定再128維(改)
def input_ids_all_128(text):
    tokens = tokenizer.tokenize(text)                          # 每個字切詞成一個list
    if len(tokens)>126:
        tokens = tokens[0:126] 
        
    input_ids = tokenizer.convert_tokens_to_ids(tokens)        # 每個字轉成id
   
    
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)    # 句子前後加上 CLS SEP 的 id
    input_ids = np.array(input_ids)                            # list 轉 numpy
    
    if len(input_ids) < 128:
        n = 128 - len(input_ids)
        input_ids = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0))    
        # array右邊append n 個 0  補長度到512
    return input_ids
    
    

## attention mask tensors

In [19]:
def attention_mask_all(text):
    tokens = tokenizer.tokenize(text)       # 每個字切詞成一個list
    input_ids = tokenizer.convert_tokens_to_ids(tokens)  # 每個字轉成id
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)    # 句子前後加上 CLS SEP 的 id
    input_ids = np.array(input_ids)          # list 轉 numpy
    attention_mask = np.array([1,1])
    attention_mask = np.pad(attention_mask, (0, len(input_ids)-2), mode ='constant', constant_values=(1)) 
    #array右邊append 1 到跟segment一樣長
    if len(attention_mask) < 512:
        n = 512 - len(attention_mask)
        attention_mask = np.pad(attention_mask, (0, n), mode ='constant', constant_values=(0))  
        # array右邊append n 個 0  補長度到512
    return attention_mask

In [70]:
attention_mask_all(text)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Let's do it

In [34]:
text = train_df['text'].to_numpy()
print(len(text))
text

25000


array(["How, in the name of all that's holy, did this film ever get distribution? It looks as if it has been shot on someone's mobile phone and takes the screaming girl victim scenario to whole new depths. They literally scream for the full 90 minutes of the movie. And that's all they do. There is no plot, no tension, no characters, and not a lot of acting. Just screaming and more screaming.<br /><br />I gave up after fifteen minutes and fast-wound through it to see if anything happened. It doesn't - except for screaming, of course. Odlly enough, the act of going through it on fast forward highlights another problem - there is no camera-work to speak of. Every shot looks like every other shot - middle distance, one angle, dull, dull, DULL.<br /><br />It's not so bad it's good. It's just plain bad.",
       'I read reviews on this movie and decided to give it a shot. I\'m an open minded guy after all and I’ve given good reviews to some pretty bad flicks. As the end credits rolled on thi

In [35]:
len(text[1])

1274

In [42]:
input_ids = [input_ids_all_128(i) for i in text]      # 必須要 [ ] 輸出是list
input_ids = np.array(input_ids)                    # 轉成numpy
input_ids

array([[ 101, 1731,  117, ..., 1104, 1736,  102],
       [ 101,  146, 2373, ...,  119,  146,  102],
       [ 101, 1262, 1115, ...,  170, 2963,  102],
       ...,
       [ 101, 1258, 3455, ..., 2270,  111,  102],
       [ 101, 1188, 1110, ...,    0,    0,    0],
       [ 101, 2421, 1143, ...,  131, 1335,  102]])

In [43]:
input_ids[1]

array([  101,   146,  2373,  3761,  1113,  1142,  2523,  1105,  1879,
        1106,  1660,  1122,   170,  2046,   119,   146,   112,   182,
        1126,  1501, 13767,  2564,  1170,  1155,  1105,   146,   787,
        1396,  1549,  1363,  3761,  1106,  1199,  2785,  2213, 22302,
        1116,   119,  1249,  1103,  1322,  6459,  3733,  1113,  1142,
        1141,   146,  8703,  1111,  2764,  1105,  1380,  3505,  1106,
        1474,   119,  3446,  2947,   131,   107,  1188,  1273,  1108,
        1143, 19878, 17126,  1193,  1603,   119,   107,  1337,   112,
         188,  1155,   146,  1400,   119,   133,  9304,   120,   135,
         133,  9304,   120,   135,  3956,   117,  3956,   119,  1109,
        3741,  1105,  5173,  1116,  1127,  1218,  1694,  1105,  1103,
        1390,  2375, 23559,  1106,  1103,  6601,  1104, 15345,  1297,
        1133,  1103,  1273,  1108,  8920,  1106,  2824,  1105,  1103,
       12401,  8556,  1261,  1283,  1121,  1103,  1363,  9182,   119,
         146,   102]

In [44]:
label = train_df['label'].to_numpy()
label

array([0, 0, 0, ..., 1, 1, 0])

In [45]:
from sklearn.model_selection import train_test_split
# Use train_test_split to split our data into train and validation sets for training

# # 設定 stratify = label 把每個類別平均
train_input_ids, validation_input_ids, train_label, validation_label = train_test_split(input_ids, label, 
                                                            random_state=2018, test_size=0.2, stratify=label )

In [46]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
# num_labels=5 分5類

model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
model.summary()


optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Model: "tf_bert_for_sequence_classification_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_113 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Train and evaluate using tf.keras.Model.fit()  # batch size 8就會error 可能是記憶體爆掉
model_fit = model.fit(train_input_ids, train_label, 
                      batch_size=4, epochs=4, 
                      validation_data=(validation_input_ids, validation_label)
#                    ,steps_per_epoch=115
#                    validation_steps=7)
                   )

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
