## Install Dependencies and Download dataset

In [None]:
!python3 -m pip install transformers
!python3 -m pip install sentencepiece
!git clone https://github.com/YifanZhou1999/NLP_Project_datasets_fall22.git
!mkdir data
!mkdir models

## Imports

In [39]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
import keras
from tqdm import tqdm
import pickle
from keras.models import Model
import keras.backend as K
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
import itertools
from keras.models import load_model
from sklearn.utils import shuffle
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig


def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def clean_stopwords_shortwords(w):
    stopwords_list=stopwords.words('english')
    words = w.split() 
    clean_words = [word for word in words if (word not in stopwords_list) and len(word) > 2]
    return " ".join(clean_words) 

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w=clean_stopwords_shortwords(w)
    w=re.sub(r'@\w+', '',w)
    return w


## Read in the Data

In [52]:
data_file="/content/NLP_Project_datasets_fall22/dataset2.csv"
dataf = pd.read_csv(data_file).dropna()
print('File has {} rows and {} columns'.format(dataf.shape[0],dataf.shape[1]))
dataf.head()

File has 396523 rows and 2 columns


Unnamed: 0,type,post
0,INFJ,What has been the most life-changing experienc...
1,INFJ,May the PerC Experience immerse you.
2,INFJ,Hello ENFJ7. Sorry to hear of your distress. I...
3,INFJ,Welcome and stuff.
4,INFJ,"Prozac, wellbrutin, at least thirty minutes of..."


In [53]:
def labelconv(x):
    return 1 if x[0] == 'E' else 0

dataf['EI'] = dataf['type'].map(lambda x:labelconv(x))
print(len(dataf['post']), len(dataf['EI']))
dataf.head()

396523 396523


Unnamed: 0,type,post,EI
0,INFJ,What has been the most life-changing experienc...,0
1,INFJ,May the PerC Experience immerse you.,0
2,INFJ,Hello ENFJ7. Sorry to hear of your distress. I...,0
3,INFJ,Welcome and stuff.,0
4,INFJ,"Prozac, wellbrutin, at least thirty minutes of...",0


## Setting up pretrained BERT Model and Tokenizer

- Use distilbert-base-uncased
- numclass = 2

*   列表项
*   列表项



In [54]:
from transformers import *
from transformers import DistilBertTokenizer, TFDistilBertModel, BertConfig

num_classes = 2

bert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
bert_model_EI = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=num_classes)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_tok

## Tokenize

~ 10 minues to run

In [57]:
input_ids=[]
attention_masks=[]

sentences = dataf['post']
print("Progress:")
for id, sent in enumerate(sentences):
    print(id, end="\r")
    bert_inp=bert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =64,pad_to_max_length = True,return_attention_mask = True)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(dataf['EI'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Progress:
012345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273





In [58]:
len(input_ids),len(attention_masks),len(labels)

(396523, 396523, 396523)

## Saving and Loading Data into the pickle files

In [60]:
print('Preparing the pickle file.....')

pickle_inp_path='./data/bert_inp.pkl'
pickle_mask_path='./data/bert_mask.pkl'
pickle_label_path='./data/bert_label.pkl'

pickle.dump((input_ids),open(pickle_inp_path,'wb'))
pickle.dump((attention_masks),open(pickle_mask_path,'wb'))
pickle.dump((labels),open(pickle_label_path,'wb'))


print('Pickle files saved as ',pickle_inp_path,pickle_mask_path,pickle_label_path)


Preparing the pickle file.....
Pickle files saved as  ./data/bert_inp.pkl ./data/bert_mask.pkl ./data/bert_label.pkl


In [61]:
print('Loading the saved pickle files..')

input_ids=pickle.load(open(pickle_inp_path, 'rb'))
attention_masks=pickle.load(open(pickle_mask_path, 'rb'))
labels=pickle.load(open(pickle_label_path, 'rb'))

print('Input shape {} Attention mask shape {} Input label shape {}'.format(input_ids.shape,attention_masks.shape,labels.shape))

Loading the saved pickle files..
Input shape (396523, 64) Attention mask shape (396523, 64) Input label shape (396523,)


## Split Data

In [62]:
train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.2)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))

Train inp shape (317218, 64) Val input shape (79305, 64)
Train label shape (317218,) Val label shape (79305,)
Train attention mask shape (317218, 64) Val attention mask shape (79305, 64)


## Training Setup
- Loss
- Optimizer
- Other metrics

In [64]:
log_dir='tensorboard_data/tb_bert'
model_save_path='./models/bert_model.h5'

callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,monitor='val_loss',mode='min',save_best_only=True),keras.callbacks.TensorBoard(log_dir=log_dir)]

print('\nBert Model',bert_model_EI.summary())

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08)

bert_model_EI.compile(loss=loss,optimizer=optimizer,metrics=[metric])

Model: "tf_distil_bert_for_sequence_classification_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_251 (Dropout)       multiple                  0         
                                                                 
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
_________________________________________________________________

Bert Model None


## Training the Model

In [4]:
# Check Device
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
print("\n\n")
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1266015899841277200
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14415560704
locality {
  bus_id: 1
  links {
  }
}
incarnation: 10756538187025010526
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
xla_global_id: 416903419
]



Found GPU at: /device:GPU:0


In [65]:
history=bert_model_EI.fit([train_inp,train_mask],train_label,batch_size=32,epochs=50,validation_data=([val_inp,val_mask],val_label),callbacks=callbacks)

Epoch 1/50
  23/9914 [..............................] - ETA: 30:25:36 - loss: 0.5732 - accuracy: 0.7636

KeyboardInterrupt: ignored

## Evaluation