## Install Dependencies and Download dataset

In [1]:
#!python3 -m pip install transformers
#!python3 -m pip install sentencepiece
#!git clone https://github.com/YifanZhou1999/NLP_Project_datasets_fall22.git
!mkdir data
!mkdir models

## Imports

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
import keras
from tqdm import tqdm
import pickle
from keras.models import Model
import keras.backend as K
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
import itertools
from keras.models import load_model
from sklearn.utils import shuffle
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig


def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def clean_stopwords_shortwords(w):
    stopwords_list=stopwords.words('english')
    words = w.split() 
    clean_words = [word for word in words if (word not in stopwords_list) and len(word) > 2]
    return " ".join(clean_words) 

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w=clean_stopwords_shortwords(w)
    w=re.sub(r'@\w+', '',w)
    return w


## Read in the Data

In [3]:
#data_file="/content/NLP_Project_datasets_fall22/dataset2.csv"
#data_file="./dataset2.csv"
data_file="balanced_EI_dataset.csv"
dataf = pd.read_csv(data_file).dropna()
print('File has {} rows and {} columns'.format(dataf.shape[0],dataf.shape[1]))
dataf.head()

File has 185310 rows and 2 columns


Unnamed: 0,type,post
0,INFJ,What has been the most life-changing experienc...
1,INFJ,May the PerC Experience immerse you.
2,INFJ,Hello ENFJ7. Sorry to hear of your distress. I...
3,INFJ,Welcome and stuff.
4,INFJ,"Prozac, wellbrutin, at least thirty minutes of..."


In [4]:
def labelconv(x):
    return 1 if x[0] == 'E' else 0

dataf['EI'] = dataf['type'].map(lambda x:labelconv(x))
print(len(dataf['post']), len(dataf['EI']))
dataf.head()

185310 185310


Unnamed: 0,type,post,EI
0,INFJ,What has been the most life-changing experienc...,0
1,INFJ,May the PerC Experience immerse you.,0
2,INFJ,Hello ENFJ7. Sorry to hear of your distress. I...,0
3,INFJ,Welcome and stuff.,0
4,INFJ,"Prozac, wellbrutin, at least thirty minutes of...",0


## Setting up pretrained BERT Model and Tokenizer

- Use distilbert-base-uncased
- numclass = 2

*   列表项
*   列表项



In [5]:
from transformers import *
from transformers import DistilBertTokenizer, TFDistilBertModel, BertConfig

num_classes = 2

bert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
bert_model_EI = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=num_classes)

loading file vocab.txt from cache at /Users/zhouyifan/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/zhouyifan/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer_config.json
loading configuration file config.json from cache at /Users/zhouyifan/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads"

Metal device set to: Apple M1 Max


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

## Tokenize

~ 10 minues to run

In [6]:
input_ids=[]
attention_masks=[]

sentences = dataf['post']
print("Progress:")
for id, sent in enumerate(sentences):
    print(id, end="\r")
    bert_inp=bert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =64,pad_to_max_length = True,return_attention_mask = True)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(dataf['EI'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Progress:
0123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274



185309

In [7]:
len(input_ids),len(attention_masks),len(labels)

(185310, 185310, 185310)

## Saving and Loading Data into the pickle files

In [8]:
print('Preparing the pickle file.....')

pickle_inp_path='./data/bert_inp.pkl'
pickle_mask_path='./data/bert_mask.pkl'
pickle_label_path='./data/bert_label.pkl'

pickle.dump((input_ids),open(pickle_inp_path,'wb'))
pickle.dump((attention_masks),open(pickle_mask_path,'wb'))
pickle.dump((labels),open(pickle_label_path,'wb'))


print('Pickle files saved as ',pickle_inp_path,pickle_mask_path,pickle_label_path)


Preparing the pickle file.....
Pickle files saved as  ./data/bert_inp.pkl ./data/bert_mask.pkl ./data/bert_label.pkl


In [9]:
print('Loading the saved pickle files..')

input_ids=pickle.load(open(pickle_inp_path, 'rb'))
attention_masks=pickle.load(open(pickle_mask_path, 'rb'))
labels=pickle.load(open(pickle_label_path, 'rb'))

print('Input shape {} Attention mask shape {} Input label shape {}'.format(input_ids.shape,attention_masks.shape,labels.shape))

Loading the saved pickle files..
Input shape (185310, 64) Attention mask shape (185310, 64) Input label shape (185310,)


## Split Data

In [10]:
train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.2)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))

Train inp shape (148248, 64) Val input shape (37062, 64)
Train label shape (148248,) Val label shape (37062,)
Train attention mask shape (148248, 64) Val attention mask shape (37062, 64)


## Training Setup
- Loss
- Optimizer
- Other metrics

In [13]:
log_dir='tensorboard_data/tb_bert'
model_save_path='./models/bert_model.h5'

callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,monitor='val_loss',mode='min',save_best_only=True),keras.callbacks.TensorBoard(log_dir=log_dir)]

# Freeze pretrained layers
for _layer in bert_model_EI.layers:
    if _layer.name == 'distilbert' or _layer.name == 'pre_classifier':
        print(f"Freezing model layer {_layer.name}")
        _layer.trainable = False
    print(_layer.name)
    print(_layer.trainable)
        
print('\nBert Model',bert_model_EI.summary())

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08)

bert_model_EI.compile(loss=loss,optimizer=optimizer,metrics=[metric])

Freezing model layer distilbert
distilbert
False
Freezing model layer pre_classifier
pre_classifier
False
Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,955,010
Trainable params: 1,538
Non-trainable params: 66,953,472
__________________________________

## Training the Model

In [14]:
# Check Device
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
print("\n\n")
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8564779918735524437
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
locality {
  bus_id: 1
}
incarnation: 17493216351942409757
physical_device_desc: "device: 0, name: METAL, pci bus id: <undefined>"
xla_global_id: -1
]



Found GPU at: /device:GPU:0


2022-11-30 21:29:20.289131: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-30 21:29:20.289244: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-11-30 21:29:20.296012: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-30 21:29:20.296069: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [15]:
history=bert_model_EI.fit([train_inp,train_mask],train_label,batch_size=32,epochs=50,validation_data=([val_inp,val_mask],val_label),callbacks=callbacks)

Epoch 1/50


2022-11-30 21:29:23.559630: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-11-30 21:29:27.318599: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-11-30 21:35:21.306383: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## Evaluation

In [19]:
model_save_path='./models/bert_model.h5'

trained_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2)
trained_model.compile(loss=loss,optimizer=optimizer, metrics=[metric])
trained_model.load_weights(model_save_path)

preds = trained_model.predict([val_inp,val_mask],batch_size=32)
#pred_labels = preds.argmax(axis=1)
pred_labels = np.argmax(preds.logits, axis=1)
f1 = f1_score(val_label,pred_labels)
print('F1 score',f1)
print('Classification Report')
print(classification_report(val_label,pred_labels,target_names=None))

print('Training and saving built model.....')   

loading configuration file config.json from cache at /Users/zhouyifan/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file tf_model.h5 from cache at /Users/zhouyifan/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tf_model.h5
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceC

F1 score 0.5718375569928957
Classification Report
              precision    recall  f1-score   support

           0       0.56      0.55      0.56     18476
           1       0.56      0.58      0.57     18586

    accuracy                           0.56     37062
   macro avg       0.56      0.56      0.56     37062
weighted avg       0.56      0.56      0.56     37062

Training and saving built model.....
