In [68]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer
import re

In [69]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [70]:
ls

mbti_1.csv  [0m[01;34mMBTI_model[0m/  [01;34mMBTI_model_2[0m/  [01;34mMBTI_model_3[0m/  MBTI_model.ipynb  test.ipynb


In [None]:
path = "gdrive/MyDrive/Colab Notebooks/test/"

In [None]:
df = pd.read_csv(path + "mbti_1.csv")
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [None]:
types = list(set(df['type']))
types

['ENFJ',
 'INTJ',
 'ENTP',
 'ESFP',
 'ESFJ',
 'ISTP',
 'INFJ',
 'ISFJ',
 'ENFP',
 'ISTJ',
 'ESTJ',
 'INTP',
 'ENTJ',
 'ISFP',
 'INFP',
 'ESTP']

In [None]:
def label_mbti(mbti):
    return types.index(mbti)

In [None]:
df['label'] = df['type'].apply(label_mbti)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
 2   label   8675 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 203.4+ KB


In [None]:
df['type'].value_counts()

INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

In [None]:
def process(df, remove_special=True):
    # Change to lowercase
    df['posts'] = df['posts'].apply(lambda x: x.lower())

    #Change case => lowercase
    df["posts"] = df["posts"].apply(lambda x: x.lower())

    #Remove acronyms of personality types within text, for accrate prediction with unknown data
    if remove_special:
        pers_types = ['INFP' ,'INFJ', 'INTP', 'INTJ', 'ENTP', 'ENFP', 'ISTP' ,'ISFP' ,'ENTJ', 'ISTJ','ENFJ', 'ISFJ' ,'ESTP', 'ESFP' ,'ESFJ' ,'ESTJ']
        pers_types = [p.lower() for p in pers_types]
        p = re.compile("(" + "|".join(pers_types) + ")")

    #Substitute hyperlinks with space
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'https?:\/\/.*?[\s+]', '', x.replace("|"," ") + " "))
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'https', '', x.replace("|"," ") + " "))

    # Substitute punctuations except EOS characters
        #Substitute all punctuation except EOS characters
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'\.', ' EOSTokenDot ', x + " "))
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'\?', ' EOSTokenQuest ', x + " "))
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'!', ' EOSTokenExs ', x + " "))
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'[\.+]', ".",x))  #remove punctuation
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'[^\w\s]','',x))  #avoid multiple full stops

    #Remove Numeric + Spl chars
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'[^a-zA-Z\s]','',x))

    #Remove multiple letters
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'([a-z])\1{2,}[\s|\w]*','',x))

    #Keep words within acceptable range (min letter 3, max 30)
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'(\b\w{0,3})?\b','',x))
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'(\b\w{30,1000})?\b','',x))
    return df


In [None]:
process(df)

In [None]:
df.head()

Unnamed: 0,type,posts,label
0,INFJ,enfp intj moments sportscenter plays...,6
1,ENTP,finding lack these posts very alarming EO...,2
2,INTP,good course which know thats bles...,11
3,INTJ,dear intp enjoyed conversation other EOS...,1
4,ENTJ,youre fired EOSTokenDot thats another silly...,12


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
df['posts'].iloc[0]

'    enfp  intj moments   sportscenter    plays   pranks   what  been  most lifechanging experience  your life EOSTokenQuest        repeat  most  today EOSTokenDot      perc experience immerse  EOSTokenDot     last thing  infj friend posted   facebook before committing suicide  next  EOSTokenDot  rest  peace     hello enfj EOSTokenDot  sorry  hear  your distress EOSTokenDot   only natural   relationship    perfection   time  every moment  existence EOSTokenDot    figure  hard times  times  growth  EOSTokenDot  EOSTokenDot  EOSTokenDot          EOSTokenDot  EOSTokenDot  EOSTokenDot    welcome  stuff EOSTokenDot     game EOSTokenDot   EOSTokenDot  match EOSTokenDot    prozac wellbrutin  least thirty minutes  moving your legs   dont mean moving them while sitting  your same desk chair weed  moderation maybe  edibles   healthier alternative EOSTokenDot  EOSTokenDot  EOSTokenDot    basically come  with three items youve determined that each type  whichever types  want   would more than like

In [None]:
token = tokenizer.encode_plus(
    df['posts'].iloc[0],
    max_length=256,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors='tf'
)

In [None]:
X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

In [None]:
X_input_ids.shape

(8675, 256)

In [None]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['posts'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [None]:
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [None]:
X_input_ids

array([[  101.,  4035.,  2087., ...,  9025.,  1942.,   102.],
       [  101.,  4006.,  2960., ...,  1942., 27443.,   102.],
       [  101.,  1363.,  1736., ...,  9025.,  1942.,   102.],
       ...,
       [  101.,  1242.,  3243., ...,  3329.,  1567.,   102.],
       [  101.,  1304.,  4139., ...,  5425.,  1164.,   102.],
       [  101.,  1151.,  1263., ...,  2153.,   142.,   102.]])

In [None]:
labels = np.zeros((len(df), 16))

In [None]:
labels[np.arange(len(df)), df['label'].values] = 1

In [None]:
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [None]:
dataset.take(1)

<_TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(16,), dtype=tf.float64, name=None))>

In [None]:
def MBTIDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [None]:
dataset = dataset.map(MBTIDatasetMapFunction)

In [None]:
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(16,), dtype=tf.float64, name=None))>

In [None]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True)

In [None]:
p = 0.8
train_size = int((len(df)//16)*p)

In [None]:
train_size

433

In [None]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [None]:
from transformers import TFBertModel

In [None]:
bert_model = TFBertModel.from_pretrained('bert-base-cased')

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attention_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = bert_model.bert(input_ids, attention_mask=attention_masks)[1]
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(16, activation='softmax', name='output_layer')(intermediate_layer)

model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output_layer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 256)]                0         []                            
 )                                                                                                
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1083102   ['input_ids[0][0]',           
                             ngAndCrossAttentions(last_   72         'attention_mask[0][0]']      
                             hidden_state=(None, 256, 7                                       

In [None]:
optim = tf.keras.optimizers.legacy.Adam(learning_rate=5e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [None]:
model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [None]:
hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
model.save('MBTI_model_2')

In [None]:
hist2 = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [None]:
model.save('MBTI_model_2')

In [None]:
hist3 = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [None]:
model.save('MBTI_model_2')

In [None]:
# cd 'Colab Notebooks'/test

/content/gdrive/MyDrive/Colab Notebooks/test


In [None]:
hist4 = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [None]:
model.save('MBTI_model_2')

In [None]:
hist5 = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [None]:
model.save('MBTI_model_4')

In [None]:
hist6 = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [71]:
model.save('MBTI_model_5')

In [72]:
hist7 = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [73]:
model.save('MBTI_model_6')

In [74]:
hist8 = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [75]:
loaded_model = tf.keras.models.load_model('MBTI_model_6')

In [98]:
input_text = "Dear Frank DiMino, I really can’t get across how thankful I am for your donation, which helped allow me to attend this study abroad course. This whole experience has been beneficial to me as I learned a lot of new information that will help me later in my college career. I was able to learn how to look out for research opportunities and really put myself out there. As I am transferring schools in the upcoming fall, I wanted to find out how I could take advantage of being in a university where there are several research opportunities I could take part in. The answers to the questions I had were difficult to find online, so being able to take part in this course was really helpful as it answered many of my questions. Most importantly, this course gave me so many new memories and experiences with the ocean and the life that resides in it. As a marine biology major, I was already interested in and passionate about ocean life prior to this course. But after the whole experience, it really solidified just how much I want to pursue a career in marine biology and how much I really appreciate ocean life. Again, I can’t thank you enough for your donation which really did help me out with covering the fees for this course.  With Gratitude, Evander Limqueco"

In [99]:
input_text

'Dear Frank DiMino, I really can’t get across how thankful I am for your donation, which helped allow me to attend this study abroad course. This whole experience has been beneficial to me as I learned a lot of new information that will help me later in my college career. I was able to learn how to look out for research opportunities and really put myself out there. As I am transferring schools in the upcoming fall, I wanted to find out how I could take advantage of being in a university where there are several research opportunities I could take part in. The answers to the questions I had were difficult to find online, so being able to take part in this course was really helpful as it answered many of my questions. Most importantly, this course gave me so many new memories and experiences with the ocean and the life that resides in it. As a marine biology major, I was already interested in and passionate about ocean life prior to this course. But after the whole experience, it really 

In [91]:
def process_input(input_text, tokenizer, remove_special=True):
  # Change to lowercase
  input_text = input_text.lower()

  #Remove acronyms of personality types within text, for accrate prediction with unknown data
  if remove_special:
      pers_types = ['INFP' ,'INFJ', 'INTP', 'INTJ', 'ENTP', 'ENFP', 'ISTP' ,'ISFP' ,'ENTJ', 'ISTJ','ENFJ', 'ISFJ' ,'ESTP', 'ESFP' ,'ESFJ' ,'ESTJ']
      pers_types = [p.lower() for p in pers_types]
      p = re.compile("(" + "|".join(pers_types) + ")")

  #Substitute hyperlinks with space
  re.sub(r'https?:\/\/.*?[\s+]', '', input_text.replace("|"," ") + " ")
  re.sub(r'https', '', input_text.replace("|"," ") + " ")

  # Substitute punctuations except EOS characters
      #Substitute all punctuation except EOS characters
  re.sub(r'\.', ' EOSTokenDot ', input_text + " ")
  re.sub(r'\?', ' EOSTokenQuest ', input_text + " ")
  re.sub(r'!', ' EOSTokenExs ', input_text + " ")
  re.sub(r'[\.+]', ".",input_text)  #remove punctuation
  re.sub(r'[^\w\s]','',input_text)  #avoid multiple full stops

  #Remove Numeric + Spl chars
  re.sub(r'[^a-zA-Z\s]','',input_text)

  #Remove multiple letters
  re.sub(r'([a-z])\1{2,}[\s|\w]*','',input_text)

  #Keep words within acceptable range (min letter 3, max 30)
  re.sub(r'(\b\w{0,3})?\b','',input_text)
  re.sub(r'(\b\w{30,1000})?\b','',input_text)
  token = tokenizer.encode_plus(
      input_text,
      max_length=256,
      truncation=True,
      padding='max_length',
      add_special_tokens=True,
      return_tensors='tf'
  )
  return {
      'input_ids': tf.cast(token.input_ids, tf.float64),
      'attention_mask': tf.cast(token.attention_mask, tf.float64)
  }


In [100]:
tokenized_input_text = process_input(input_text, tokenizer)

In [101]:
tokenized_input_text

{'input_ids': <tf.Tensor: shape=(1, 256), dtype=float64, numpy=
 array([[  101.,  7059.,   175., 14687., 12563.,  4559.,   117.,   178.,
          1541.,  1169.,   787.,   189.,  1243.,  1506.,  1293., 21602.,
           178.,  1821.,  1111.,  1240., 14324.,   117.,  1134.,  2375.,
          2621.,  1143.,  1106.,  4739.,  1142.,  2025.,  6629.,  1736.,
           119.,  1142.,  2006.,  2541.,  1144.,  1151., 16250.,  1106.,
          1143.,  1112.,   178.,  3560.,   170.,  1974.,  1104.,  1207.,
          1869.,  1115.,  1209.,  1494.,  1143.,  1224.,  1107.,  1139.,
          2134.,  1578.,   119.,   178.,  1108.,  1682.,  1106.,  3858.,
          1293.,  1106.,  1440.,  1149.,  1111.,  1844.,  6305.,  1105.,
          1541.,  1508.,  1991.,  1149.,  1175.,   119.,  1112.,   178.,
          1821., 15273.,  2126.,  1107.,  1103.,  8851.,  2303.,   117.,
           178.,  1458.,  1106.,  1525.,  1149.,  1293.,   178.,  1180.,
          1321.,  4316.,  1104.,  1217.,  1107.,   170.,  27

In [102]:
probs = loaded_model.predict(tokenized_input_text)



In [103]:
probs

array([[1.8365788e-05, 1.6239330e-05, 1.3243568e-05, 1.7291804e-05,
        8.8223105e-06, 3.8269637e-04, 1.5822264e-03, 1.8707877e-04,
        1.5993822e-02, 9.4632069e-06, 1.6204693e-05, 4.1708894e-04,
        9.7884527e-05, 5.5234972e-03, 9.7570920e-01, 6.7838364e-06]],
      dtype=float32)

In [104]:
pred_index = np.argmax(probs[0])

In [105]:
pred_mbti = types[np.argmax(probs[0])]
pred_mbti

'INFP'