In [24]:
import numpy as np 
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/reddit-mbti-dataset/unique_author.csv
/kaggle/input/reddit-mbti-dataset/reddit_post.csv
/kaggle/input/mbti-type/mbti_1.csv


In [2]:
data = pd.read_csv("/kaggle/input/reddit-mbti-dataset/reddit_post.csv")

In [3]:
data=data.head(5000)
data.head()

Unnamed: 0,author,body,mbti
0,LadyBanterbury,lol thats why i left,INFP
1,Finarin,i was just about to post i try telling people ...,INTP
2,xanplease,my first thought was pepsi or something probab...,INFP
3,HeirToGallifrey,not if the formula is something like every tim...,ENTP
4,ElementalVoltage,well i wouldnt know but i think theres a lot o...,INTP


In [4]:
types = np.unique(data.mbti.values)
types

array(['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
       'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'],
      dtype=object)

In [5]:
def get_type_index(string):
    return list(types).index(string)

In [6]:
data['type_index'] = data['mbti'].apply(get_type_index)

In [7]:
data.body.values[0]

'lol thats why i left '

In [8]:
import string
import re

def clean_text(text):
    regex = re.compile('[%s]' % re.escape('|'))
    text = regex.sub(" ", text)
    words = str(text).split()
    words = [i.lower() + " " for i in words]
    words = [i for i in words if not "http" in i]
    words = " ".join(words)
    words = words.translate(words.maketrans('', '', string.punctuation))
    return words


In [9]:
data['cleaned_text'] = data['body'].apply(clean_text)

In [10]:
data.cleaned_text.values[0]

'lol  thats  why  i  left '

In [11]:
data.head()

Unnamed: 0,author,body,mbti,type_index,cleaned_text
0,LadyBanterbury,lol thats why i left,INFP,9,lol thats why i left
1,Finarin,i was just about to post i try telling people ...,INTP,11,i was just about to post i try telling...
2,xanplease,my first thought was pepsi or something probab...,INFP,9,my first thought was pepsi or something ...
3,HeirToGallifrey,not if the formula is something like every tim...,ENTP,3,not if the formula is something like ev...
4,ElementalVoltage,well i wouldnt know but i think theres a lot o...,INTP,11,well i wouldnt know but i think theres ...


In [12]:
data=data.drop('author', axis=1)

In [13]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data)
train, val = train_test_split(train)

In [16]:
import tensorflow as tf
one_hot_labels = tf.keras.utils.to_categorical(train.type_index.values, num_classes=16)
val_labels= tf.keras.utils.to_categorical(val.type_index.values, num_classes=16)

In [None]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')

In [18]:
maxlen = 256

train_input_ids = [tokenizer.encode(str(i), max_length = maxlen , pad_to_max_length = True) for i in train.cleaned_text.values]
val_input_ids = [tokenizer.encode(str(i), max_length = maxlen , pad_to_max_length = True) for i in val.cleaned_text.values]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [19]:
def create_model(): 
    input_word_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32,
                                           name="input_word_ids")
    bert_layer = transformers.TFBertModel.from_pretrained('bert-base-uncased')
    bert_outputs = bert_layer(input_word_ids)[0]
    pred = tf.keras.layers.Dense(16, activation='softmax')(bert_outputs[:,0,:])
    
    model = tf.keras.models.Model(inputs=input_word_ids, outputs=pred)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(
    learning_rate=0.00001), metrics=['accuracy'])
    return model

In [20]:
import tensorflow as tf
use_tpu = False
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        model = create_model()
else:
    model = create_model()
    
model.summary()

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 256)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  TFBaseModelOutputWithPool 109482240 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 16)                12304     
Total params: 109,494,544
Trainable params: 109,494,544
Non-trainable params: 0
_________________________________________________________________


In [None]:
batch_size = 6

model.fit(np.array(train_input_ids), one_hot_labels,validation_data = (np.array(val_input_ids), val_labels),
          verbose = 1, epochs = 10, batch_size = batch_size,  callbacks = [tf.keras.callbacks.EarlyStopping(patience = 8)])

In [25]:
model.save("bertcls")

In [26]:
!zip -r model2.zip bertcls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: bertcls/ (stored 0%)
  adding: bertcls/assets/ (stored 0%)
  adding: bertcls/variables/ (stored 0%)
  adding: bertcls/variables/variables.data-00000-of-00001 (deflated 16%)
  adding: bertcls/variables/variables.index (deflated 81%)
  adding: bertcls/keras_metadata.pb (deflated 95%)
  adding: bertcls/saved_model.pb (deflated 92%)
