<a href="https://colab.research.google.com/github/ampham03/673_proj/blob/anna's-branch/673_proj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install transformers
!pip install datasets



In [17]:
from transformers import BertTokenizer, BertModel

In [18]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained('bert-base-uncased')

In [23]:
import re
import nltk
from nltk.corpus import stopwords

In [60]:
import torch
import numpy as np

In [20]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
!pip install -q kaggle

In [None]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"annampham","key":"0864a7af205155d232ad172465964e24"}'}

In [None]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content"

In [None]:
!kaggle datasets download -d datasnaek/mbti-type

Dataset URL: https://www.kaggle.com/datasets/datasnaek/mbti-type
License(s): CC0-1.0
Downloading mbti-type.zip to /content
 70% 17.0M/24.4M [00:00<00:00, 36.4MB/s]
100% 24.4M/24.4M [00:00<00:00, 47.2MB/s]


In [None]:
import zipfile

with zipfile.ZipFile('mbti-type.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/mbti')

In [22]:
import pandas as pd

# load the MBTI dataset
df = pd.read_csv('/content/mbti/mbti_1.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [26]:
def clean_text(text):
  text = re.sub(r'http\S+|www\S+', '', text) # remove URLs
  text = " ".join([word for word in text.split() if word.lower() not in stop_words]) # remove stopwords
  return text

In [27]:
df['cleaned_posts'] = df['posts'].apply(clean_text)

In [69]:
df

Unnamed: 0,type,posts,cleaned_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,' intj moments sportscenter top ten plays pran...
1,ENTP,'I'm finding the lack of me in these posts ver...,'I'm finding lack posts alarming.|||Sex boring...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'Good one _____ course, say know; that's bless..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, enjoyed conversation day. Esoteric..."
4,ENTJ,'You're fired.|||That's another silly misconce...,'You're fired.|||That's another silly misconce...
...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,' always think cats Fi doms reason. websites b...
8671,ENFP,'So...if this thread already exists someplace ...,'So...if thread already exists someplace else ...
8672,INTP,'So many questions when i do these things. I ...,'So many questions things. would take purple p...
8673,INFP,'I am very conflicted right now when it comes ...,'I conflicted right comes wanting children. ho...


In [29]:
def tokenize(text):
  return(tokenizer(text, return_tensors="pt"))

In [65]:
# to ensure that when tokenized, the text does not go over BERT's max token limit
def split_text(text, max_len=512):
  tokens = tokenizer.encode(text, truncation=False, add_special_tokens=True)
  chunks = [tokens[i:i+max_len] for i in range(0, len(tokens), max_len)]
  chunk_strings = [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]
  return chunk_strings

In [57]:
def get_embeddings(posts):
  # expects posts as a list

  embeddings = []

  # Batch the posts into chunks
  for post in posts:
    chunks = split_text(post)  # Split the text into chunks

    # Tokenize all chunks in a batch
    encoded = tokenizer(chunks, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Get embeddings for all chunks at once (without gradients)
    with torch.no_grad():
      output = model(**encoded)

    # Extract the token embeddings for all chunks (batch_size, seq_len, hidden_size)
    token_embeddings = output.last_hidden_state

    # Calculate the average of the token embeddings for each chunk (batch_size, hidden_size)
    chunk_embeddings = torch.mean(token_embeddings, dim=1)

    # Combine all chunk embeddings for the post (average of the embeddings of all chunks)
    post_embedding = torch.mean(chunk_embeddings, dim=0)  # Average over all chunks
    embeddings.append(post_embedding.numpy())

  # Return all embeddings as a numpy array
  return np.array(embeddings)

In [75]:
print(get_embeddings([df['cleaned_posts'][0]]))

[[ 3.09044644e-02 -1.22304559e-01  6.90881729e-01 -1.39664128e-01
   3.57807338e-01 -9.62417126e-02  4.31843370e-01  3.30416173e-01
  -9.60962474e-03 -3.22354406e-01 -1.52873129e-01 -2.25250006e-01
  -1.55946106e-01  2.17720747e-01 -8.60665739e-03  2.87485898e-01
   2.31548086e-01 -1.37965139e-02 -7.76906125e-03  2.03309834e-01
   4.33109045e-01 -2.42638849e-02  2.37622768e-01  5.21417707e-02
   4.59699035e-01 -2.29564309e-03 -1.36724427e-01 -9.79470015e-02
  -3.81351471e-01  1.05294839e-01  4.14125979e-01  5.67369685e-02
   8.38957727e-02  2.10612789e-02  5.92089817e-02 -2.07018822e-01
  -8.85836333e-02  9.97729078e-02  7.36672878e-02  7.29801357e-02
  -2.01433897e-01 -3.18764150e-01  1.32722393e-01 -3.37541290e-02
  -2.93754935e-01 -2.94447064e-01 -2.20362291e-01 -4.68240380e-02
  -9.23373029e-02 -4.08713333e-02 -1.84312493e-01  2.41922826e-01
  -1.06321298e-01 -1.60781190e-01 -1.08961523e-01  3.36578637e-01
   2.78887540e-01 -6.22289181e-01 -2.95351207e-01 -2.95598149e-01
   6.38481

In [76]:
embeddings_list = []

batch_size = 20
for i in range(0, len(df['cleaned_posts']), batch_size):
  batch_posts = df['cleaned_posts'][i:i + batch_size].tolist()

  batch_number = (i // batch_size) + 1
  print(f"Processing batch {batch_number} of {len(df['cleaned_posts']) // batch_size + 1}")

  batch_embeddings = get_embeddings(batch_posts)

  embeddings_list.append(batch_embeddings)

combined_embeddings = np.concatenate(embeddings_list, axis=0)

df['embeddings'] = combined_embeddings

Processing batch 1 of 434


KeyboardInterrupt: 