# Classify MBTI personality type from text using Bag of Words

Hyunsoo Kim

2023-05-07

In [26]:
import random
import re

import nltk
import numpy as np
import pandas as pd
from emoji import demojize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

In [2]:
nltk.download("stopwords")
nltk.download("punkt")

stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DIAL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DIAL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
seed = 42

In [4]:
random.seed(seed)
np.random.seed(seed)

## Load dataset

Here, I use two datasets:
1. [(MBTI) Myers-Briggs Personality Type Dataset](https://www.kaggle.com/datasets/datasnaek/mbti-type)
2. [MBTI Personality Type Twitter Dataset](https://www.kaggle.com/datasets/mazlumi/mbti-personality-type-twitter-dataset)

I will refer to each dataset as `dataset1` and `dataset2`, respectively.

In [5]:
dataset1 = pd.read_csv("mbti_1.csv")
dataset2 = pd.read_csv("twitter_MBTI.csv")
dataset2.drop(columns="Unnamed: 0", inplace=True)

In [6]:
dataset1.head(3)

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...


In [7]:
dataset2.head(3)

Unnamed: 0,text,label
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj
1,@Hispanthicckk Being you makes you look cute||...,intj
2,@Alshymi Les balles sont réelles et sont tirée...,intj


Two datasets have different columns. I will rename the columns of `dataset2` to match those of `dataset1`.

Also, I capitalize the MBTI types in `dataset2` to match those of `dataset1`.

In [8]:
dataset2.rename(columns={"text": "posts", "label": "type"}, inplace=True)
dataset2["type"] = dataset2["type"].apply(str.upper)

dataset2.head(3)

Unnamed: 0,posts,type
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,INTJ
1,@Hispanthicckk Being you makes you look cute||...,INTJ
2,@Alshymi Les balles sont réelles et sont tirée...,INTJ


Now, I concatenate the two datasets.

In [9]:
dataset = pd.concat([dataset1, dataset2], ignore_index=True)
dataset.describe()

Unnamed: 0,type,posts
count,16486,16486
unique,16,16256
top,INFP,"@AzurLane_EN No Shinano rerun yet, huh?|||@ASM..."
freq,3114,2


### Dataset Preprocessing

I will preprocess the dataset as follows:
1. Remove URLs
2. Remove mentions (@)
3. Replace emojis with text
4. Remove non-alphabetic characters (including digits)
5. Remove duplicate posts

In [10]:
def replace_contractions(text):
	text = text.replace(r"n\"t", " not")
	text = text.replace(r"\"m", " am")
	text = text.replace(r"\"re", " are")
	text = text.replace(r"\"ve", " have")
	text = text.replace(r"\"ll", " will")
	text = text.replace(r"\"d", " would")

	return text

In [11]:
def preprocess(text):
	text = text.lower()

	# Remove emails
	text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "", text).strip()

	# Remove URLs
	text = re.sub(
		r"(https?://)?(www\.)?[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]"
		r"{0,61}[a-zA-Z0-9])?)+(?:/.*)?",
		"",
		text,
	).strip()

	# Remove mentions
	text = re.sub(r"@[a-zA-Z0-9_]+", "", text).strip()

	# Replace emojis with text
	text = demojize(text)

	# Remove non-alphabetic characters
	text = re.sub(r"[^\w\s]", "", text).strip()
	text = re.sub(r"\d+", "", text).strip()

	# Replace repetitive whitespace characters with a single space
	text = re.sub(r"\s+", " ", text).strip()

	text = replace_contractions(text)

	tokens = word_tokenize(text)
	tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
	text = " ".join(tokens)

	return text

Posts are separated by `|||`. I will split them into lists, and then expand them into rows.

In [12]:
dataset["posts"] = dataset["posts"].apply(lambda x: x.split("|||"))
dataset = dataset.explode("posts").reset_index(drop=True)
dataset.describe()

Unnamed: 0,type,posts
count,1516044,1516044.0
unique,16,1440590.0
top,INFP,
freq,263265,8899.0


Now I apply the preprocessing function to the posts. I also prune the posts that are too short.

In [13]:
tqdm.pandas(desc="Preprocessing")
dataset["posts"] = dataset["posts"].apply(replace_contractions)
dataset["posts"] = dataset["posts"].progress_apply(preprocess)

Preprocessing: 100%|██████████| 1516044/1516044 [04:48<00:00, 5253.88it/s]


In [14]:
dataset = dataset[dataset["posts"].str.len() > 16]
dataset.describe()

Unnamed: 0,type,posts
count,1115050,1115050
unique,16,1048894
top,INFP,loudly_crying_fac
freq,195833,626


There seems to be some duplicate posts. I will remove them.

In [15]:
dataset.drop_duplicates(subset="posts", inplace=True)
dataset.describe()

Unnamed: 0,type,posts
count,1048894,1048894
unique,16,1048894
top,INFP,lifechang experi life
freq,184454,1


In [16]:
dataset["posts"].apply(len).describe()

count    1.048894e+06
mean     6.165353e+01
std      3.717045e+01
min      1.700000e+01
25%      3.300000e+01
50%      5.500000e+01
75%      8.800000e+01
max      2.665000e+03
Name: posts, dtype: float64

Create one-hot encoded vectors for the MBTI types.

In [17]:
dataset["E_I"] = dataset["type"].apply(lambda x: 0 if x[0] == "E" else 1)
dataset["S_N"] = dataset["type"].apply(lambda x: 0 if x[1] == "S" else 1)
dataset["T_F"] = dataset["type"].apply(lambda x: 0 if x[2] == "T" else 1)
dataset["J_P"] = dataset["type"].apply(lambda x: 0 if x[3] == "J" else 1)
dataset.head(3)

Unnamed: 0,type,posts,E_I,S_N,T_F,J_P
3,INFJ,lifechang experi life,1,1,1,0
5,INFJ,may perc experi immers,1,1,1,0
6,INFJ,last thing infj friend post facebook commit su...,1,1,1,0


In [18]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(dataset["posts"], dataset[
	["E_I", "S_N", "T_F", "J_P"]
], test_size=0.2, random_state=42)

In [35]:
y_test.iloc[:, -4:].describe()

Unnamed: 0,E_I,S_N,T_F,J_P
count,209779.0,209779.0,209779.0,209779.0
mean,0.700189,0.807407,0.570491,0.566596
std,0.458176,0.394337,0.495007,0.495546
min,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,0.0
50%,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0


E: ~30% I: ~70%
S: ~20% N: ~80%
T: ~43% F: ~57%
J: ~43.5% P: ~56.5%

## Bag of Words

I will use Bag of Words to represent the posts as vectors.

In [19]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [20]:
y_train_E_I = y_train["E_I"].to_numpy()
y_train_S_N = y_train["S_N"].to_numpy()
y_train_T_F = y_train["T_F"].to_numpy()
y_train_J_P = y_train["J_P"].to_numpy()

In [22]:
len(vectorizer.vocabulary_)

278013

In [23]:
list(vectorizer.vocabulary_.keys())[:10]

['that',
 'okay',
 'pleading_fac',
 'good',
 'bub',
 'im',
 'glad',
 'follow',
 'hehe',
 'row']

In [27]:
clf_E_I = LogisticRegression(verbose=2, max_iter=10_000)
clf_S_N = LogisticRegression(verbose=2, max_iter=10_000)
clf_T_F = LogisticRegression(verbose=2, max_iter=10_000)
clf_J_P = LogisticRegression(verbose=2, max_iter=10_000)

In [28]:
clf_E_I.fit(X_train_bow, y_train_E_I)
clf_S_N.fit(X_train_bow, y_train_S_N)
clf_T_F.fit(X_train_bow, y_train_T_F)
clf_J_P.fit(X_train_bow, y_train_J_P)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   50.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   50.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   56.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   56.9s finished


### Class-wise accuracy

Each MBTI type is a class. I will calculate the accuracy for each class.

In [29]:
y_pred_E_I = clf_E_I.predict(X_test_bow)
y_pred_S_N = clf_S_N.predict(X_test_bow)
y_pred_T_F = clf_T_F.predict(X_test_bow)
y_pred_J_P = clf_J_P.predict(X_test_bow)

In [31]:
print("E/I Accuracy:", accuracy_score(y_test["E_I"], y_pred_E_I))
print("S/N Accuracy:", accuracy_score(y_test["S_N"], y_pred_S_N))
print("T/F Accuracy:", accuracy_score(y_test["T_F"], y_pred_T_F))
print("J/P Accuracy:", accuracy_score(y_test["J_P"], y_pred_J_P))

E/I Accuracy: 0.7046606190324103
S/N Accuracy: 0.8106054466843678
T/F Accuracy: 0.6114005691704126
J/P Accuracy: 0.5987014906163153


In [37]:
y_pred_E_I.sum() / len(y_pred_E_I)

0.9420914390858952

In [38]:
y_pred_S_N.sum() / len(y_pred_S_N)

0.9807845399205831

In [39]:
y_pred_T_F.sum() / len(y_pred_T_F)

0.7328617259115545

In [40]:
y_pred_J_P.sum() / len(y_pred_J_P)

0.7441926980298313

As seen, the model is biased towards the majority class. This is expected, since the dataset is imbalanced.

### MBTI type accuracy

In [41]:
y_pred = np.stack([y_pred_E_I, y_pred_S_N, y_pred_T_F, y_pred_J_P], axis=1)
y_pred.shape

(209779, 4)

In [42]:
accuracy_score(y_test.iloc[:, -4:], y_pred)

0.22177625024430472

In [45]:
dataset["type"].value_counts(normalize=True)

INFP    0.175856
INFJ    0.150005
INTP    0.118088
INTJ    0.112447
ENFP    0.090590
ENTP    0.076559
ENFJ    0.051587
ISFP    0.038936
ISTP    0.038055
ISFJ    0.036900
ENTJ    0.033617
ISTJ    0.029867
ESFP    0.016211
ESTP    0.011612
ESFJ    0.010443
ESTJ    0.009227
Name: type, dtype: float64

The model only got 22% accuracy. This is not good, but it is better than random guessing (6.25%).