# MBTI Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/studio-lab-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

### Preclean and Encode Class Data for Reddit Data

In [4]:
df_mbti = pd.read_csv("mbti_full_pull.csv", engine='python', on_bad_lines='skip')

In [5]:
df_mbti.head()

Unnamed: 0,author_flair_text,body,subreddit
0,INTJ,Knowing you're in INTJ is a tool for you to us...,intj
1,INTJ,You are truly an enlightened mastermind.,intj
2,"INFJ, 26F",You should :) it will help if you have a down ...,infj
3,INTP,I watch a bit of everything (including hentai)...,INTP
4,INTJ,I don't know if I would count this as a pet pe...,intj


In [6]:
df_mbti.count()

author_flair_text    1794016
body                 1793949
subreddit            1794016
dtype: int64

In [7]:
mbti_classes = ['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP', 'ENTJ', 'ENTP', 'ENFJ', 'ENFP', 'ESTJ', 'ESFJ', 'ESTP', 'ESFP']

In [8]:
pattern = '|'.join(mbti_classes)

In [9]:
# ensure author_flair_text contains mbti class label
df_mbti = df_mbti[df_mbti['author_flair_text'].str.contains(pattern, case=False, na=False)]

In [10]:
# extract mbti class labels from author_flair_text
df_mbti['ISTJ'] = df_mbti['author_flair_text'].str.extract(r'(ISTJ)')
df_mbti['ISFJ'] = df_mbti['author_flair_text'].str.extract(r'(ISFJ)')
df_mbti['INFJ'] = df_mbti['author_flair_text'].str.extract(r'(INFJ)')
df_mbti['INTJ'] = df_mbti['author_flair_text'].str.extract(r'(INTJ)')

df_mbti['ISTP'] = df_mbti['author_flair_text'].str.extract(r'(ISTP)')
df_mbti['ISFP'] = df_mbti['author_flair_text'].str.extract(r'(ISFP)')
df_mbti['INFP'] = df_mbti['author_flair_text'].str.extract(r'(INFP)')
df_mbti['INTP'] = df_mbti['author_flair_text'].str.extract(r'(INTP)')

df_mbti['ENTJ'] = df_mbti['author_flair_text'].str.extract(r'(ENTJ)')
df_mbti['ENTP'] = df_mbti['author_flair_text'].str.extract(r'(ENTP)')
df_mbti['ENFJ'] = df_mbti['author_flair_text'].str.extract(r'(ENFJ)')
df_mbti['ENFP'] = df_mbti['author_flair_text'].str.extract(r'(ENFP)')

df_mbti['ESTJ'] = df_mbti['author_flair_text'].str.extract(r'(ESTJ)')
df_mbti['ESFJ'] = df_mbti['author_flair_text'].str.extract(r'(ESFJ)')
df_mbti['ESTP'] = df_mbti['author_flair_text'].str.extract(r'(ESTP)')
df_mbti['ESFP'] = df_mbti['author_flair_text'].str.extract(r'(ESFP)')


In [11]:
df_mbti['class'] = df_mbti['ISTJ'].fillna('') + '' + df_mbti['ISFJ'].fillna('') + df_mbti['INFJ'].fillna('') + '' + df_mbti['INTJ'].fillna('') + df_mbti['ISTP'].fillna('') + '' + df_mbti['ISFP'].fillna('') + df_mbti['INFP'].fillna('') + '' + df_mbti['INTP'].fillna('') + df_mbti['ENTJ'].fillna('') + '' + df_mbti['ENTP'].fillna('') + df_mbti['ENFJ'].fillna('') + '' + df_mbti['ENFP'].fillna('') + df_mbti['ESTJ'].fillna('') + '' + df_mbti['ESFJ'].fillna('') + df_mbti['ESTP'].fillna('') + '' + df_mbti['ESFP'].fillna('')



In [12]:
# drop unneeded columns
df_mbti = df_mbti.drop(columns=['author_flair_text', 'subreddit'])

In [13]:
# ensure body text columns are not blank
df_mbti = df_mbti[df_mbti['body'].notnull()]

In [14]:
# ensure class is one of 16 defined mbti classes
df_mbti = df_mbti[df_mbti['class'].isin(mbti_classes)]

In [15]:
# one-hot encode each of the 16 mbti classes
df_mbti['ISTJ'] = np.where(df_mbti['ISTJ'].notnull(), 1, 0)
df_mbti['ISFJ'] = np.where(df_mbti['ISFJ'].notnull(), 1, 0)
df_mbti['INFJ'] = np.where(df_mbti['INFJ'].notnull(), 1, 0)
df_mbti['INTJ'] = np.where(df_mbti['INTJ'].notnull(), 1, 0)

df_mbti['ISTP'] = np.where(df_mbti['ISTP'].notnull(), 1, 0)
df_mbti['ISFP'] = np.where(df_mbti['ISFP'].notnull(), 1, 0)
df_mbti['INFP'] = np.where(df_mbti['INFP'].notnull(), 1, 0)
df_mbti['INTP'] = np.where(df_mbti['INTP'].notnull(), 1, 0)

df_mbti['ENTJ'] = np.where(df_mbti['ENTJ'].notnull(), 1, 0)
df_mbti['ENTP'] = np.where(df_mbti['ENTP'].notnull(), 1, 0)
df_mbti['ENFJ'] = np.where(df_mbti['ENFJ'].notnull(), 1, 0)
df_mbti['ENFP'] = np.where(df_mbti['ENFP'].notnull(), 1, 0)

df_mbti['ESTJ'] = np.where(df_mbti['ESTJ'].notnull(), 1, 0)
df_mbti['ESFJ'] = np.where(df_mbti['ESFJ'].notnull(), 1, 0)
df_mbti['ESTP'] = np.where(df_mbti['ESTP'].notnull(), 1, 0)
df_mbti['ESFP'] = np.where(df_mbti['ESFP'].notnull(), 1, 0)

df_mbti.head()

Unnamed: 0,body,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP,class
0,Knowing you're in INTJ is a tool for you to us...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ
1,You are truly an enlightened mastermind.,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ
2,You should :) it will help if you have a down ...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,INFJ
3,I watch a bit of everything (including hentai)...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,INTP
4,I don't know if I would count this as a pet pe...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ


In [16]:
introverted = ['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'INFP', 'ISFP', 'INTP']
extroverted = ['ENTJ', 'ENTP', 'ENFJ', 'ENFP', 'ESTJ', 'ESFJ', 'ESTP', 'ESFP']

sensing = ['ISTJ', 'ISFJ','ISTP', 'ISFP', 'ESTJ', 'ESFJ', 'ESTP', 'ESFP']
intuition = ['ENTJ', 'ENTP', 'ENFJ', 'ENFP', 'INFJ', 'INTJ', 'INFP', 'INTP']

thinking = ['ISTJ', 'INTJ', 'ISTP', 'INTP', 'ENTJ', 'ENTP', 'ESTJ', 'ESTP']
feeling = ['ENFJ', 'ENFP', 'ESFJ', 'ESFP', 'ISFJ', 'INFJ', 'INFP', 'ISFP'] 

judging = ['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ENTJ', 'ENFJ', 'ESTJ', 'ESFJ']
perceiving = ['ENTP', 'ENFP', 'ESTP', 'ESFP', 'ISTP', 'INFP', 'ISFP', 'INTP']

In [17]:
# encode individual mbti classes
df_mbti['Introvert'] = np.where(df_mbti['class'].isin(introverted), 1, 0)
df_mbti['Extrovert'] = np.where(df_mbti['class'].isin(extroverted), 1, 0)

df_mbti['Sensing'] = np.where(df_mbti['class'].isin(sensing), 1, 0)
df_mbti['Intuition'] = np.where(df_mbti['class'].isin(intuition), 1, 0)

df_mbti['Thinking'] = np.where(df_mbti['class'].isin(thinking), 1, 0)
df_mbti['Feeling'] = np.where(df_mbti['class'].isin(feeling), 1, 0)

df_mbti['Judging'] = np.where(df_mbti['class'].isin(judging), 1, 0)
df_mbti['Perceiving'] = np.where(df_mbti['class'].isin(perceiving), 1, 0)

In [18]:
df_mbti.head()

Unnamed: 0,body,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP,class,Introvert,Extrovert,Sensing,Intuition,Thinking,Feeling,Judging,Perceiving
0,Knowing you're in INTJ is a tool for you to us...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0
1,You are truly an enlightened mastermind.,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0
2,You should :) it will help if you have a down ...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,INFJ,1,0,0,1,0,1,1,0
3,I watch a bit of everything (including hentai)...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,INTP,1,0,0,1,1,0,0,1
4,I don't know if I would count this as a pet pe...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0


### Set up NLP on Reddit Dataset

In [19]:
df_reddit = df_mbti

In [20]:
# lowercase all text in body
df_reddit['body'] = df_reddit['body'].str.lower()

In [21]:
df_reddit['body'].head(10)

0    knowing you're in intj is a tool for you to us...
1             you are truly an enlightened mastermind.
2    you should :) it will help if you have a down ...
3    i watch a bit of everything (including hentai)...
4    i don't know if i would count this as a pet pe...
5    ah. \nnot sure how se works for other types, b...
6    i think that the military is a job people volu...
7    mostly i try not to get too caught up in anyth...
8    based on two politicians who in my eyes, some ...
9    the scores on an individual level mean very li...
Name: body, dtype: object

In [22]:
http_pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

In [23]:
# remove links from body using regex pattern
df_reddit['body'] = df_reddit['body'].replace(http_pattern, '', regex=True)

In [24]:
spec_pattern = r'[^\w\s]'

In [25]:
# remove special characters (including emojis) from body using regex pattern
df_reddit['body'] = df_reddit['body'].replace(spec_pattern, '', regex=True)

In [26]:
mbti_lower = ['istj', 'isfj', 'infj', 'intj', 'istp', 'isfp', 'infp', 'intp', 'entj', 'entp', 'enfj', 'enfp', 'estj', 'esfj', 'estp', 'esfp']
mbti_pattern = '|'.join(mbti_lower)

In [27]:
# remove class labels from body to prevent label leakage
df_reddit['body'] = df_reddit['body'].replace(mbti_pattern, '', regex=True)

In [28]:
df_reddit['body'].head()

0    knowing youre in  is a tool for you to use in ...
1              you are truly an enlightened mastermind
2    you should  it will help if you have a down mo...
3    i watch a bit of everything including hentai i...
4    i dont know if i would count this as a pet pee...
Name: body, dtype: object

In [29]:
# tokenize body text
df_reddit['body'] = df_reddit['body'].apply(word_tokenize)

In [30]:
df_reddit.head()

Unnamed: 0,body,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP,class,Introvert,Extrovert,Sensing,Intuition,Thinking,Feeling,Judging,Perceiving
0,"[knowing, youre, in, is, a, tool, for, you, to...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0
1,"[you, are, truly, an, enlightened, mastermind]",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0
2,"[you, should, it, will, help, if, you, have, a...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,INFJ,1,0,0,1,0,1,1,0
3,"[i, watch, a, bit, of, everything, including, ...",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,INTP,1,0,0,1,1,0,0,1
4,"[i, dont, know, if, i, would, count, this, as,...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0


In [31]:
stop_words = set(stopwords.words('english'))

In [32]:
def remove_stopwords_from_tokens(tokens):
        return [word for word in tokens if word.lower() not in stop_words]

In [33]:
# remove stop words
df_reddit['body'] = df_reddit['body'].apply(remove_stopwords_from_tokens)

In [34]:
df_reddit.head()

Unnamed: 0,body,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP,class,Introvert,Extrovert,Sensing,Intuition,Thinking,Feeling,Judging,Perceiving
0,"[knowing, youre, tool, use, interactions, peop...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0
1,"[truly, enlightened, mastermind]",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0
2,"[help, moment, hobby, keep, mind, busy, dont, ...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,INFJ,1,0,0,1,0,1,1,0
3,"[watch, bit, everything, including, hentai, te...",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,INTP,1,0,0,1,1,0,0,1
4,"[dont, know, would, count, pet, peeze, somethi...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0


In [35]:
lemmatizer = WordNetLemmatizer()

In [36]:
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

In [37]:
# lemmatize text
df_reddit['body'] = df_reddit['body'].apply(lemmatize_tokens)

In [38]:
df_reddit.head()

Unnamed: 0,body,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP,class,Introvert,Extrovert,Sensing,Intuition,Thinking,Feeling,Judging,Perceiving
0,"[knowing, youre, tool, use, interaction, peopl...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0
1,"[truly, enlightened, mastermind]",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0
2,"[help, moment, hobby, keep, mind, busy, dont, ...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,INFJ,1,0,0,1,0,1,1,0
3,"[watch, bit, everything, including, hentai, te...",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,INTP,1,0,0,1,1,0,0,1
4,"[dont, know, would, count, pet, peeze, somethi...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0


### Set Up NLP on Kaggle Dataset

In [39]:
df_kaggle = pd.read_csv('mbti_1 2.csv')

In [40]:
df_kaggle.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [41]:
df_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [42]:
df_kaggle['posts'] = df_kaggle['posts'].astype('string')

In [43]:
df_kaggle['posts'] = df_kaggle['posts'].str.split(r'\|\|\|')

In [44]:
df_kaggle = df_kaggle.explode('posts')

In [45]:
df_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
Index: 422845 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   type    422845 non-null  object
 1   posts   422845 non-null  object
dtypes: object(2)
memory usage: 9.7+ MB


In [46]:
df_kaggle.tail()

Unnamed: 0,type,posts
8674,INFP,I was going to close my facebook a few months ...
8674,INFP,30 Seconds to Mars - All of my collections. It...
8674,INFP,"I have seen it, and i agree. I did actually th..."
8674,INFP,Ok so i have just watched Underworld 4 (Awaken...
8674,INFP,I would never want to turn off my emotions. so...


In [47]:
# ensure body text columns are not blank
df_kaggle = df_kaggle[df_kaggle['posts'].notnull()]

In [48]:
# extract mbti class labels for one-hot encoding
df_kaggle['ISTJ'] = df_kaggle['type'].str.extract(r'(ISTJ)')
df_kaggle['ISFJ'] = df_kaggle['type'].str.extract(r'(ISFJ)')
df_kaggle['INFJ'] = df_kaggle['type'].str.extract(r'(INFJ)')
df_kaggle['INTJ'] = df_kaggle['type'].str.extract(r'(INTJ)')

df_kaggle['ISTP'] = df_kaggle['type'].str.extract(r'(ISTP)')
df_kaggle['ISFP'] = df_kaggle['type'].str.extract(r'(ISFP)')
df_kaggle['INFP'] = df_kaggle['type'].str.extract(r'(INFP)')
df_kaggle['INTP'] = df_kaggle['type'].str.extract(r'(INTP)')

df_kaggle['ENTJ'] = df_kaggle['type'].str.extract(r'(ENTJ)')
df_kaggle['ENTP'] = df_kaggle['type'].str.extract(r'(ENTP)')
df_kaggle['ENFJ'] = df_kaggle['type'].str.extract(r'(ENFJ)')
df_kaggle['ENFP'] = df_kaggle['type'].str.extract(r'(ENFP)')

df_kaggle['ESTJ'] = df_kaggle['type'].str.extract(r'(ESTJ)')
df_kaggle['ESFJ'] = df_kaggle['type'].str.extract(r'(ESFJ)')
df_kaggle['ESTP'] = df_kaggle['type'].str.extract(r'(ESTP)')
df_kaggle['ESFP'] = df_kaggle['type'].str.extract(r'(ESFP)')

df_kaggle.head()

Unnamed: 0,type,posts,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,,,INFJ,,,,,,,,,,,,,
0,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,,,INFJ,,,,,,,,,,,,,
0,INFJ,enfp and intj moments https://www.youtube.com...,,,INFJ,,,,,,,,,,,,,
0,INFJ,What has been the most life-changing experienc...,,,INFJ,,,,,,,,,,,,,
0,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,,,INFJ,,,,,,,,,,,,,


In [49]:
# one-hot encode each of the 16 mbti classes
df_kaggle['ISTJ'] = np.where(df_kaggle['ISTJ'].notnull(), 1, 0)
df_kaggle['ISFJ'] = np.where(df_kaggle['ISFJ'].notnull(), 1, 0)
df_kaggle['INFJ'] = np.where(df_kaggle['INFJ'].notnull(), 1, 0)
df_kaggle['INTJ'] = np.where(df_kaggle['INTJ'].notnull(), 1, 0)

df_kaggle['ISTP'] = np.where(df_kaggle['ISTP'].notnull(), 1, 0)
df_kaggle['ISFP'] = np.where(df_kaggle['ISFP'].notnull(), 1, 0)
df_kaggle['INFP'] = np.where(df_kaggle['INFP'].notnull(), 1, 0)
df_kaggle['INTP'] = np.where(df_kaggle['INTP'].notnull(), 1, 0)

df_kaggle['ENTJ'] = np.where(df_kaggle['ENTJ'].notnull(), 1, 0)
df_kaggle['ENTP'] = np.where(df_kaggle['ENTP'].notnull(), 1, 0)
df_kaggle['ENFJ'] = np.where(df_kaggle['ENFJ'].notnull(), 1, 0)
df_kaggle['ENFP'] = np.where(df_kaggle['ENFP'].notnull(), 1, 0)

df_kaggle['ESTJ'] = np.where(df_kaggle['ESTJ'].notnull(), 1, 0)
df_kaggle['ESFJ'] = np.where(df_kaggle['ESFJ'].notnull(), 1, 0)
df_kaggle['ESTP'] = np.where(df_kaggle['ESTP'].notnull(), 1, 0)
df_kaggle['ESFP'] = np.where(df_kaggle['ESFP'].notnull(), 1, 0)

df_kaggle.head()

Unnamed: 0,type,posts,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0,INFJ,enfp and intj moments https://www.youtube.com...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0,INFJ,What has been the most life-changing experienc...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [50]:
# lowercase all text in body
df_kaggle['posts'] = df_kaggle['posts'].str.lower()

In [51]:
http_pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

In [52]:
# remove links from body using regex pattern
df_kaggle['posts'] = df_kaggle['posts'].replace(http_pattern, '', regex=True)

In [53]:
spec_pattern = r'[^\w\s]'

In [54]:
# remove special characters (including emojis) from body using regex pattern
df_kaggle['posts'] = df_kaggle['posts'].replace(spec_pattern, '', regex=True)

In [55]:
mbti_lower = ['istj', 'isfj', 'infj', 'intj', 'istp', 'isfp', 'infp', 'intp', 'entj', 'entp', 'enfj', 'enfp', 'estj', 'esfj', 'estp', 'esfp']
mbti_pattern = '|'.join(mbti_lower)

In [56]:
# remove class labels from body to prevent label leakage
df_kaggle['posts'] = df_kaggle['posts'].replace(mbti_pattern, '', regex=True)

In [57]:
# drop blanks
df_kaggle = df_kaggle[df_kaggle['posts'] != '']

In [58]:
# tokenize body text
df_kaggle['posts'] = df_kaggle['posts'].apply(word_tokenize)

In [59]:
# remove stop words
df_kaggle['posts'] = df_kaggle['posts'].apply(remove_stopwords_from_tokens)

In [60]:
df_kaggle.head()

Unnamed: 0,type,posts,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP
0,INFJ,"[moments, sportscenter, top, ten, plays, pranks]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0,INFJ,"[lifechanging, experience, life]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0,INFJ,"[repeat, today]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0,INFJ,"[may, perc, experience, immerse]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0,INFJ,"[last, thing, friend, posted, facebook, commit...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [61]:
# lemmatize text
df_kaggle['posts'] = df_kaggle['posts'].apply(lemmatize_tokens)

In [62]:
# encode individual mbti classes
df_kaggle['Introvert'] = np.where(df_kaggle['type'].isin(introverted), 1, 0)
df_kaggle['Extrovert'] = np.where(df_kaggle['type'].isin(extroverted), 1, 0)

df_kaggle['Sensing'] = np.where(df_kaggle['type'].isin(sensing), 1, 0)
df_kaggle['Intuition'] = np.where(df_kaggle['type'].isin(intuition), 1, 0)

df_kaggle['Thinking'] = np.where(df_kaggle['type'].isin(thinking), 1, 0)
df_kaggle['Feeling'] = np.where(df_kaggle['type'].isin(feeling), 1, 0)

df_kaggle['Judging'] = np.where(df_kaggle['type'].isin(judging), 1, 0)
df_kaggle['Perceiving'] = np.where(df_kaggle['type'].isin(perceiving), 1, 0)

df_kaggle.head()

Unnamed: 0,type,posts,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP,Introvert,Extrovert,Sensing,Intuition,Thinking,Feeling,Judging,Perceiving
0,INFJ,"[moment, sportscenter, top, ten, play, prank]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0
0,INFJ,"[lifechanging, experience, life]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0
0,INFJ,"[repeat, today]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0
0,INFJ,"[may, perc, experience, immerse]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0
0,INFJ,"[last, thing, friend, posted, facebook, commit...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0


## Final Counts and Info

In [63]:
df_reddit.head()

Unnamed: 0,body,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP,class,Introvert,Extrovert,Sensing,Intuition,Thinking,Feeling,Judging,Perceiving
0,"[knowing, youre, tool, use, interaction, peopl...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0
1,"[truly, enlightened, mastermind]",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0
2,"[help, moment, hobby, keep, mind, busy, dont, ...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,INFJ,1,0,0,1,0,1,1,0
3,"[watch, bit, everything, including, hentai, te...",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,INTP,1,0,0,1,1,0,0,1
4,"[dont, know, would, count, pet, peeze, somethi...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,INTJ,1,0,0,1,1,0,1,0


In [64]:
df_reddit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1651100 entries, 0 to 1794015
Data columns (total 26 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   body        1651100 non-null  object
 1   ISTJ        1651100 non-null  int64 
 2   ISFJ        1651100 non-null  int64 
 3   INFJ        1651100 non-null  int64 
 4   INTJ        1651100 non-null  int64 
 5   ISTP        1651100 non-null  int64 
 6   ISFP        1651100 non-null  int64 
 7   INFP        1651100 non-null  int64 
 8   INTP        1651100 non-null  int64 
 9   ENTJ        1651100 non-null  int64 
 10  ENTP        1651100 non-null  int64 
 11  ENFJ        1651100 non-null  int64 
 12  ENFP        1651100 non-null  int64 
 13  ESTJ        1651100 non-null  int64 
 14  ESFJ        1651100 non-null  int64 
 15  ESTP        1651100 non-null  int64 
 16  ESFP        1651100 non-null  int64 
 17  class       1651100 non-null  object
 18  Introvert   1651100 non-null  int64 
 19  Extro

In [65]:
category_counts = df_reddit['class'].value_counts()
category_percentages = df_reddit['class'].value_counts(normalize=True) * 100

unique_values_info = pd.DataFrame({
    'Count': category_counts,
    'Percentage': category_percentages
})

print(unique_values_info)

        Count  Percentage
class                    
INTP   452235   27.389922
INTJ   358042   21.685058
INFJ   194680   11.790927
ENTP   194338   11.770214
INFP   176991   10.719581
ENFP    97835    5.925444
ISTP    50060    3.031918
ENTJ    43642    2.643208
ENFJ    20936    1.268003
ISTJ    16590    1.004785
ESTP    12793    0.774817
ISFP    11345    0.687118
ESFP     7483    0.453213
ISFJ     7002    0.424081
ESTJ     4477    0.271153
ESFJ     2651    0.160560


In [66]:
df_kaggle.head()

Unnamed: 0,type,posts,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ENTJ,ENTP,ENFJ,ENFP,ESTJ,ESFJ,ESTP,ESFP,Introvert,Extrovert,Sensing,Intuition,Thinking,Feeling,Judging,Perceiving
0,INFJ,"[moment, sportscenter, top, ten, play, prank]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0
0,INFJ,"[lifechanging, experience, life]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0
0,INFJ,"[repeat, today]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0
0,INFJ,"[may, perc, experience, immerse]",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0
0,INFJ,"[last, thing, friend, posted, facebook, commit...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0


In [67]:
df_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
Index: 410915 entries, 0 to 8674
Data columns (total 26 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   type        410915 non-null  object
 1   posts       410915 non-null  object
 2   ISTJ        410915 non-null  int64 
 3   ISFJ        410915 non-null  int64 
 4   INFJ        410915 non-null  int64 
 5   INTJ        410915 non-null  int64 
 6   ISTP        410915 non-null  int64 
 7   ISFP        410915 non-null  int64 
 8   INFP        410915 non-null  int64 
 9   INTP        410915 non-null  int64 
 10  ENTJ        410915 non-null  int64 
 11  ENTP        410915 non-null  int64 
 12  ENFJ        410915 non-null  int64 
 13  ENFP        410915 non-null  int64 
 14  ESTJ        410915 non-null  int64 
 15  ESFJ        410915 non-null  int64 
 16  ESTP        410915 non-null  int64 
 17  ESFP        410915 non-null  int64 
 18  Introvert   410915 non-null  int64 
 19  Extrovert   410915 non-null  i

In [68]:
category_counts2 = df_kaggle['type'].value_counts()
category_percentages2 = df_kaggle['type'].value_counts(normalize=True) * 100

unique_values_info2 = pd.DataFrame({
    'Count': category_counts2,
    'Percentage': category_percentages2
})

print(unique_values_info2)

      Count  Percentage
type                   
INFP  86959   21.162284
INFJ  69990   17.032720
INTP  61438   14.951511
INTJ  51129   12.442719
ENTP  33024    8.036699
ENFP  32083    7.807697
ISTP  15981    3.889125
ISFP  12460    3.032257
ENTJ  10988    2.674032
ISTJ   9628    2.343064
ENFJ   9104    2.215543
ISFJ   7886    1.919132
ESTP   4238    1.031357
ESFP   2141    0.521032
ESFJ   1986    0.483312
ESTJ   1880    0.457516


## Outputs as CSVs

In [69]:
df_reddit.to_csv('reddit_data.csv', index=False)

In [70]:
df_kaggle.to_csv('kaggle_data.csv', index=False)