# Clean Original MBTI Data

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rebeccawright/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rebeccawright/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

<br><br>
### Importing the data

In [97]:
# read the original MBTI dataset (columns: type, posts) into a DataFrame
raw_csv_path = '../../data/original/mbti_1.csv'
data = pd.read_csv(raw_csv_path)

In [98]:
# summarise column dtypes and non-null counts to sanity-check the load
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


<br><br>
### Setting up the DataFrame

In [99]:
# append one empty (NaN-filled) placeholder column per MBTI trait pair
data = data.reindex(columns=[*data.columns, 'E_I', 'N_S', 'F_T', 'J_P'])

In [100]:
# split the four-letter MBTI type into one single-letter column per trait pair.
# Vectorised .str indexing replaces the per-row .loc loop: it is much faster,
# does not rely on the index being 0..n-1, and avoids writing strings cell by
# cell into NaN (float) placeholder columns, which newer pandas deprecates.
for position, trait_col in enumerate(['E_I', 'N_S', 'F_T', 'J_P']):
    data[trait_col] = data['type'].str[position]

In [101]:
# binary encoding for each trait letter: the first letter of every pair maps
# to 1 and the second letter maps to 0
mbti_binary_values = {
    'E': 1, 'I': 0,
    'N': 1, 'S': 0,
    'F': 1, 'T': 0,
    'J': 1, 'P': 0,
}

In [102]:
# create columns of trait pairs as binary encoded values.
# The trait columns are named explicitly instead of taken as "the last four
# columns", so re-running this cell cannot pick up the *_code columns it just
# created and produce all-NaN *_code_code columns.
for col in ['E_I', 'N_S', 'F_T', 'J_P']:
    data[f'{col}_code'] = data[col].map(mbti_binary_values)

In [103]:
# create column of type as a four-character binary code string (e.g. '1001').
# The source columns are named explicitly rather than sliced as "the last four
# columns", so re-running this cell (when type_code is itself the last column)
# still concatenates the right values.
# NOTE: once written to CSV, this column reads back as an integer unless
# read_csv is given dtype=str for it, which drops leading zeros ('0100' -> 100).
trait_code_columns = ['E_I_code', 'N_S_code', 'F_T_code', 'J_P_code']
data['type_code'] = data[trait_code_columns].astype(str).apply(''.join, axis=1)

<br><br>
<div class="alert alert-block alert-warning">
<b>Ordinal Encoding Deprecated</b> from mbti_v1</div>

### Ordinal Encoding of mbti_type column

###### **Ordinal Encoding Values**: INFP:9 ||| INFJ:8 ||| INTP:11 ||| INTJ:10 ||| ENTP:3 ||| ENFP:1 ||| ISTP:15 ||| ISFP:13 ||| ENTJ:2 ||| ISTJ:14 ||| ENFJ:0 ||| ISFJ:12 ||| ESTP:7 ||| ESFP:5 ||| ESFJ:4 ||| ESTJ:6

<br><br>
### Applying Preprocessor

In [104]:
def preprocessor(text):
    """Normalise one raw posts string into space-separated word tokens.

    Steps, in order: lowercase the text; turn the '|||' separator into a
    space; strip URLs; strip '1w2'-style tritype tokens; keep only runs of
    word characters (letters, digits, underscore) joined by single spaces.

    Parameters
    ----------
    text : str
        Raw posts text.

    Returns
    -------
    str
        Cleaned, lowercased text; an empty string if no word characters remain.
    """
    text = text.lower()     # cast to lowercase
    text = text.replace('|||', ' ')     # replace ||| with " "
    # remove urls and websites
    # NOTE(review): the pattern is unanchored, so it also cuts from a 'www' or
    # 'http' found inside a word (e.g. 'awww') up to the next whitespace.
    text = re.sub(r'(www|http)\S+', "", text)
    # remove 1w2 tritype - captures "1w2 ", "1w2=", "1w2-" and, via the `$`
    # alternative, a bare "1w2" at the very end of the text (a plain trailing
    # \W never matched there). Substituting a space rather than "" keeps the
    # words on either side from being fused ("type4w5 and" -> "type and").
    text = re.sub(r'\d+w+\d+(?:\W|$)', " ", text)
    # keep word-character runs only (drops special characters); this is the same
    # findall that RegexpTokenizer(r'\w+').tokenize performs, without rebuilding
    # a tokenizer object on every call
    tokens = re.findall(r'\w+', text)
    return " ".join(tokens)

In [105]:
# clean every posts value by passing it through preprocessor()
data['posts'] = data['posts'].apply(preprocessor)

<br><br>
### Confirm No Empty Posts Generated By Preprocessor

In [107]:
# confirm no posts value was made empty by preprocessor().
# Take the true minimum length instead of a running minimum seeded at 100, so
# the reported value is correct even when every post is longer than 100
# characters, and fail loudly if any post ended up empty.
shortest = int(data['posts'].str.len().min())
assert shortest > 0, "preprocessor produced an empty post"
shortest

9

<br><br>
### Write To CSV

In [108]:
# persist the cleaned frame; index=False leaves the row index out of the file
cleaned_csv_path = '../../data/cleaned/mbti_v5.csv'
data.to_csv(cleaned_csv_path, index=False)

<div class="alert alert-block alert-info">
<b>TO_CSV version tracking: mbti_v5</b><br>
* preprocessing steps:<br>
*** add re.sub(r'\d+w+\d+\W', "", text)<br>
[timestamp: 2/17-10:30p]
</div>

<div class="alert alert-block alert-info">
<b>TO_CSV version tracking: mbti_v4</b><br>
* restructured Dataframe Columns: <br>
*** type col is string (4 chars) <br>
*** each trait pair has column with char value (E_I, N_S, etc) <br>
*** each trait pair has trait_code column with binary value with first letter in pair as 1 (E=1, I=0, etc) <br>
*** type_code column with four-digit binary encoded code representing MBTI type (ex: 1001, 1111, etc) <br>
IMPORTANT NOTE: as an integer, leading zeros are dropped, so 0100 is the same as 100 and 0001 is the same as 1<br>
* preprocessing steps: (identical to previous version)<br>
[timestamp: 2/17-10:00p]
</div>

<div class="alert alert-block alert-info">
<b>TO_CSV version tracking: mbti_v3</b><br>
* left 4 letter string mbti_type column <br>
* reassigned values from binary back to string for columns for each personality trait pair<br>
* preprocessing steps: (identical to previous version)<br>
[timestamp: 2/17-11:30a]
</div>

<div class="alert alert-block alert-info">
<b>TO_CSV version tracking: mbti_v2</b><br>
* left 4 letter string mbti_type column <br>
* created binary encoded columns for each personality trait pair with first letter in pair as 1<br>
*** E_I<br>
*** N_S<br>
*** F_T<br>
*** J_P<br>
* preprocessing steps: (identical to previous version)<br>
[timestamp: 2/17-9:30a]
</div>

<div class="alert alert-block alert-info">
<b>TO_CSV version tracking: mbti_v1</b><br>
* categorically encoded MBTI_type column to numeric value 1-16 <br>
* preprocessing steps:<br>
*** text.lower()<br>
*** text.replace('|||', ' ')<br>
*** re.sub(r'(www|http)\S+', "", text)<br>
*** RegexpTokenizer(r'\w+')<br>
*** regex_tokenizer.tokenize(text)<br>
[timestamp: 2/16-5:00p]<br><br>
<b>UPDATED:</b> Dislike the categorical encoding method. [timestamp: 2/17-9:37p]
</div>