In [6]:
import os

# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import wordcloud
from wordcloud import WordCloud

# Text Processing
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Model training and evaluation
from sklearn.model_selection import train_test_split

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Ignore noise warning
import warnings
warnings.filterwarnings("ignore")

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


In [4]:
df_mbti = pd.read_csv('/Users/syj/Documents/DeepLearning/mbti_1.csv')

# Clear data

In [3]:
def clear_text(df):
    data_length=[]
    lemmatizer=WordNetLemmatizer()
    cleaned_text=[]
    stop_words = set(stopwords.words('english')) # Load stop words
    pers_types = ['INFP' ,'INFJ', 'INTP', 'INTJ', 'ENTP', 'ENFP', 'ISTP' ,'ISFP' ,'ENTJ', 'ISTJ','ENFJ', 'ISFJ' ,'ESTP', 'ESFP' ,'ESFJ' ,'ESTJ']
    pers_types = [p.lower() for p in pers_types]
    
    print("Cleaning The Dataset")
    
    for sentence in tqdm(df.posts):
        
        sentence=sentence.lower()
        
        sentence=re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+',' ',sentence)
        
        sentence=re.sub('[^0-9a-z]',' ',sentence)
        
        sentence = " ".join([word for word in sentence.split() if word not in stop_words]) # Remove stop words
        #print(len(sentence))
        
        for p in pers_types:
            sentence = re.sub(p, '', sentence)
        #print(len(sentence))
        
        sentence = lemmatizer.lemmatize(sentence) # Lemmatize words
        
        data_length.append(len(sentence.split())) #Split data, measure length of new filtered data
        
        cleaned_text.append(sentence)
        
    return cleaned_text,data_length

In [5]:
df_clean = df_mbti
df_clean.posts,df_clean_length = clear_text(df_mbti)
df_clean

Cleaning The Dataset


100%|██████████| 8675/8675 [00:03<00:00, 2430.20it/s]


Unnamed: 0,type,posts
0,INFJ,moments sportscenter top ten plays pranks lif...
1,ENTP,finding lack posts alarming sex boring positio...
2,INTP,good one course say know blessing curse absolu...
3,INTJ,dear enjoyed conversation day esoteric gabbin...
4,ENTJ,fired another silly misconception approaching ...
...,...,...
8670,ISFP,always think cats fi doms reason websites beco...
8671,ENFP,thread already exists someplace else heck dele...
8672,INTP,many questions things would take purple pill p...
8673,INFP,conflicted right comes wanting children honest...


In [None]:
df_clean.to_csv('mbti_clean.csv', index=False)

In [8]:
target_encoder=LabelEncoder()
target_encoder.fit(df_clean.type)

In [9]:
le_name_mapping = dict(zip(target_encoder.classes_, target_encoder.fit_transform(target_encoder.classes_)))
new_dict = dict([(value, key) for key, value in le_name_mapping.items()])
new_dict

{0: 'ENFJ',
 1: 'ENFP',
 2: 'ENTJ',
 3: 'ENTP',
 4: 'ESFJ',
 5: 'ESFP',
 6: 'ESTJ',
 7: 'ESTP',
 8: 'INFJ',
 9: 'INFP',
 10: 'INTJ',
 11: 'INTP',
 12: 'ISFJ',
 13: 'ISFP',
 14: 'ISTJ',
 15: 'ISTP'}