In [None]:
import pandas as pd
import numpy as np
import re
import os
import nltk
import kagglehub
from datasets import load_dataset

In [None]:
# Download the MBTI dataset through kagglehub and work out where its CSV should
# live. Every failure is reported with a message instead of being raised, so
# `mbti_csv_path` stays undefined when the download itself fails.
try:
    kaggle_path = kagglehub.dataset_download("datasnaek/mbti-type")
    print(f"Kaggle dataset downloaded to: {kaggle_path}")
    mbti_csv_path = os.path.join(kaggle_path, "mbti_1.csv")
    # Only reports whether the file is there; the path is kept either way.
    if not os.path.exists(mbti_csv_path):
        print("Dataset not found in the expected location.")
    else:
        print(f"Dataset successfully found at: {mbti_csv_path}")

except Exception as err:
    print(f"An error occured while downloadin the dataset: {err}")

In [None]:
# Load the Big Five dataset with `datasets.load_dataset`. Any failure is
# printed rather than raised, in which case `hf_dataset` is never bound.
try:
    hf_dataset = load_dataset("jingjietan/pandora-big5")
    print("Hugging Face dataset loaded successfully.")
    print(f"Dataset structure: {hf_dataset}")

except Exception as err:
    print(f"An error occured while loading the dataset: {err}")

In [None]:
# Read the Kaggle CSV into a DataFrame. A NameError here means the download
# cell never bound `mbti_csv_path`; that case is reported instead of raised.
try:
    df_mbti = pd.read_csv(mbti_csv_path)
except NameError:
    print("Variable 'mbti_csv_path' not found.")
else:
    print("Kaggle CSV loaded into DataFrame:")
    df_mbti.info()

In [None]:
# Convert the 'train' split of the Hugging Face dataset to a DataFrame. A
# NameError here means the load cell never bound `hf_dataset`; that case is
# reported instead of raised.
try:
    df_big5 = hf_dataset["train"].to_pandas()
except NameError:
    print("Variable 'hf_dataset' not found.")
else:
    print("Hugging Face dataset converted to DataFrame:")
    df_big5.info()

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used by the text-cleaning cells below.
# `quiet = True` is passed to every call (presumably to silence the
# downloader's progress output -- confirm against the NLTK docs).
nltk.download('stopwords', quiet = True)  # backs stopwords.words('english')
nltk.download('wordnet', quiet = True)  # presumably the data WordNetLemmatizer looks words up in
nltk.download('omw-1.4', quiet = True)  # NOTE(review): not referenced directly in this notebook -- confirm it is still required

In [None]:
# English stop-word vocabulary, held as a set so the per-token membership
# test in clean_text is a hash lookup.
stop_words = {*stopwords.words('english')}
# One lemmatizer instance, shared by every clean_text call.
lemmatier = WordNetLemmatizer()

def clean_text(text):
    """Reduce a raw post to lower-case, lemmatized, stop-word-free tokens.

    Steps, in order: remove URLs (anything from ``http`` or ``www`` up to the
    next whitespace), remove ``@mentions``, turn every character that is not
    an ASCII letter or whitespace into a space, lower-case, split on
    whitespace, drop tokens found in ``stop_words`` and lemmatize the rest
    with ``lemmatier``.

    Args:
        text: The raw text. Anything that is not a ``str`` (``None``, or the
            float NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when
        nothing survives.
    """
    # A missing value would otherwise make re.sub raise TypeError and abort
    # the whole DataFrame.apply that is calling us.
    if not isinstance(text, str):
        return ''

    # `http\S+` already covers https links, and no pattern here is anchored,
    # so neither a separate https alternative nor re.MULTILINE is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    # Substitute a space instead of deleting: words separated only by
    # punctuation ("wait...what") must remain two tokens rather than fuse
    # into "waitwhat". Contractions split at the apostrophe as well; the
    # fragments ("don", "t") are expected to be in NLTK's English stop-word
    # list -- confirm if the list is ever swapped out.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    tokens = text.lower().split()
    return ' '.join(
        lemmatier.lemmatize(token) for token in tokens if token not in stop_words
    )

In [None]:
# Split the four-character type code into one single-letter column per
# dimension (position 0 -> 'I-E', 1 -> 'N-S', 2 -> 'T-F', 3 -> 'J-P').
for letter_pos, dimension in enumerate(('I-E', 'N-S', 'T-F', 'J-P')):
    df_mbti[dimension] = df_mbti['type'].apply(lambda code, i=letter_pos: code[i])

# Each 'posts' cell packs several posts separated by '|||'. Emit one output
# row per post, carrying the author's labels along, and skip any post that
# is empty once cleaned.
expanded_rows = []
for row_id, author in df_mbti.iterrows():
    labels = {key: author[key] for key in ('type', 'I-E', 'N-S', 'T-F', 'J-P')}
    for raw_post in author['posts'].split('|||'):
        cleaned = clean_text(raw_post)
        if not cleaned.strip():
            continue
        expanded_rows.append({'text': cleaned, **labels})

df_mbti_processed = pd.DataFrame(expanded_rows)
print(f"Original Kaggle dataset: {len(df_mbti)} rows")
print(f"Processed Kaggle dataset: {len(df_mbti_processed)} rows\n")
print(df_mbti_processed.head())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

output_dir = "EDA_outputs"
# savefig does not create missing parent directories, so make sure the
# target folder exists before anything is written into it.
os.makedirs(output_dir, exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Distribution of MBTI Dimensions')

# One entry per panel: (column to count, bar order, panel title). Entries are
# matched to the 2x2 grid in row-major order via axes.flat.
mbti_panels = [
    ('I-E', ['I', 'E'], 'Introversion vs. Extroversion'),
    ('N-S', ['N', 'S'], 'Intuition vs. Sensing'),
    ('T-F', ['T', 'F'], 'Thinking vs. Feeling'),
    ('J-P', ['J', 'P'], 'Judging vs. Perceiving'),
]
for ax, (column, order, title) in zip(axes.flat, mbti_panels):
    sns.countplot(ax=ax, x=column, data=df_mbti_processed, order=order)
    ax.set_title(title)

# Leave head-room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
distribution_mbti_plot_path = os.path.join(output_dir, 'distribution_mbti.png')
plt.savefig(distribution_mbti_plot_path)
print(f"Saved diagnostic plot to '{distribution_mbti_plot_path}'")
plt.show()

In [None]:
def map_big5_to_mbti(row, threshold=50):
    """Approximate the four MBTI letters from a row of Big Five scores.

    Args:
        row: Anything indexable by 'E', 'O', 'A' and 'C' (a dict, a
            DataFrame row, ...) whose values can be compared with
            ``threshold``.
        threshold: Cut-off between the two poles of every dimension.
            The default of 50 presumes scores on a 0-100 scale --
            TODO confirm against the dataset card.

    Returns:
        tuple[str, str, str, str]: ``(i_e, n_s, t_f, j_p)``.

    Note:
        A score exactly equal to ``threshold`` maps to 'I', 'S', 'F' and 'P'
        respectively. A score for which both ``>`` and ``<`` are false (such
        as NaN) falls into those same branches, so missing scores are
        labelled silently rather than rejected.
    """
    i_e = 'E' if row['E'] > threshold else 'I'
    n_s = 'N' if row['O'] > threshold else 'S'
    # The comparison is reversed for 'A': a score *below* the cut-off gives 'T'.
    t_f = 'T' if row['A'] < threshold else 'F'
    j_p = 'J' if row['C'] > threshold else 'P'
    return i_e, n_s, t_f, j_p

# Derive the four MBTI letters for every row. result_type='expand' spreads the
# 4-tuple returned by map_big5_to_mbti across four columns.
mbti_labels = df_big5.apply(map_big5_to_mbti, axis=1, result_type='expand')
df_big5[['I-E', 'N-S', 'T-F', 'J-P']] = mbti_labels
df_big5['type'] = df_big5['I-E'] + df_big5['N-S'] + df_big5['T-F'] + df_big5['J-P']

# NOTE: this overwrites the raw text in place, so re-running the cell feeds
# already-cleaned text through clean_text again.
df_big5['text'] = df_big5['text'].apply(clean_text)

# Keep only rows that still have text after cleaning, mirroring the filter the
# MBTI pipeline applies per post; otherwise blank texts reach the exported CSV
# and come back as NaN when it is read again. reset_index also makes the
# result a separate frame instead of another name for df_big5.
df_big5_processed = df_big5[df_big5['text'].str.strip() != ''].reset_index(drop=True)

print("Big Five dataset processed and mapped to MBTI labels.")
print(df_big5_processed.head())

In [None]:
# Columns present in both processed frames; anything else is left behind.
common_columns = ['text', 'type', 'I-E', 'N-S', 'T-F', 'J-P']

# Stack the MBTI rows first and the Big Five rows second, renumbering the
# index from zero across the combined frame.
df_final = pd.concat(
    [frame[common_columns] for frame in (df_mbti_processed, df_big5_processed)],
    ignore_index=True,
)

# Written relative to the current working directory, without the index column.
output_filename = 'preprocessed_personality_data.csv'
df_final.to_csv(output_filename, index=False)

print(f"Preprocessing complete! Combined dataset saved to '{output_filename}'")
print(f"Total rows in final dataset: {len(df_final)}")
