In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os

In [None]:
# Input CSV that the later plotting cell reads through `df`, and the folder
# that every figure in this notebook is saved into.
data_file = 'preprocessed_personality_data.csv'
output_dir = "EDA_outputs"

# exist_ok=True already makes this a no-op when the folder is present, so a
# separate os.path.exists() check adds nothing.
os.makedirs(output_dir, exist_ok=True)

try:
    df = pd.read_csv(data_file)
    print(f'Successfully loaded data from {data_file}, shape: {df.shape}')
except FileNotFoundError:
    print(f"{data_file} not found")
    # Re-raise instead of continuing: without the file `df` is either undefined
    # or left over from an earlier kernel run, so later cells would fail with a
    # confusing NameError or silently plot stale data.
    raise

In [None]:
import kagglehub
# Diagnostic cell: download the Kaggle MBTI CSV, count rows per `type`, and
# save the resulting bar chart into `output_dir`.
print("\n--- Investigating MBTI Type Distribution from Kaggle ---")
try:
    print("Downloading Kaggle dataset info...")
    kaggle_dir_path = kagglehub.dataset_download("datasnaek/mbti-type")
    kaggle_file_path = os.path.join(kaggle_dir_path, 'mbti_1.csv')
    print(f"Reading Kaggle data from: {kaggle_file_path}")

    df_kaggle_raw = pd.read_csv(kaggle_file_path)

    print("Kaggle dataset loaded successfully.")

    kaggle_fig, kaggle_ax = plt.subplots(figsize=(18, 8))
    # Bars are ordered from the most to the least frequent type.
    kaggle_type_order = df_kaggle_raw['type'].value_counts().index
    # NOTE(review): seaborn >= 0.13 reportedly warns when `palette` is passed
    # without `hue` - confirm against the installed version.
    sns.countplot(
        data=df_kaggle_raw,
        x='type',
        order=kaggle_type_order,
        palette='mako',
        ax=kaggle_ax,
    )
    kaggle_ax.set_title('Distribution of MBTI Types in the Original Kaggle Dataset', fontsize=16)
    kaggle_ax.set_xlabel('MBTI Type', fontsize=12)
    kaggle_ax.set_ylabel('Number of Users', fontsize=12)
    kaggle_ax.tick_params(axis='x', labelrotation=45)
    kaggle_fig.tight_layout()

    diagnostic_plot_path_kaggle = os.path.join(output_dir, 'kaggle_type_distribution.png')
    kaggle_fig.savefig(diagnostic_plot_path_kaggle)
    print(f"Saved diagnostic plot to '{diagnostic_plot_path_kaggle}'")
    plt.show()

except FileNotFoundError:
    # The download directory did not contain the expected CSV.
    print("Could not find the Kaggle CSV file. Skipping the diagnostic plot for Kaggle.")
    print("Please ensure the Kaggle dataset downloaded correctly.")
except Exception as e:
    # Best-effort diagnostic: report the problem and let the notebook continue.
    print(f"An error occurred while analyzing the Kaggle dataset: {e}")

In [None]:
from datasets import load_dataset

def map_big5_to_mbti(row):
    """Heuristically map one record of Big Five scores to a pseudo-MBTI code.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing numeric
            'E', 'O', 'A' and 'C' entries.

    Returns:
        A four-letter string built from E/I, N/S, T/F and J/P. A score of
        exactly 50 falls on the second letter of each pair ('I', 'S', 'F', 'P').
    """
    # NOTE(review): the 50 cut-off presumes scores on a 0-100 scale - confirm
    # against the dataset card.
    i_e = 'E' if row['E'] > 50 else 'I'
    n_s = 'N' if row['O'] > 50 else 'S'
    t_f = 'T' if row['A'] < 50 else 'F'
    j_p = 'J' if row['C'] > 50 else 'P'
    return f"{i_e}{n_s}{t_f}{j_p}"


def map_big5_frame_to_mbti(scores):
    """Column-wise equivalent of `map_big5_to_mbti` for a whole DataFrame.

    Applies the same four threshold rules to entire columns at once, which
    avoids one Python-level function call per row.

    Args:
        scores: DataFrame with numeric 'E', 'O', 'A' and 'C' columns.

    Returns:
        A Series of four-letter codes aligned with `scores.index`.
    """
    i_e = scores['E'].gt(50).map({True: 'E', False: 'I'})
    n_s = scores['O'].gt(50).map({True: 'N', False: 'S'})
    # Agreeableness is the only trait compared with "<": low A maps to 'T'.
    t_f = scores['A'].lt(50).map({True: 'T', False: 'F'})
    j_p = scores['C'].gt(50).map({True: 'J', False: 'P'})
    return i_e + n_s + t_f + j_p


# Diagnostic cell: derive a pseudo-MBTI label for every row of the Hugging Face
# Big Five dataset and save a bar chart of the label counts into `output_dir`.
print("\n--- Investigating Pseudo-MBTI Type Distribution from Hugging Face ---")
try:
    print("Loading raw 'pandora-big5' dataset from Hugging Face...")
    ds = load_dataset("jingjietan/pandora-big5")
    df_hf_raw = ds['train'].to_pandas()
    print("Hugging Face dataset loaded successfully.")

    print("Generating pseudo-MBTI types from Big Five scores...")
    df_hf_raw['type'] = map_big5_frame_to_mbti(df_hf_raw)

    hf_fig, hf_ax = plt.subplots(figsize=(18, 8))
    # Bars are ordered from the most to the least frequent pseudo-type.
    hf_type_order = df_hf_raw['type'].value_counts().index
    # NOTE(review): seaborn >= 0.13 reportedly warns when `palette` is passed
    # without `hue` - confirm against the installed version.
    sns.countplot(x='type', data=df_hf_raw, order=hf_type_order, palette='crest', ax=hf_ax)
    hf_ax.set_title('Distribution of Pseudo-MBTI Types in the Hugging Face Dataset', fontsize=16)
    hf_ax.set_xlabel('Pseudo-MBTI Type (from Big Five)', fontsize=12)
    hf_ax.set_ylabel('Number of Entries', fontsize=12)
    hf_ax.tick_params(axis='x', labelrotation=45)
    hf_fig.tight_layout()

    diagnostic_plot_path_hf = os.path.join(output_dir, 'huggingface_type_distribution.png')
    hf_fig.savefig(diagnostic_plot_path_hf)
    print(f"Saved diagnostic plot to '{diagnostic_plot_path_hf}'")
    plt.show()

except Exception as e:
    # Best-effort diagnostic: report the problem and let the notebook continue.
    print(f"An error occurred while analyzing the Hugging Face dataset: {e}")

In [None]:
# Bar chart of how many rows of `df` carry each MBTI `type`, saved into
# `output_dir`.
print("\n--- Analyzing MBTI Type Distribution ---")
mbti_fig, mbti_ax = plt.subplots(figsize=(18, 8))
# Bars are ordered from the most to the least frequent type.
mbti_type_order = df['type'].value_counts().index
# NOTE(review): seaborn >= 0.13 reportedly warns when `palette` is passed
# without `hue` - confirm against the installed version.
sns.countplot(x='type', data=df, order=mbti_type_order, palette='viridis', ax=mbti_ax)
# Title typo fixed ("aDistribution" -> "Distribution"); it is rendered into the saved PNG.
mbti_ax.set_title('Concatenated Distribution of the 16 MBTI Personality Types', fontsize=16)
mbti_ax.set_xlabel('MBTI Type', fontsize=12)
mbti_ax.set_ylabel('Number of Posts', fontsize=12)
mbti_ax.tick_params(axis='x', labelrotation=45)
mbti_fig.tight_layout()  # Adjust layout to make room for labels
plot_path = os.path.join(output_dir, 'mbti_type_distribution.png')
mbti_fig.savefig(plot_path)
print(f"Saved plot to '{plot_path}'")
plt.show()