## Getting the Data

In [1]:
!pip install -q kaggle

In [2]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = "content"

In [3]:
!kaggle datasets download -d datasnaek/mbti-type

Dataset URL: https://www.kaggle.com/datasets/datasnaek/mbti-type
License(s): CC0-1.0
mbti-type.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
import zipfile

with zipfile.ZipFile('mbti-type.zip', 'r') as zip_ref:
    zip_ref.extractall('content/mbti')

In [5]:
import pandas as pd

# load the MBTI dataset
df = pd.read_csv('content/mbti/mbti_1.csv')

## Cleaning the Data

In [6]:
# !pip install -q transformers
# !pip install -q datasets

In [9]:
def clean_text(text):
  text = text.lower() # convert text to lower case
  text = re.sub(r'http\S+|www\S+', '', text) # remove URLs
  text = re.sub(r'[^a-z\s]', ' ', text) # remove special characters
  tokens = [lemmatizer.lemmatize(word) for word in text.split()
              if word not in stop_words and word not in mbti_terms]
  return " ".join(tokens)

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import torch
import numpy as np
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zache\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zache\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


mbti_terms = {'infj', 'intj', 'entp', 'enfp', 'istj', 'isfj', 'estp', 'esfp',
              'istp', 'isfp', 'estj', 'esfj', 'intp', 'infp', 'entj', 'enfj'}

In [13]:
import re

df['cleaned_posts'] = df['posts'].apply(clean_text)

## Adding Features to the Data

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()

In [14]:
X_counts = count_vectorizer.fit_transform(df['cleaned_posts'])

In [15]:
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [17]:
df['token_counts'] = list(X_counts.toarray())
df['tfidf_values'] = list(X_tfidf.toarray())

In [18]:
# add separate columns for each MBTI axis
df['IE'] = df['type'].apply(lambda x: 'I' if 'I' in x else 'E')
df['NS'] = df['type'].apply(lambda x: 'N' if 'N' in x else 'S')
df['TF'] = df['type'].apply(lambda x: 'T' if 'T' in x else 'F')
df['JP'] = df['type'].apply(lambda x: 'J' if 'J' in x else 'P')

df = df.drop(columns=['type'])

# Training

In [19]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

X = X_tfidf
axes = ['IE', 'NS', 'TF', 'JP']

param_grid = {'C': [0.1, 1, 10], 'solver': ['liblinear']}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [22]:
results = {}
for axis in axes:
    print(f"Training model for {axis} axis")

    # target
    y = df[axis]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    grid_search = GridSearchCV(
        LogisticRegression(), param_grid, cv=skf, scoring='accuracy', n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred)

    results[axis] = {
        'best_params': grid_search.best_params_,
        'classification_report': report
    }
    print(f"Best Params for {axis}: {grid_search.best_params_}")
    print(report)


for axis, result in results.items():
    print(f"Axis: {axis}")
    print(f"Best Parameters: {result['best_params']}")
    print("Classification Report:")
    print(result['classification_report'])
    print("\n")

Training model for IE axis
Best Params for IE: {'C': 10, 'solver': 'liblinear'}
              precision    recall  f1-score   support

           E       0.69      0.29      0.41       400
           I       0.82      0.96      0.88      1335

    accuracy                           0.81      1735
   macro avg       0.75      0.63      0.65      1735
weighted avg       0.79      0.81      0.77      1735

Training model for NS axis
Best Params for NS: {'C': 10, 'solver': 'liblinear'}
              precision    recall  f1-score   support

           N       0.88      0.99      0.93      1496
           S       0.79      0.16      0.26       239

    accuracy                           0.88      1735
   macro avg       0.84      0.58      0.60      1735
weighted avg       0.87      0.88      0.84      1735

Training model for TF axis
Best Params for TF: {'C': 1, 'solver': 'liblinear'}
              precision    recall  f1-score   support

           F       0.83      0.84      0.84       93

## LIME - Explaining the Model's Output