In [45]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re

import warnings
warnings.filterwarnings('ignore')

In [47]:
# Load and preprocess data
df = pd.read_csv('mbti_dataset.csv')

In [48]:
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [49]:
df['I/E'] = df['type'].apply(lambda x: 0 if x[0] == 'I' else 1)  # Introversion/Extraversion
df['S/N'] = df['type'].apply(lambda x: 0 if x[1] == 'S' else 1)  # Sensing/Intuition
df['T/F'] = df['type'].apply(lambda x: 0 if x[2] == 'T' else 1)  # Thinking/Feeling
df['J/P'] = df['type'].apply(lambda x: 0 if x[3] == 'J' else 1)  # Judging/Perceiving

In [50]:
df.head()

Unnamed: 0,type,posts,I/E,S/N,T/F,J/P
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0,1,1,0
1,ENTP,'I'm finding the lack of me in these posts ver...,1,1,0,1
2,INTP,'Good one _____ https://www.youtube.com/wat...,0,1,0,1
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",0,1,0,0
4,ENTJ,'You're fired.|||That's another silly misconce...,1,1,0,0


In [52]:
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return text


df['posts'] = df['posts'].apply(preprocess_text)

In [53]:
df.head()

Unnamed: 0,type,posts,I/E,S/N,T/F,J/P
0,INFJ,and intj moments sportscenter not top ten ...,0,1,1,0
1,ENTP,im finding the lack of me in these posts very ...,1,1,0,1
2,INTP,good one course to which i say i know tha...,0,1,0,1
3,INTJ,dear intp i enjoyed our conversation the oth...,0,1,0,0
4,ENTJ,youre firedthats another silly misconception t...,1,1,0,0


In [54]:
# Create feature-target split
X = df['posts']
y = df[['I/E', 'S/N', 'T/F', 'J/P']]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
# Feature extraction
vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [56]:
from sklearn.multioutput import MultiOutputClassifier

clf = MultiOutputClassifier(MultinomialNB())
clf.fit(X_train_features, y_train)

# Predict MBTI types
y_pred = clf.predict(X_test_features)

In [57]:
# Evaluate performance using appropriate metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       382
           1       0.86      1.00      0.92      1489
           2       0.54      1.00      0.70       937
           3       0.61      1.00      0.76      1066

   micro avg       0.67      0.90      0.77      3874
   macro avg       0.50      0.75      0.60      3874
weighted avg       0.63      0.90      0.73      3874
 samples avg       0.67      0.90      0.74      3874

