In [1]:
import pandas as pd
import numpy as np
kaggle_data = pd.read_csv('/kaggle/input/mbti-type/mbti_1.csv')

kaggle_data.head()

kaggle_data["type"].value_counts()
kaggle_data["posts"].count()
kaggle_data["type"] = kaggle_data["type"].str.upper().str[0].map({
    "I": "Introvert",
    "E": "Extrovert"
})
kaggle_data.head()
kaggle_data["type"].value_counts()
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Separate posts
intro_posts = " ".join(kaggle_data[kaggle_data["type"] == "Introvert"]["posts"].astype(str))
extro_posts = " ".join(kaggle_data[kaggle_data["type"] == "Extrovert"]["posts"].astype(str))

In [19]:
# Replace '|||' separator with space
kaggle_data['posts'] = kaggle_data['posts'].str.replace('|||', ' ', regex=False)

# Verify (no.4 specifically)
print(kaggle_data['posts'].head(6))

0    'http://www.youtube.com/watch?v=qsXHcwe3krw ht...
1    'I'm finding the lack of me in these posts ver...
2    'Good one  _____   https://www.youtube.com/wat...
3    'Dear INTP,   I enjoyed our conversation the o...
4    'You're fired. That's another silly misconcept...
5    '18/37 @.@ Science  is not perfect. No scienti...
Name: posts, dtype: object


In [20]:
# lowercase
kaggle_data['posts'] = kaggle_data['posts'].str.lower()

# Verify
print(kaggle_data['posts'].head())

0    'http://www.youtube.com/watch?v=qsxhcwe3krw ht...
1    'i'm finding the lack of me in these posts ver...
2    'good one  _____   https://www.youtube.com/wat...
3    'dear intp,   i enjoyed our conversation the o...
4    'you're fired. that's another silly misconcept...
Name: posts, dtype: object


In [21]:
# Remove URLs
import re

def remove_urls(text):
    return re.sub(r'http\S+', '', text)

kaggle_data['posts'] = kaggle_data['posts'].apply(remove_urls)

In [22]:
# Remove MBTI Keywords (Prevent data leakage)

mbti_types = [
    'infj', 'entp', 'intp', 'intj', 'entj', 'enfj', 'infp', 'enfp',
    'isfp', 'istp', 'isfj', 'istj', 'estp', 'esfp', 'estj', 'esfj',
    'introvert', 'extrovert'  
]

def remove_leakage_words(text):
    # join all types into one pattern: (infj|entp|intp|...)
    pattern = r'\b(?:' + '|'.join(mbti_types) + r')\b'
    return re.sub(pattern, '', text)

kaggle_data['posts'] = kaggle_data['posts'].apply(remove_leakage_words)

# verify
print(kaggle_data['posts'].head())

0    '   and  moments    sportscenter not top ten p...
1    'i'm finding the lack of me in these posts ver...
2    'good one  _____    of course, to which i say ...
3    'dear ,   i enjoyed our conversation the other...
4    'you're fired. that's another silly misconcept...
Name: posts, dtype: object


In [23]:
# Remove Punctuation & Numbers
import nltk
from nltk.corpus import stopwords

def remove_noise(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

kaggle_data['posts'] = kaggle_data['posts'].apply(remove_noise)
print(kaggle_data['posts'].head())

0       and  moments    sportscenter not top ten pl...
1    im finding the lack of me in these posts very ...
2    good one  _____    of course to which i say i ...
3    dear    i enjoyed our conversation the other d...
4    youre fired thats another silly misconception ...
Name: posts, dtype: object


In [24]:
import emoji

# Convert Emoji 
def convert_emojis(text):
    text = emoji.demojize(text)
    text = text.replace(":", "").replace("_", " ")
    return text

def emoji_process(text):
    return convert_emojis(text)

kaggle_data['posts'] = kaggle_data['posts'].apply(emoji_process)

# verify
print(kaggle_data['posts'].head())


0       and  moments    sportscenter not top ten pl...
1    im finding the lack of me in these posts very ...
2    good one           of course to which i say i ...
3    dear    i enjoyed our conversation the other d...
4    youre fired thats another silly misconception ...
Name: posts, dtype: object


In [25]:
# Remove Stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    # Split into words, check if stopword, join 
    return ' '.join([word for word in str(text).split() if word not in stop_words])
    
kaggle_data['posts'] = kaggle_data['posts'].apply(remove_stopwords)

# Verify
print(kaggle_data['posts'].head())

0    moments sportscenter top ten plays pranks life...
1    im finding lack posts alarming sex boring posi...
2    good one course say know thats blessing curse ...
3    dear enjoyed conversation day esoteric gabbing...
4    youre fired thats another silly misconception ...
Name: posts, dtype: object


In [28]:
# Lemmatize
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

#  map NLTK tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # Default to Noun

def lemmatize_text_smart(text):
    # split, tag , join
    tokens = str(text).split()
    pos_tags = nltk.pos_tag(tokens)
    return " ".join([lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags])

print("lemmatizing...")
kaggle_data['posts'] = kaggle_data['posts'].apply(lemmatize_text_smart)
print("Complete")

# verify
print(kaggle_data['posts'].head())

lemmatizing...
Complete
0    moment sportscenter top ten play prank lifecha...
1    im find lack post alarm sex boring position of...
2    good one course say know thats bless curse abs...
3    dear enjoyed conversation day esoteric gabbing...
4    youre fire thats another silly misconception a...
Name: posts, dtype: object


In [29]:
# Save CSV 
kaggle_data.to_csv('mbti_traditional_new.csv', index=False)

print("File saved successfully as 'mbti_traditional_new.csv'")

File saved successfully as 'mbti_traditional_new.csv'


In [None]:
# nltk.download('punkt')
# nltk.download('punkt_tab')
# !pip uninstall -y scikit-learn sklearn imbalanced-learn
# !pip install -U scikit-learn imbalanced-learn

In [9]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.models import FastText
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report, confusion_matrix
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.decomposition import TruncatedSVD             
from scipy.sparse import hstack                           

mbti_data_cleaned = pd.read_csv('/kaggle/input/mbti-preprocessed/mbti_traditional_new.csv')

# remove NaN texts (TF-IDF cannot handle np.nan)
mbti_data_cleaned['posts'] = mbti_data_cleaned['posts'].fillna('')  

print(mbti_data_cleaned['type'].value_counts())
print(mbti_data_cleaned['type'].value_counts(normalize=True))  

type
Introvert    6676
Extrovert    1999
Name: count, dtype: int64
type
Introvert    0.769568
Extrovert    0.230432
Name: proportion, dtype: float64


In [10]:
# Tokenize
print("Tokenizing...")
mbti_data_cleaned['tokens'] = mbti_data_cleaned['posts'].apply(lambda x: word_tokenize(str(x)))

# Encode Extrovert = 0, Introvert = 1
print("Encoding...")
le = LabelEncoder()
y = le.fit_transform(mbti_data_cleaned['type'])

X = mbti_data_cleaned[['posts', 'tokens']]  

# Split Data 80/20
print("Splitting...")
X_train, X_test, y_train, y_test = train_test_split(   
    X, 
    y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)


X_train_text = X_train['posts'].tolist()  
X_test_text = X_test['posts'].tolist()    
X_train_tokens = X_train['tokens'].tolist()  
X_test_tokens = X_test['tokens'].tolist()    

Tokenizing...
Encoding...
Splitting...


In [11]:
# Train FastText on training data (unsupervised)
print("Training FastText Model...")
fasttext_model = FastText(                 #use 300-dim FastText
    sentences=X_train_tokens, 
    vector_size=300,                       
    window=5, 
    min_count=2, 
    sg=1
)

def get_w2v_vectors(data, model, vector_size=300):     
    vectors = []
    for sentence in data:
        sentence_vec = np.zeros(vector_size)
        count = 0
        for word in sentence:
            if word in model.wv.key_to_index:
                sentence_vec += model.wv[word]
                count += 1
        if count != 0:
            sentence_vec /= count
        vectors.append(sentence_vec)
    return np.array(vectors)

print("Vectorizing Data with FastText...")
X_train_fasttext = get_w2v_vectors(X_train_tokens, fasttext_model, vector_size=300)  #HERE
X_test_fasttext = get_w2v_vectors(X_test_tokens, fasttext_model, vector_size=300)    #HERE

Training FastText Model...
Vectorizing Data with FastText...


In [12]:
#TF-IDF word-level (1–2 grams)
print("Building TF-IDF (word-level)...")
tfidf_word = TfidfVectorizer(                     
    ngram_range=(1, 2),
    max_features=20000
)
X_train_tfidf_word = tfidf_word.fit_transform(X_train_text)   
X_test_tfidf_word = tfidf_word.transform(X_test_text)         

#TF-IDF char-level (3–5 grams)
print("Building TF-IDF (char-level)...")
tfidf_char = TfidfVectorizer(                     
    analyzer='char',
    ngram_range=(3, 5),
    max_features=20000
)
X_train_tfidf_char = tfidf_char.fit_transform(X_train_text)   
X_test_tfidf_char = tfidf_char.transform(X_test_text)        

#Combine word + char
print("Combining word & char TF-IDF...")
X_train_tfidf = hstack([X_train_tfidf_word, X_train_tfidf_char])  
X_test_tfidf = hstack([X_test_tfidf_word, X_test_tfidf_char])     

Building TF-IDF (word-level)...
Building TF-IDF (char-level)...
Combining word & char TF-IDF...


In [13]:
#Dimensionality reduction with TruncatedSVD (LSA)
print("Applying TruncatedSVD on TF-IDF...")
svd = TruncatedSVD(n_components=300, random_state=42)        
X_train_tfidf_svd = svd.fit_transform(X_train_tfidf)         
X_test_tfidf_svd = svd.transform(X_test_tfidf)               

#Simple extra features – text length stats
def length_features(token_list):                             
    n_words = len(token_list)
    if n_words == 0:
        return [0.0, 0.0, 0.0]
    avg_len = np.mean([len(w) for w in token_list])
    unique_ratio = len(set(token_list)) / n_words
    return [n_words, avg_len, unique_ratio]

print("Computing length-based features...")
X_train_len_feats = np.array([length_features(toks) for toks in X_train_tokens])  
X_test_len_feats = np.array([length_features(toks) for toks in X_test_tokens])    

#feature fusion
print("Fusing TF-IDF(SVD) + FastText + length features...")
X_train_final = np.hstack([X_train_tfidf_svd, X_train_fasttext, X_train_len_feats])  
X_test_final = np.hstack([X_test_tfidf_svd, X_test_fasttext, X_test_len_feats])      


Applying TruncatedSVD on TF-IDF...
Computing length-based features...
Fusing TF-IDF(SVD) + FastText + length features...


In [14]:
#Oversample minority class (extrovert)
print("Oversampling minority class...")
ros = RandomOverSampler(random_state=42)                        
X_train_res, y_train_res = ros.fit_resample(X_train_final, y_train) 
print(f"New train class distribution: {np.bincount(y_train_res)}")   

Oversampling minority class...
New train class distribution: [5341 5341]


In [15]:
# Define LR and RF
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000, 
        class_weight='balanced',         
        n_jobs=-1
    ),
    'Random Forest': RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
        n_estimators=300,                
        n_jobs=-1
    )
}

results = {'Model': [], 'Train Accuracy': [], 'Test Accuracy': []}

In [16]:
print("Training Classifiers...")
for model_name, model in models.items():
    model.fit(X_train_res, y_train_res)                       
    train_acc = model.score(X_train_res, y_train_res)
    test_acc = model.score(X_test_final, y_test)

    results['Model'].append(model_name + " (TFIDF+FT)")
    results['Train Accuracy'].append(train_acc)
    results['Test Accuracy'].append(test_acc)
    print(f"{model_name}: Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")

    # F1, confusion matrix
    y_pred = model.predict(X_test_final)
    print(f"\nClassification Report for {model_name}:\n")
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-"*60)

Training Classifiers...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression: Train Acc: 0.7738, Test Acc: 0.7499

Classification Report for Logistic Regression:

              precision    recall  f1-score   support

   Extrovert       0.47      0.64      0.54       400
   Introvert       0.88      0.78      0.83      1335

    accuracy                           0.75      1735
   macro avg       0.67      0.71      0.68      1735
weighted avg       0.78      0.75      0.76      1735

Confusion Matrix:
 [[ 255  145]
 [ 289 1046]]
------------------------------------------------------------
Random Forest: Train Acc: 1.0000, Test Acc: 0.7729

Classification Report for Random Forest:

              precision    recall  f1-score   support

   Extrovert       0.53      0.12      0.20       400
   Introvert       0.79      0.97      0.87      1335

    accuracy                           0.77      1735
   macro avg       0.66      0.55      0.54      1735
weighted avg       0.73      0.77      0.71      1735

Confusion Matrix:
 [[  50  350]
 [  44 

In [22]:
import numpy as np
from scipy.sparse import hstack as sparse_hstack  

def predict_personality(text):
   
    text_clean = text.lower()
    tfidf_w = tfidf_word.transform([text_clean])
    tfidf_c = tfidf_char.transform([text_clean])
    tfidf_combo = sparse_hstack([tfidf_w, tfidf_c])
    svd_vec = svd.transform(tfidf_combo)
    tokens = text_clean.split()
    ft_vec = np.zeros(300)
    count = 0
    for w in tokens:
        if w in fasttext_model.wv.key_to_index:
            ft_vec += fasttext_model.wv[w]
            count += 1
    if count > 0:
        ft_vec /= count
    ft_vec = ft_vec.reshape(1, -1)
    n_words = len(tokens)
    avg_len = np.mean([len(w) for w in tokens]) if n_words > 0 else 0
    uniq = len(set(tokens))/n_words if n_words > 0 else 0
    length_feats = np.array([n_words, avg_len, uniq]).reshape(1, -1)
    final_vec = np.hstack([svd_vec, ft_vec, length_feats])

    # Use trained LR or RF
    logreg_model = models["Logistic Regression"]
    pred = logreg_model.predict(final_vec)
    return le.inverse_transform(pred)[0]

In [29]:
# textbook extrovert vs "Reddit-style" extrovert (casual, opinionated)
textbook_extrovert = "I feel energized when i am around people."
reddit_extrovert = "Lmao that is hilarious! I literally shouted at my screen. We should totally do a meetup for this sub, it would be chaotic but fun."

print(f"Textbook Extrovert: {predict_personality(textbook_extrovert)}")
print(f"Reddit Extrovert:   {predict_personality(reddit_extrovert)}")

Textbook Extrovert: Introvert
Reddit Extrovert:   Extrovert


In [28]:
# textbook introvert vs "Reddit-style" introvert (casual, opinionated)
textbook_introvert = "I recharge my energy by spending time alone. Social interactions often feel draining to me, and I prefer deep, one-on-one conversations over large groups."
reddit_introvert = "Ugh, honestly I just want to stay in my room and play video games all weekend. People are so exhausting lol. Does anyone else feel like hiding when the doorbell rings?"

print(f"Textbook Introvert: {predict_personality(textbook_introvert)}")
print(f"Reddit Introvert:   {predict_personality(reddit_introvert)}")

Textbook Introvert: Introvert
Reddit Introvert:   Introvert


***Save model***

In [32]:
import joblib
import os

save_dir = "/kaggle/working/RF"
os.makedirs(save_dir, exist_ok=True)
print("Folder created at:", save_dir)
models["Random Forest"] #set LR or RF
rf_model = models["Random Forest"]  
joblib.dump(rf_model, f"{save_dir}/random_forest_model.pkl")
joblib.dump(tfidf_word, f"{save_dir}/tfidf_word.pkl")
joblib.dump(tfidf_char, f"{save_dir}/tfidf_char.pkl")
joblib.dump(svd, f"{save_dir}/svd.pkl")
joblib.dump(le,    f"{save_dir}/label_encoder.pkl")
fasttext_model.save(f"{save_dir}/fasttext.model")
print("saved to /kaggle/working/RF/")

Folder created at: /kaggle/working/RF
saved to /kaggle/working/RF/
