# MBTI Prediction: GloVe + Bi-LSTM (GPU Optimized)

This notebook implements MBTI personality prediction using:
- **Embedding**: GloVe (pretrained)
- **Model**: Bidirectional LSTM
- **Task**: 4 binary classifications (E/I, N/S, T/F, P/J)
- **Optimization**: mixed-precision (float16) training with GPU memory growth, targeting an NVIDIA RTX 3090 (24GB VRAM)

## 1. GPU Setup and Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import os
import urllib.request
import zipfile
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import mixed_precision

import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# GPU configuration: when TensorFlow sees at least one GPU, turn on memory
# growth for each device and switch the global Keras policy to mixed_float16.
print("Setting up GPU...")
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        mixed_precision.set_global_policy('mixed_float16')
        print(f"GPU: {len(gpus)} device(s) | Mixed Precision: Enabled")
except RuntimeError as err:
    # Report the configuration failure and carry on with TensorFlow defaults.
    print(err)

print(f"TensorFlow: {tf.__version__}")

## 2. Load Data

In [None]:
# Remote CSV locations of the training and test splits.
TRAIN_URL = 'https://raw.githubusercontent.com/beefed-up-geek/nlp_final_project/refs/heads/main/kaggle_data/2025MBTItrain.csv'
TEST_URL = 'https://raw.githubusercontent.com/beefed-up-geek/nlp_final_project/refs/heads/main/kaggle_data/2025test.csv'

# Read each split straight from its URL into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer=TRAIN_URL)
test_df = pd.read_csv(filepath_or_buffer=TEST_URL)

train_shape, test_shape = train_df.shape, test_df.shape
print(f"Train: {train_shape}, Test: {test_shape}")

## 3. Preprocessing

In [None]:
def preprocess_text(text):
    """Normalise one raw post string before tokenization.

    Processing steps, in order:
      1. lowercase the text;
      2. replace URLs (anything starting with ``http`` or ``www``) by a space;
      3. delete apostrophes so contractions stay a single token
         ("don't" -> "dont");
      4. replace every remaining character other than ``a-z``, whitespace
         and ``.,!?`` by a space, so separators such as ``|||`` or ``/`` split
         words instead of gluing them together;
      5. collapse runs of whitespace into single spaces and strip the ends.

    Args:
        text: Raw post text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell, or None) are treated as
            empty text instead of raising.

    Returns:
        The cleaned string; ``''`` when nothing remains.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # `http\S+` already covers `https...`, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', ' ', text)
    # Straight and curly apostrophes are dropped rather than turned into a gap.
    text = re.sub(r"['\u2019]", '', text)
    text = re.sub(r'[^a-z\s.,!?]', ' ', text)
    return ' '.join(text.split())

# Clean the raw posts of both splits (train first, then test).
for frame in (train_df, test_df):
    frame['cleaned_posts'] = frame['posts'].apply(preprocess_text)

# One binary target per position of the 4-letter type string:
# 1 when the letter at that position equals the listed letter, otherwise 0.
for position, (column, positive_letter) in enumerate(
    [('E_I', 'E'), ('N_S', 'N'), ('T_F', 'T'), ('P_J', 'P')]
):
    train_df[column] = train_df['type'].apply(
        lambda mbti, i=position, letter=positive_letter: 1 if mbti[i] == letter else 0
    )
print("Preprocessing complete!")

## 4. Tokenization

In [None]:
# Vocabulary cap for the tokenizer and fixed length of every encoded post.
MAX_WORDS = 20000
MAX_SEQUENCE_LENGTH = 400

# The vocabulary is fitted on the training posts; unknown words map to '<OOV>'.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_df['cleaned_posts'])


def _texts_to_padded(texts):
    """Encode texts as integer sequences, padded/truncated at the end to MAX_SEQUENCE_LENGTH."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(
        sequences,
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
        truncating='post',
    )


X_train_padded = _texts_to_padded(train_df['cleaned_posts'])
X_test_padded = _texts_to_padded(test_df['cleaned_posts'])

print(f"Train: {X_train_padded.shape}, Test: {X_test_padded.shape}")

## 5. Download and Load GloVe Embeddings

In [None]:
# Download GloVe embeddings if not exists
GLOVE_URL = 'http://nlp.stanford.edu/data/glove.6B.zip'
GLOVE_DIR = 'glove_embeddings'
EMBEDDING_DIM = 100
GLOVE_FILE = f'{GLOVE_DIR}/glove.6B.{EMBEDDING_DIM}d.txt'

if not os.path.exists(GLOVE_FILE):
    print("Downloading GloVe embeddings...")
    os.makedirs(GLOVE_DIR, exist_ok=True)
    zip_path = f'{GLOVE_DIR}/glove.6B.zip'

    try:
        urllib.request.urlretrieve(GLOVE_URL, zip_path)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Unpack only the file for the configured dimension; the archive's
            # other members are never read by this notebook.
            zip_ref.extract(os.path.basename(GLOVE_FILE), GLOVE_DIR)
    finally:
        # Remove the archive on success and on failure alike, so an
        # interrupted download or extraction leaves no stale zip behind.
        if os.path.exists(zip_path):
            os.remove(zip_path)
    print("Download complete!")
else:
    print("GloVe embeddings already exist")

# Parse the GloVe text file: each line is a token followed by its vector
# components, separated by whitespace.
print("Loading GloVe embeddings...")
embeddings_index = {}
with open(GLOVE_FILE, 'r', encoding='utf-8') as glove_handle:
    for raw_line in glove_handle:
        fields = raw_line.split()
        # First field is the word, the rest form its float32 vector.
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print(f"Loaded {len(embeddings_index)} word vectors")

# Create embedding matrix: row i holds the vector of the word with tokenizer
# index i. Row 0 is never assigned (tokenizer indices start at 1) and stays zero.
vocab_size = min(len(tokenizer.word_index) + 1, MAX_WORDS)
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM), dtype=np.float32)

# Dedicated, seeded generator so the random rows for words missing from GloVe
# are identical on every run instead of depending on global RNG state.
oov_rng = np.random.default_rng(42)

hits = 0
for word, idx in tokenizer.word_index.items():
    # word_index holds the full vocabulary; skip indices with no matrix row.
    if idx >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector
        hits += 1
    else:
        # Small random vector for words without a pretrained embedding.
        embedding_matrix[idx] = oov_rng.normal(0, 0.05, EMBEDDING_DIM)

print(f"Embedding matrix: {embedding_matrix.shape}")
print(f"Coverage: {hits}/{vocab_size} ({hits/vocab_size*100:.1f}%)")

## 6. Build Model

In [None]:
def create_bilstm_model(embedding_matrix, max_length, recurrent_dropout=0.0):
    """Build and compile a two-layer bidirectional LSTM binary classifier.

    Architecture: Embedding (initialised from ``embedding_matrix``, trainable)
    -> BiLSTM(128, returns sequences) -> BiLSTM(64) -> Dense(64, relu) ->
    Dropout(0.4) -> Dense(32, relu) -> Dropout(0.3) -> Dense(1, sigmoid).

    Args:
        embedding_matrix: 2-D array of shape (vocab_size, embedding_dim) used
            as the initial embedding weights.
        max_length: Number of token ids per input sequence.
        recurrent_dropout: Dropout rate on the LSTM recurrent state. Defaults
            to 0.0 because Keras only selects the cuDNN LSTM kernel when
            recurrent dropout is zero; pass 0.2 to reproduce the earlier,
            slower configuration.

    Returns:
        A compiled ``tf.keras`` Model (Adam, lr 0.001, binary cross-entropy,
        accuracy metric).
    """
    input_layer = Input(shape=(max_length,))

    # The Constant initializer loads the pretrained vectors without the
    # constructor-level `weights=` argument, and the deprecated `input_length`
    # is dropped because the Input layer already fixes the sequence length.
    embedding = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=True
    )(input_layer)

    x = Bidirectional(
        LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=recurrent_dropout)
    )(embedding)
    x = Bidirectional(
        LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=recurrent_dropout)
    )(x)

    x = Dense(64, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.3)(x)

    # The output layer is pinned to float32 so the sigmoid probability is not
    # computed in float16 when a mixed-precision policy is active.
    output = Dense(1, activation='sigmoid', dtype='float32')(x)

    model = Model(inputs=input_layer, outputs=output)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

## 7. Train Models

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 64
EPOCHS = 50
PATIENCE = 5   # epochs without val_accuracy improvement before early stopping
SEED = 42      # shared by the train/validation split and the framework RNGs

models = {}
histories = {}
dimensions = ['E_I', 'N_S', 'T_F', 'P_J']

# One independent binary classifier is trained per MBTI dimension.
for dim in dimensions:
    print(f"\n{'='*60}")
    print(f"Training {dim}")
    print(f"{'='*60}")

    # Re-seed Python, NumPy and TensorFlow before each model so weight
    # initialisation, dropout masks and batch shuffling repeat across runs
    # and do not depend on how many dimensions were trained before this one.
    # NOTE(review): GPU kernels may still introduce small non-determinism.
    tf.keras.utils.set_random_seed(SEED)

    y = train_df[dim].values
    # Stratified 80/20 split keeps the class ratio of this dimension in both parts.
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train_padded, y, test_size=0.2, random_state=SEED, stratify=y
    )

    model = create_bilstm_model(embedding_matrix, MAX_SEQUENCE_LENGTH)

    callbacks = [
        EarlyStopping(monitor='val_accuracy', patience=PATIENCE, restore_best_weights=True, verbose=1),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1)
    ]

    history = model.fit(
        X_tr, y_tr,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(X_val, y_val),
        callbacks=callbacks,
        verbose=1
    )

    # Score the model as it stands after training (and any weight restore).
    val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
    print(f"✓ Best Validation Accuracy: {val_acc:.4f}")

    models[dim] = model
    histories[dim] = history
    # Reset Keras' global state before the next model is built.
    tf.keras.backend.clear_session()

print("\nAll models trained!")

## 8. Generate Predictions

In [None]:
# Threshold each model's predicted probability at 0.5 to get hard 0/1 labels.
predictions = {
    dim: (
        models[dim].predict(X_test_padded, batch_size=BATCH_SIZE, verbose=0) > 0.5
    ).astype(int).flatten()
    for dim in dimensions
}

# Assemble the submission: the test IDs followed by one column per dimension.
submission_columns = {'ID': test_df['ID']}
for label_column in ('E_I', 'N_S', 'T_F', 'P_J'):
    submission_columns[label_column] = predictions[label_column]
submission = pd.DataFrame(submission_columns)

submission.to_csv('submission_glove_lstm.csv', index=False)
print("✓ Submission saved: submission_glove_lstm.csv")

# Summary: highest validation accuracy recorded in each training history.
rule = "=" * 60
print("\n" + rule)
print("RESULTS")
print(rule)
for dim in dimensions:
    best_val_accuracy = max(histories[dim].history['val_accuracy'])
    print(f"{dim}: {best_val_accuracy:.4f}")
print(rule)