# MBTI Prediction: GloVe + CNN-LSTM Hybrid (GPU Optimized)

This notebook implements MBTI personality prediction using:
- **Embedding**: GloVe 6B, 100-dimensional (pretrained, fine-tuned during training)
- **Model**: CNN-LSTM Hybrid
- **Task**: 4 binary classifications (E/I, N/S, T/F, P/J)
- **Optimization**: mixed-precision (float16) training and on-demand GPU memory growth, targeting an NVIDIA RTX 3090 (24GB VRAM)

## 1. GPU Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import re, os, urllib.request, zipfile
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import mixed_precision

import nltk
# Fetch the NLTK 'punkt' and 'punkt_tab' resources without progress output.
# NOTE(review): no cell visible in this notebook calls an NLTK tokenizer —
# confirm these downloads are still needed before keeping the network dependency.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# GPU Setup
# Memory growth must be configured before TensorFlow initialises the devices.
# Re-running this cell on a live kernel would otherwise raise RuntimeError, so
# the failure is reported instead of aborting the notebook.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        print(f"Memory growth left unchanged (GPU already initialized): {err}")
    # float16 compute with float32 variables; only enabled when a GPU is present.
    mixed_precision.set_global_policy('mixed_float16')
    print(f"GPU: {len(gpus)} | Mixed Precision: ON")
else:
    # Make the CPU fallback visible instead of silently skipping the setup.
    print("GPU: none detected | Mixed Precision: OFF")
print(f"TensorFlow: {tf.__version__}")

## 2-4. Load Data, Preprocess & Tokenize

In [None]:
# Source CSVs are read directly from the project's GitHub repository.
TRAIN_URL = 'https://raw.githubusercontent.com/beefed-up-geek/nlp_final_project/refs/heads/main/kaggle_data/2025MBTItrain.csv'
TEST_URL = 'https://raw.githubusercontent.com/beefed-up-geek/nlp_final_project/refs/heads/main/kaggle_data/2025test.csv'

# Train first, then test — one DataFrame per split.
train_df, test_df = (pd.read_csv(csv_url) for csv_url in (TRAIN_URL, TEST_URL))

def preprocess_text(text):
    """Normalise one raw post string for tokenisation.

    Steps: lowercase, drop URL-like tokens (anything starting with ``http``
    or ``www``), delete every character that is not an ASCII letter,
    whitespace or one of ``. , ! ?``, then collapse whitespace runs to single
    spaces.

    Args:
        text: The raw post. Non-string values (e.g. the NaN that pandas
            produces for an empty CSV cell, or None) are treated as empty.

    Returns:
        The cleaned string; ``''`` when there is nothing left or the input
        was not a string.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # `http\S+` already covers https links, and no anchors are used, so no flags are needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s.,!?]', '', text)
    return ' '.join(text.split())

# Run the same cleaning routine over the raw posts of both splits.
for frame in (train_df, test_df):
    frame['cleaned_posts'] = frame['posts'].apply(preprocess_text)

# One binary target per MBTI axis: 1 when the character at that position of
# `type` equals the first letter of the pair (E, N, T, P), otherwise 0.
axis_spec = zip(['E_I', 'N_S', 'T_F', 'P_J'], ['E', 'N', 'T', 'P'])
for position, (column, letter) in enumerate(axis_spec):
    train_df[column] = train_df['type'].apply(
        lambda mbti, position=position, letter=letter: int(mbti[position] == letter)
    )

MAX_WORDS = 20000   # vocabulary cap passed to the tokenizer
MAX_LENGTH = 400    # every sequence is padded / truncated to this many tokens

# The vocabulary is learned from the training posts only.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['cleaned_posts'])

# Convert the texts to integer ids, then pad / cut at the end of each sequence.
train_sequences = tokenizer.texts_to_sequences(train_df['cleaned_posts'])
test_sequences = tokenizer.texts_to_sequences(test_df['cleaned_posts'])

pad_options = dict(maxlen=MAX_LENGTH, padding='post', truncating='post')
X_train = pad_sequences(train_sequences, **pad_options)
X_test = pad_sequences(test_sequences, **pad_options)

print(f"Data ready: Train {X_train.shape}, Test {X_test.shape}")

## 5. Load GloVe Embeddings

In [None]:
GLOVE_DIR = 'glove_embeddings'
EMBEDDING_DIM = 100
GLOVE_FILE = f'{GLOVE_DIR}/glove.6B.{EMBEDDING_DIM}d.txt'

# Download the archive only when the vectors are not already cached locally.
if not os.path.exists(GLOVE_FILE):
    print("Downloading GloVe...")
    os.makedirs(GLOVE_DIR, exist_ok=True)
    zip_path = f'{GLOVE_DIR}/glove.6B.zip'
    try:
        urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', zip_path)
        with zipfile.ZipFile(zip_path, 'r') as z:
            # Unpack only the file for EMBEDDING_DIM rather than the whole archive.
            z.extract(os.path.basename(GLOVE_FILE), GLOVE_DIR)
    finally:
        # Remove the archive even if the download or extraction failed, so a
        # retry starts from a clean state instead of a truncated zip.
        if os.path.exists(zip_path):
            os.remove(zip_path)

# word -> vector lookup parsed from the text file (one "word v1 v2 ..." entry per line).
embeddings_index = {}
with open(GLOVE_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Rows are indexed by tokenizer id; ids >= MAX_WORDS are skipped below, so they get no row.
vocab_size = min(len(tokenizer.word_index) + 1, MAX_WORDS)
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM), dtype=np.float32)

# Dedicated seeded generator: words missing from GloVe get the same random
# vectors on every run, independent of the global NumPy RNG state.
oov_rng = np.random.default_rng(42)

for word, idx in tokenizer.word_index.items():
    if idx >= MAX_WORDS:
        continue
    vec = embeddings_index.get(word)
    # Row 0 is never assigned here and stays all-zero.
    embedding_matrix[idx] = vec if vec is not None else oov_rng.normal(0, 0.05, EMBEDDING_DIM)

print(f"Embedding matrix: {embedding_matrix.shape}")

## 6. Build CNN-LSTM Hybrid Model

In [None]:
def create_hybrid_model(emb_matrix, max_len, recurrent_dropout=0.2):
    """Build and compile the CNN -> BiLSTM binary classifier.

    Architecture: trainable embedding initialised from ``emb_matrix`` ->
    Conv1D(128) -> MaxPooling1D(2) -> Conv1D(64) -> Bidirectional LSTM(64) ->
    Dense(64) -> Dropout -> Dense(32) -> Dropout -> sigmoid unit.

    Args:
        emb_matrix: 2-D array ``(vocab_size, embedding_dim)`` holding the
            initial embedding weights.
        max_len: Length of the (padded) integer input sequences.
        recurrent_dropout: Recurrent dropout rate of the LSTM. The default
            keeps the original regularisation. NOTE: per the Keras LSTM
            documentation, any non-zero value prevents the fused cuDNN kernel
            from being used; pass ``0.0`` to allow the fast GPU path.

    Returns:
        A compiled Keras ``Model`` (Adam at 1e-3, binary cross-entropy,
        accuracy metric) producing one float32 probability per sample.
    """
    vocab_size, emb_dim = emb_matrix.shape

    inp = Input(shape=(max_len,))
    # Seed the layer through an initializer instead of the legacy `weights=`
    # constructor kwarg, which not every Keras release accepts.
    emb = Embedding(
        vocab_size,
        emb_dim,
        embeddings_initializer=tf.keras.initializers.Constant(emb_matrix),
        trainable=True,
    )(inp)

    # CNN: local n-gram features; the pooling halves the sequence length.
    cnn = Conv1D(128, 3, activation='relu', padding='same')(emb)
    cnn = MaxPooling1D(2)(cnn)
    cnn = Conv1D(64, 3, activation='relu', padding='same')(cnn)

    # LSTM: reads the convolved sequence in both directions.
    lstm = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=recurrent_dropout))(cnn)

    # Dense classification head.
    x = Dense(64, activation='relu')(lstm)
    x = Dropout(0.4)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.3)(x)
    # The output is pinned to float32 regardless of the global dtype policy.
    out = Dense(1, activation='sigmoid', dtype='float32')(x)

    model = Model(inp, out)
    model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

## 7. Train Models

In [None]:
BATCH_SIZE = 64
models = {}      # axis name -> trained model
histories = {}   # axis name -> History object returned by fit()

# One independent binary classifier is trained per MBTI axis.
for dim in ['E_I', 'N_S', 'T_F', 'P_J']:
    banner = '=' * 60
    print(f"\n{banner}\nTraining {dim}\n{banner}")

    # Stratified 80/20 hold-out on this axis' labels.
    labels = train_df[dim]
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, labels.values, test_size=0.2, random_state=42, stratify=labels
    )

    model = create_hybrid_model(embedding_matrix, MAX_LENGTH)

    # Stop on stalled validation accuracy (best weights restored); halve the
    # learning rate when validation loss plateaus.
    stop_early = EarlyStopping('val_accuracy', patience=5, restore_best_weights=True, verbose=1)
    shrink_lr = ReduceLROnPlateau('val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1)

    history = model.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=BATCH_SIZE,
        callbacks=[stop_early, shrink_lr],
        verbose=1,
    )

    best_val_accuracy = max(history.history['val_accuracy'])
    print(f"✓ Best: {best_val_accuracy:.4f}")

    models[dim], histories[dim] = model, history
    tf.keras.backend.clear_session()

## 8-9. Predict & Save

In [None]:
# Hard 0/1 prediction per axis: threshold each model's sigmoid output at 0.5.
predictions = {}
for dim in ['E_I', 'N_S', 'T_F', 'P_J']:
    probabilities = models[dim].predict(X_test, batch_size=BATCH_SIZE, verbose=0)
    predictions[dim] = (probabilities > 0.5).astype(int).flatten()

# Submission layout: the test IDs followed by one column per axis.
submission = pd.DataFrame({'ID': test_df['ID'], **predictions})
submission.to_csv('submission_glove_hybrid.csv', index=False)
print("✓ Saved: submission_glove_hybrid.csv")

# Recap of the best validation accuracy recorded for each axis.
rule = "=" * 60
print("\n" + rule + "\nRESULTS\n" + rule)
for dim in ['E_I', 'N_S', 'T_F', 'P_J']:
    best_val_accuracy = max(histories[dim].history['val_accuracy'])
    print(f"{dim}: {best_val_accuracy:.4f}")
print(rule)