## Imports

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras import backend as K
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM,
    Dense, Dropout, Softmax, Lambda, Layer
)
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## NLTK Setup

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]
    return ' '.join(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Load & Preprocess Data



In [5]:
data = pd.read_csv('/content/drive/MyDrive/ML_Project/mbti_1.csv')
data['clean_posts'] = data['posts'].apply(preprocess_text)

train_df, test_df = train_test_split(data, test_size=0.25, stratify=data['type'], random_state=42)


## Tokenize & Pad

In [7]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['clean_posts'])

X_train = tokenizer.texts_to_sequences(train_df['clean_posts'])
X_test = tokenizer.texts_to_sequences(test_df['clean_posts'])

max_len = max(len(x) for x in X_train)
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post')

## Encode Targets

In [8]:
le = LabelEncoder()
y_train = le.fit_transform(train_df['type'])
y_test = le.transform(test_df['type'])

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights))

## Load GloVe Embeddings

In [9]:
def load_glove(path, dim=100):
    embeddings_index = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

glove_path = '/content/drive/MyDrive/ML_Project/glove.6B.100d.txt'  # Update path
embedding_dim = 100
glove_embeddings = load_glove(glove_path, dim=embedding_dim)

word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in glove_embeddings:
        embedding_matrix[i] = glove_embeddings[word]


##Attention Layer

In [13]:
class AttentionLayer(Layer):
    def build(self, input_shape):
        self.W = self.add_weight(shape=(input_shape[-1], 1), initializer='glorot_uniform', trainable=True)
        super().build(input_shape)

    def call(self, inputs):
        score = tf.nn.tanh(tf.tensordot(inputs, self.W, axes=1))
        attention_weights = tf.nn.softmax(score, axis=1)
        context = tf.reduce_sum(attention_weights * inputs, axis=1)
        return context

## Build Custom LSTM Model

In [14]:
def build_model(vocab_size, embedding_dim, max_len, num_classes):
    input_layer = Input(shape=(max_len,))
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_len,
                                trainable=False)(input_layer)

    x = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)
    x = Dropout(0.5)(x)
    x = AttentionLayer()(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    output_layer = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

model = build_model(len(word_index)+1, embedding_dim, max_len, len(le.classes_))
model.summary()



In [19]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=64,
    class_weight=class_weights,
    callbacks=callbacks
)

Epoch 1/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 93ms/step - accuracy: 0.1307 - loss: 2.6667 - val_accuracy: 0.1355 - val_loss: 2.7619 - learning_rate: 5.0000e-05
Epoch 2/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 92ms/step - accuracy: 0.0811 - loss: 2.7603 - val_accuracy: 0.0489 - val_loss: 2.7622 - learning_rate: 5.0000e-05
Epoch 3/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 95ms/step - accuracy: 0.0818 - loss: 2.8025 - val_accuracy: 0.1577 - val_loss: 2.7581 - learning_rate: 5.0000e-05
Epoch 4/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 93ms/step - accuracy: 0.0977 - loss: 2.7684 - val_accuracy: 0.0493 - val_loss: 2.7575 - learning_rate: 5.0000e-05
Epoch 5/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 95ms/step - accuracy: 0.0761 - loss: 2.7655 - val_accuracy: 0.1853 - val_loss: 2.7548 - learning_rate: 5.0000e-05
Epoch 6/20
[1m102/102[0m [32m━━━━━━━━

## Evaluation metrics

In [20]:
loss, acc = model.evaluate(X_test, y_test)
print(f"\nTest Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

y_pred = np.argmax(model.predict(X_test), axis=1)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.2170 - loss: 2.5084

Test Loss: 2.4960, Test Accuracy: 0.2310
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step
              precision    recall  f1-score   support

        ENFJ       0.09      0.29      0.14        48
        ENFP       0.08      0.01      0.01       169
        ENTJ       0.05      0.21      0.08        58
        ENTP       0.16      0.17      0.16       171
        ESFJ       0.00      0.00      0.00        10
        ESFP       0.00      0.00      0.00        12
        ESTJ       0.00      0.00      0.00        10
        ESTP       0.00      0.00      0.00        22
        INFJ       0.24      0.06      0.09       368
        INFP       0.39      0.47      0.42       458
        INTJ       0.31      0.06      0.10       273
        INTP       0.25      0.52      0.34       326
        ISFJ       0.00      0.00      0.00        41
        ISFP       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
macro_f1 = f1_score(y_test, y_pred, average='macro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nMacro F1-score: {macro_f1:.4f}")
print(f"Weighted F1-score: {weighted_f1:.4f}")

y_test_bin = label_binarize(y_test, classes=np.arange(len(le.classes_)))
y_pred_proba = model.predict(X_test)

try:
    auc_macro = roc_auc_score(y_test_bin, y_pred_proba, average='macro', multi_class='ovr')
    print(f"AUC-ROC (macro): {auc_macro:.4f}")
except ValueError as e:
    print("AUC-ROC could not be computed:", e)



Macro F1-score: 0.1032
Weighted F1-score: 0.1982
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step
AUC-ROC (macro): 0.6998


In [None]:
conf_mat = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(12, 10))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_,
            yticklabels=le.classes_)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()

## Load & Preprocess Data for Biany Classification

In [23]:
data = pd.read_csv('/content/drive/MyDrive/ML_Project/mbti_1.csv')
data['clean_posts'] = data['posts'].apply(preprocess_text)

train_df, test_df = train_test_split(data, test_size=0.25, stratify=data['type'], random_state=42)

In [24]:
for df in (train_df, test_df):
    df['IE'] = df['type'].str[0].map({'I':0, 'E':1})
    df['SN'] = df['type'].str[1].map({'S':0, 'N':1})
    df['TF'] = df['type'].str[2].map({'T':0, 'F':1})
    df['JP'] = df['type'].str[3].map({'J':0, 'P':1})

y_train = { t: train_df[t].values for t in ['IE','SN','TF','JP'] }
y_test  = { t: test_df[t].values  for t in ['IE','SN','TF','JP'] }


In [25]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['clean_posts'])

X_train_seq = tokenizer.texts_to_sequences(train_df['clean_posts'])
X_test_seq  = tokenizer.texts_to_sequences(test_df['clean_posts'])

max_len = max(len(s) for s in X_train_seq)
X_train = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test  = pad_sequences(X_test_seq,  maxlen=max_len, padding='post')

word_index = tokenizer.word_index

In [26]:
def load_glove(path, dim=100):
    idx = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            word, vec = parts[0], np.asarray(parts[1:], 'float32')
            idx[word] = vec
    return idx

glove_path    = '/content/drive/MyDrive/ML_Project/glove.6B.100d.txt'    # ← your path here
embedding_dim = 100
glove_index   = load_glove(glove_path, dim=embedding_dim)

embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))
for w,i in word_index.items():
    vec = glove_index.get(w)
    if vec is not None:
        embedding_matrix[i] = vec

In [27]:
def build_binary_model(vocab_size, emb_dim, seq_len, emb_matrix):
    inp = Input(shape=(seq_len,), name='input_ids')

    # Embedding
    x = Embedding(
        vocab_size, emb_dim,
        weights=[emb_matrix],
        trainable=False
    )(inp)

    # Bi‑LSTM
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.5)(x)

    # Attention
    score   = Dense(1, activation='tanh')(x)     # (batch, seq, 1)
    weights = Softmax(axis=1)(score)             # (batch, seq, 1)
    context = Lambda(lambda z: K.sum(z[0]*z[1], axis=1))([weights, x])

    # Classifier head
    h = Dense(64, activation='relu')(context)
    h = Dropout(0.3)(h)
    out = Dense(1, activation='sigmoid')(h)

    m = Model(inp, out)
    m.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=['accuracy']
    )
    return m

In [28]:
models, histories = {}, {}
for task in ['IE','SN','TF','JP']:
    print(f"\n=== Training {task} classifier ===")
    cw = compute_class_weight(
        'balanced',
        classes=np.unique(y_train[task]),
        y=y_train[task]
    )
    class_weight = {i: w for i,w in enumerate(cw)}

    m = build_binary_model(
        vocab_size=len(word_index)+1,
        emb_dim=embedding_dim,
        seq_len=max_len,
        emb_matrix=embedding_matrix
    )

    h = m.fit(
        X_train, y_train[task],
        validation_data=(X_test, y_test[task]),
        epochs=10,
        batch_size=64,
        class_weight=class_weight,
        callbacks=[
            EarlyStopping('val_loss', patience=2, restore_best_weights=True),
            ReduceLROnPlateau('val_loss', factor=0.5, patience=1)
        ],
        verbose=2
    )
    models[task]     = m
    histories[task]  = h



=== Training IE classifier ===
Epoch 1/10
102/102 - 13s - 129ms/step - accuracy: 0.5063 - loss: 0.6943 - val_accuracy: 0.2444 - val_loss: 0.7069 - learning_rate: 1.0000e-04
Epoch 2/10
102/102 - 20s - 192ms/step - accuracy: 0.5709 - loss: 0.6893 - val_accuracy: 0.5629 - val_loss: 0.6903 - learning_rate: 1.0000e-04
Epoch 3/10
102/102 - 10s - 102ms/step - accuracy: 0.5719 - loss: 0.6834 - val_accuracy: 0.3900 - val_loss: 0.7514 - learning_rate: 1.0000e-04
Epoch 4/10
102/102 - 10s - 94ms/step - accuracy: 0.5890 - loss: 0.6698 - val_accuracy: 0.6865 - val_loss: 0.6310 - learning_rate: 5.0000e-05
Epoch 5/10
102/102 - 10s - 98ms/step - accuracy: 0.6165 - loss: 0.6605 - val_accuracy: 0.6302 - val_loss: 0.6662 - learning_rate: 5.0000e-05
Epoch 6/10
102/102 - 10s - 100ms/step - accuracy: 0.6182 - loss: 0.6527 - val_accuracy: 0.6210 - val_loss: 0.6629 - learning_rate: 2.5000e-05

=== Training SN classifier ===
Epoch 1/10
102/102 - 13s - 124ms/step - accuracy: 0.6204 - loss: 0.6929 - val_accuracy

In [29]:
print("\n=== Test Accuracies ===")
for task, m in models.items():
    loss, acc = m.evaluate(X_test, y_test[task], verbose=0)
    print(f"{task}: {acc:.3f}")



=== Test Accuracies ===
IE: 0.686
SN: 0.842
TF: 0.734
JP: 0.592


In [30]:
pred_bits = {}
for task, m in models.items():
    # shape (n_samples, 1) → flatten → 0 or 1 by 0.5 threshold
    p = m.predict(X_test, batch_size=64)
    pred_bits[task] = (p.flatten() >= 0.5).astype(int)

bit2letter = {
    'IE': {0:'I', 1:'E'},
    'SN': {0:'S', 1:'N'},
    'TF': {0:'T', 1:'F'},
    'JP': {0:'J', 1:'P'},
}

pred_types = []
for i in range(len(X_test)):
    chars = [ bit2letter[task][pred_bits[task][i]]
              for task in ['IE','SN','TF','JP'] ]
    pred_types.append(''.join(chars))

true_types = test_df['type'].values
overall_acc = np.mean(np.array(pred_types) == true_types)
print(f"Overall 4‑letter accuracy: {overall_acc:.3f}")

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(true_types, pred_types))
cm = confusion_matrix(true_types, pred_types, labels=np.unique(true_types))
print("16‑way confusion matrix:\n", cm)


[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step
Overall 4‑letter accuracy: 0.260
              precision    recall  f1-score   support

        ENFJ       0.10      0.10      0.10        48
        ENFP       0.18      0.34      0.24       169
        ENTJ       0.05      0.09      0.06        58
        ENTP       0.23      0.14      0.18       171
        ESFJ       0.00      0.00      0.00        10
        ESFP       0.05      0.17      0.08        12
        ESTJ       0.00      0.00      0.00        10
        ESTP       0.00      0.00      0.00        22
        INFJ       0.29      0.27      0.28       368
        INFP       0.35      0.41      0.37       458
        INTJ       0.26      0.42      0.32       273
        INTP       0.3

In [31]:
f1_macro    = f1_score(true_types, pred_types, average='macro')
f1_weighted = f1_score(true_types, pred_types, average='weighted')

print(f"16‑class Macro F1:    {f1_macro:.4f}")
print(f"16‑class Weighted F1: {f1_weighted:.4f}")

le16      = LabelEncoder().fit(data['type'])
classes16 = le16.classes_
y_true16  = le16.transform(true_types)

p_IE = models['IE'].predict(X_test).flatten()
p_SN = models['SN'].predict(X_test).flatten()
p_TF = models['TF'].predict(X_test).flatten()
p_JP = models['JP'].predict(X_test).flatten()

n_samples = X_test.shape[0]
probs16   = np.zeros((n_samples, len(classes16)))

for j, mbti in enumerate(classes16):
    bitE = 1 if mbti[0]=='E' else 0
    bitN = 1 if mbti[1]=='N' else 0
    bitF = 1 if mbti[2]=='F' else 0
    bitP = 1 if mbti[3]=='P' else 0

    p0 =    p_IE if bitE else (1-p_IE)
    p1 =    p_SN if bitN else (1-p_SN)
    p2 =    p_TF if bitF else (1-p_TF)
    p3 =    p_JP if bitP else (1-p_JP)

    probs16[:, j] = p0 * p1 * p2 * p3

y_true_onehot = tf.keras.utils.to_categorical(y_true16, num_classes=len(classes16))

auc = roc_auc_score(y_true_onehot, probs16,
                            average='macro',   multi_class='ovr')

print(f"16‑class AUC:    {auc:.4f}")


16‑class Macro F1:    0.1225
16‑class Weighted F1: 0.2455
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step
16‑class AUC:    0.6875
