In [1]:
import preprocessing


# Grab the dataset object that the project-local `preprocessing` module
# exposes as `data`. NOTE(review): presumably a pandas DataFrame with
# 'posts' and 'type' columns, judging by how later cells index it — confirm.
cleaned_data = preprocessing.data

# Show the first five rows as the cell's output to sanity-check the load.
cleaned_data.head(n=5)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ayenyeinsan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayenyeinsan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayenyeinsan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Absolute number of rows per personality label.
type_counts = cleaned_data['type'].value_counts()
print(type_counts)


In [None]:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label column keeps
# the class proportions of both partitions in line with the full dataset, and
# the fixed random_state makes the shuffle reproducible.
all_posts = cleaned_data['posts']
all_types = cleaned_data['type']

train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_posts,
    all_types,
    test_size=0.2,
    random_state=42,
    stratify=all_types,
)

In [None]:
import pandas as pd

# Relative frequency of each label in the full dataset, expressed in percent.
type_share_pct = cleaned_data['type'].value_counts(normalize=True) * 100
print(type_share_pct)


In [None]:
# Wrap both label collections in pandas Series so .value_counts() is available.
train_labels = pd.Series(train_labels)
test_labels = pd.Series(test_labels)

# Percentage of each class inside the two partitions; with a stratified split
# these should closely match the full-dataset distribution.
train_share_pct = train_labels.value_counts(normalize=True) * 100
test_share_pct = test_labels.value_counts(normalize=True) * 100

print("Train set class distribution:\n", train_share_pct)
print("\nTest set class distribution:\n", test_share_pct)

In [None]:
# Because of the class imbalance (more "I" than "E" in both the train and test
# sets), compute per-class weights that can be used to re-balance the loss.
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Label encoding used for the numeric targets (0 = I, 1 = E).
LABEL_TO_ID = {'I': 0, 'E': 1}
CLASS_IDS = np.array(sorted(LABEL_TO_ID.values()))

# Convert labels to numeric values. A plain dict passed to .map() turns every
# value that is not a key into NaN, so re-running this cell on the already
# converted column would wipe it out. Falling back to the value itself keeps
# the cell idempotent; .astype(int) fails loudly on any unexpected label.
cleaned_data['type'] = (
    cleaned_data['type']
    .map(lambda label: LABEL_TO_ID.get(label, label))
    .astype(int)
)

# 'balanced' weights are n_samples / (n_classes * count(class)), so the rarer
# class receives the larger weight. Result is a NumPy array indexed by class id.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=CLASS_IDS,
    y=cleaned_data['type'],
)

# Convert to dictionary {class id: weight} for readable display / later lookup
class_weights_dict = dict(zip(CLASS_IDS.tolist(), class_weights.tolist()))

print("Class Weights:", class_weights_dict)

 

In [None]:
# Load the pretrained BERT checkpoint and its matching tokenizer.
# Checkpoint identifier shared by the tokenizer and the model.
MODEL_NAME = "bert-base-uncased"

# Tokenizer that matches the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model configured with two output labels
# (Introvert & Extrovert).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

In [None]:
# Tokenize data
# Label encoding expected by the 2-class model head (0 = I, 1 = E).
LABEL2ID = {'I': 0, 'E': 1}


def tokenize_function(texts):
    """Tokenize an iterable of post strings with the notebook's `tokenizer`.

    Sequences longer than 512 tokens are truncated and every sequence is
    padded to the longest one in the call.

    Args:
        texts: iterable of strings (e.g. a pandas Series of posts).

    Returns:
        The tokenizer output; `input_ids` and `attention_mask` are read from it below.
    """
    return tokenizer(list(texts), truncation=True, padding=True, max_length=512)


def encode_labels(labels):
    """Return the labels as a list of integer class ids.

    The train/test labels were split off before any numeric conversion, so
    they may still be the strings 'I'/'E'. Values that are already numeric are
    passed through unchanged, which keeps the cell safe to re-run; anything
    else makes `int()` raise instead of silently producing a bad target.
    """
    return [int(LABEL2ID.get(label, label)) for label in labels]


class EncodedTextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `Dataset` was never imported in this notebook, so the previous
    `Dataset.from_dict(...)` calls raised NameError on a fresh kernel. This
    wrapper relies only on `torch`, which is already imported, and yields the
    same three fields per example: `input_ids`, `attention_mask`, `labels`.
    """

    def __init__(self, encodings, labels):
        if len(encodings['input_ids']) != len(labels):
            raise ValueError("encodings and labels must have the same length")
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx]),
        }


train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# Wrap encodings + numeric labels in datasets the Trainer can iterate over
train_dataset = EncodedTextDataset(train_encodings, encode_labels(train_labels))

test_dataset = EncodedTextDataset(test_encodings, encode_labels(test_labels))

In [None]:
class WeightedLossTrainer(Trainer):
    """Trainer that scores batches with class-weighted cross-entropy.

    `Trainer.__init__` accepts no `compute_loss` keyword, so passing the
    weights to the constructor raises TypeError. The supported way to customise
    the loss is to subclass `Trainer` and override `compute_loss`.

    Args:
        class_weights: optional array-like with one weight per class id. When
            omitted, the loss is plain (unweighted) cross-entropy.
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = (
            None if class_weights is None
            else torch.as_tensor(class_weights, dtype=torch.float)
        )

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute weighted cross-entropy between the model logits and `labels`.

        `**kwargs` absorbs extra arguments that some transformers releases
        pass to this hook (e.g. `num_items_in_batch`).

        Returns:
            The scalar loss, or `(loss, outputs)` when `return_outputs` is True.
        """
        # Remove the labels so the model does not also compute its own
        # unweighted loss.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weight = None
        if self.class_weights is not None:
            # The weight tensor must live on the same device / dtype as the logits.
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
            weight=weight,
        )
        return (loss, outputs) if return_outputs else loss


# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if TrainingArguments rejects it — confirm
# against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=200,
    load_best_model_at_end=True
)
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=None,  # You can define custom metrics if needed
    class_weights=class_weights  # Apply class-weighted loss
)

trainer.train()


In [None]:
# Run evaluation on the trainer's eval dataset and show the returned metrics
# as the cell's output.
eval_metrics = trainer.evaluate()
eval_metrics


In [None]:
# Predicting the personality type
def predict_mbti(text):
    """Classify a single piece of text as introvert or extrovert.

    Args:
        text: one string to classify. Longer inputs are truncated to 512 tokens.

    Returns:
        "Extrovert (E)" when the predicted class id is 1, otherwise
        "Introvert (I)".

    Raises:
        RuntimeError: if more than one text is passed, since the prediction
            can then no longer be reduced to a single class id.
    """
    inputs = tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors="pt")
    # The model may have been moved to another device (e.g. a GPU) during
    # training, while the tokenizer returns CPU tensors. Keep them together.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    model.eval()  # disable dropout so predictions are deterministic
    with torch.no_grad():  # inference only: no gradient bookkeeping needed
        outputs = model(**inputs)

    # Arg-max over the class dimension rather than over the flattened tensor.
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    return "Extrovert (E)" if prediction == 1 else "Introvert (I)"

# Example
new_post = "I love spending time alone reading books."
print(predict_mbti(new_post))
