### ***Link To drive***

In [None]:
from google.colab import drive
drive.mount('/content/drive')

### ***Import Libraries***

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk


### ***Load Dataset***

In [None]:

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/mbti_1.csv")

### ***Preprocessing***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk # Make sure to import nltk

from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from nltk.tokenize import word_tokenize
# Download the NLTK resources used below: tokenizer models, WordNet and stop words.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# 'punkt'; requesting both keeps word_tokenize working across versions
# (an unknown resource name only makes nltk.download report an error).
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Function to preprocess text with lemmatization and stemming
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded once and cached."""
    return frozenset(stopwords.words('english'))


def preprocess_text_full(text):
    """Normalize one document for bag-of-words style models.

    Steps: lowercase, strip HTML tags and URLs, drop every character that is
    not an ASCII letter or whitespace, tokenize, remove English stop words,
    then lemmatize and stem each remaining token.

    Args:
        text: Raw document. Non-string values (e.g. NaN from a missing cell)
            are treated as empty.

    Returns:
        The processed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags (one pass is enough: removing URLs afterwards cannot
    # create a new '<...>' pair on a line, so the second pass was a no-op)
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs (the pattern has no ^/$ anchors, so re.MULTILINE was unused)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove special characters, digits, and punctuations
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words; the set is cached instead of being rebuilt per call
    stop_words = _english_stop_words()
    # Lemmatize, then stem, each kept token and join back into a string
    return ' '.join(
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in tokens
        if word not in stop_words
    )

# Apply full preprocessing to the 'text' column
df['full_preprocessed_text'] = df['posts'].apply(preprocess_text_full)


In [None]:
# Each row holds several posts separated by a literal "|||". The pattern is a
# raw string so the backslashes reach the regex engine unchanged ('\|' in a
# normal string is an invalid escape sequence and triggers a warning).
posts_split = df['posts'].str.split(r'\|\|\|')

In [None]:
df.head()

In [None]:
# Build the Judging/Perceiving targets from the MBTI label in `type`, whose
# fourth letter is 'J' or 'P'. The original code read them from the `posts`
# text (second character / first letter of the post), which is unrelated to
# the label.
jp_letter = df['type'].str[3]
df['JP'] = jp_letter
# Same encoding as before: 0 for Judging, 1 otherwise (Perceiving)
df['class'] = (jp_letter != 'J').astype(int)

### ***Machine Learning***

### ***Apply POS TAGGING***

In [None]:
# spaCy's small English pipeline supplies the part-of-speech tagger
nlp = spacy.load("en_core_web_sm")


def pos_tagging(text):
    """Return the coarse POS tag of every token in `text`, space-separated."""
    return " ".join(tok.pos_ for tok in nlp(text))



In [None]:
# Apply POS tagging to the posts
df['pos_tags'] = df['posts'].apply(pos_tagging)

# Features/target for the Judging vs. Perceiving models. The train/test split
# further down reads `X_jp` / `y_jp`, which were never defined under these names.
X_jp = df['pos_tags']
y_jp = df['JP']

# Previous names, kept so any cell still referring to them keeps working
X_ns = X_jp
y_ns = y_jp

In [None]:
# Feature extraction for all four traits
# Step 2: Feature Extraction
# TfidfVectorizer is only imported in a later cell, so import it here to keep
# this cell runnable on a fresh kernel.
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join each row's list of posts into one document, then build TF-IDF
# features limited to the 1000 highest-frequency terms.
posts_combined = posts_split.apply(lambda x: ' '.join(x))
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_ns = tfidf_vectorizer.fit_transform(posts_combined)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import tqdm

In [None]:
# Hold out 20% of the J/P data for testing; the fixed seed keeps the split reproducible
X_train_jp, X_test_jp, y_train_jp, y_test_jp = train_test_split(
    X_jp,
    y_jp,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
!pip install xgboost
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only,
# then apply that same mapping to both splits.
le = LabelEncoder().fit(y_train_jp)
y_train_jp = le.transform(y_train_jp)
y_test_jp = le.transform(y_test_jp)

# ***ML + EL with POS Tagging***

In [None]:
# Define models
# LogisticRegression is not imported until a later cell; import it here so
# building the dictionary does not raise NameError on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# Display name -> unfitted estimator, all with library-default hyperparameters
models = {
    'SVM': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression()
}

In [None]:
# Function to train and evaluate models for a given trait
def train_and_evaluate_model(X_train, X_test, y_train, y_test, trait_name):
    """Fit every estimator in the global `models` dict and print its test metrics.

    Args:
        X_train: Training documents (POS-tagged text) to vectorize.
        X_test: Test documents, transformed with the vectorizer fit on X_train.
        y_train: Training labels.
        y_test: Test labels.
        trait_name: Human-readable trait name used in the printed messages.

    Returns:
        None. Accuracy and the classification report are printed per model.
    """
    print(f"Training models for {trait_name}...")

    # The TF-IDF features do not depend on the model, so build them once
    # instead of re-fitting an identical vectorizer on every loop iteration.
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    for name, model in models.items():
        print(f"Training {name} for {trait_name}...")
        model.fit(X_train_vectorized, y_train)
        y_pred = model.predict(X_test_vectorized)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy for {name} for {trait_name}: {accuracy}")
        print(f"Classification report for {name} for {trait_name}:")
        print(classification_report(y_test, y_pred))
        print("----------------------------------------------------")




In [None]:
train_and_evaluate_model(X_train_jp, X_test_jp, y_train_jp, y_test_jp, "Judging vs. Perceiving")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


# Initialize a figure for the combined ROC curve
plt.figure(figsize=(10, 8))

# Common false-positive-rate grid on which every model's curve is resampled
all_fpr = np.linspace(0, 1, 100)
mean_tpr = 0.0

# Vectorize the POS-tag text in this cell: the matrices created inside
# train_and_evaluate_model are local to that function and not visible here.
roc_vectorizer = TfidfVectorizer(max_features=1000)
X_train_vectorized = roc_vectorizer.fit_transform(X_train_jp)
X_test_vectorized = roc_vectorizer.transform(X_test_jp)

# Plot ROC curve for each classifier and accumulate the interpolated TPR
for name, model in models.items():
    # Fit the model
    model.fit(X_train_vectorized, y_train_jp)

    # Use a continuous score for the positive class. Hard predict() labels
    # would collapse the ROC curve to a single operating point, so they are
    # only the last resort.
    if hasattr(model, "decision_function"):
        scores = model.decision_function(X_test_vectorized)
    elif hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test_vectorized)[:, 1]
    else:
        scores = model.predict(X_test_vectorized)

    # ROC only depends on the ranking of the scores, so no rescaling is needed
    # (the former min-max scaling divided by zero for constant scores).
    # Compute ROC curve and ROC area for the Judging vs. Perceiving (J/P) trait
    fpr, tpr, _ = roc_curve(y_test_jp, scores)
    roc_auc = auc(fpr, tpr)

    # Plot ROC curve for the model
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

    # Accumulate this model's TPR on the common grid
    mean_tpr += np.interp(all_fpr, fpr, tpr)


# Calculate the mean true positive rate across all classifiers
mean_tpr /= len(models)
mean_auc = auc(all_fpr, mean_tpr)

# Plot the combined ROC curve
plt.plot(all_fpr, mean_tpr, color='black', linestyle='--', lw=2, label=f'Combined ROC (AUC = {mean_auc:.2f})')

# Add labels and legend
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Combined Receiver Operating Characteristic (ROC) Curve for J/P POS tagging')
plt.legend(loc="lower right")

# Show plot
plt.grid(True)
plt.show()



In [None]:
from sklearn.naive_bayes import MultinomialNB

# Initialize Naive Bayes classifier
model = MultinomialNB()

In [None]:
# `model` is expected to be the MultinomialNB created in the previous cell.
# It was never fitted and `y_pred_jp` was never computed, so do both here on
# TF-IDF features of the POS-tag text.
nb_vectorizer = TfidfVectorizer(max_features=1000)
model.fit(nb_vectorizer.fit_transform(X_train_jp), y_train_jp)
y_pred_jp = model.predict(nb_vectorizer.transform(X_test_jp))

# Score against the J/P test labels (`y_test_ns` does not exist in this notebook)
accuracy_jp = accuracy_score(y_test_jp, y_pred_jp)

# "\J" was an invalid escape; a leading newline was presumably intended
print("\nJudging vs. Perceiving:")
print(f"Accuracy: {accuracy_jp}")
print(classification_report(y_test_jp, y_pred_jp))

In [None]:
# Function to train and evaluate models for a given trait
# NOTE(review): this repeats the definition given earlier in the notebook;
# whichever cell runs last is the one in effect.
def train_and_evaluate_model(X_train, X_test, y_train, y_test, trait_name):
    """Vectorize the text with TF-IDF, then fit and report every model in `models`.

    Args:
        X_train: Training documents (text) to vectorize.
        X_test: Test documents, transformed with the vectorizer fit on X_train.
        y_train: Training labels.
        y_test: Test labels.
        trait_name: Trait name shown in the printed messages.

    Returns:
        None. Accuracy and the classification report are printed per model.
    """
    print(f"Training models for {trait_name}...")

    # Loop-invariant work hoisted out of the model loop: the features are the
    # same for every estimator, so the vectorizer is fit exactly once.
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    for name, model in models.items():
        print(f"Training {name} for {trait_name}...")
        model.fit(X_train_vectorized, y_train)
        y_pred = model.predict(X_test_vectorized)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy for {name} for {trait_name}: {accuracy}")
        print(f"Classification report for {name} for {trait_name}:")
        print(classification_report(y_test, y_pred))
        print("----------------------------------------------------")

# ***ML + EL with TF-IDF***

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import pandas as pd
import tqdm

In [None]:
# Feature extraction for all four traits
# Step 2: Feature Extraction
posts_combined = posts_split.apply(lambda x: ' '.join(x))
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_jp = tfidf_vectorizer.fit_transform(posts_combined)
y_jp = df['JP']

# Re-split on the TF-IDF matrix. Without this, the models in this section
# would still receive the POS-tag text split created for the previous section.
# NOTE(review): the vectorizer is fit on all rows before splitting, so IDF
# weights are influenced by the test documents.
X_train_jp, X_test_jp, y_train_jp, y_test_jp = train_test_split(
    X_jp, y_jp, test_size=0.2, random_state=42
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels, then map both splits to integers
le = LabelEncoder()
le.fit(y_train_jp)
y_train_jp, y_test_jp = le.transform(y_train_jp), le.transform(y_test_jp)

In [None]:
# Define models
# (display name, unfitted estimator) pairs; insertion order is the training order
_model_specs = [
    ('SVM', SVC()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('XGBoost', xgb.XGBClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('KNN', KNeighborsClassifier()),
    ('Logistic Regression', LogisticRegression()),
]
models = dict(_model_specs)

In [None]:
import numpy as np

# List to store accuracy scores for all models across four traits.
# Each entry is a (trait_name, model_name, accuracy) tuple appended by
# train_and_evaluate_model below.
accuracy_scores = []

# Function to train and evaluate models for a given trait
def train_and_evaluate_model(X_train, X_test, y_train, y_test, trait_name):
    """Fit every estimator in `models` on ready-made features and report metrics.

    Unlike the earlier variants, this version does not vectorize: the inputs
    are passed to `fit`/`predict` as they are.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        trait_name: Trait name shown in the printed messages.

    Returns:
        None. Metrics are printed, and each accuracy is also recorded in the
        module-level `accuracy_scores` list (it was declared but never filled).
    """
    print(f"Training models for {trait_name}...")
    for name, model in models.items():
        print(f"Training {name} for {trait_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append((trait_name, name, accuracy))
        print(f"Accuracy for {name} for {trait_name}: {accuracy}")
        print(f"Classification report for {name} for {trait_name}:")
        print(classification_report(y_test, y_pred))
        print("----------------------------------------------------")

In [None]:
train_and_evaluate_model(X_train_jp, X_test_jp, y_train_jp, y_test_jp, "Judging vs. Perceiving")


In [None]:
!pip install scikit-learn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Turn on probability estimates only for estimators that actually expose a
# `probability` hyperparameter (SVC here). Assigning the attribute on every
# model attached a meaningless attribute to the others.
for model in models.values():
    if 'probability' in model.get_params():
        model.set_params(probability=True)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


# Initialize a figure for the combined ROC curve
plt.figure(figsize=(10, 8))

# Shared false-positive-rate grid; each model's TPR is interpolated onto it
all_fpr = np.linspace(0, 1, 100)
interpolated_tprs = []

# One ROC curve per classifier, using the predicted probability of class 1
for name, model in models.items():
    model.fit(X_train_jp, y_train_jp)
    class_proba = model.predict_proba(X_test_jp)
    y_score = class_proba[:, 1]
    fpr, tpr, _ = roc_curve(y_test_jp, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')
    interpolated_tprs.append(np.interp(all_fpr, fpr, tpr))

# Average the interpolated curves across all classifiers
mean_tpr = sum(interpolated_tprs, 0.0) / len(models)
mean_auc = auc(all_fpr, mean_tpr)

# Plot the combined ROC curve
plt.plot(
    all_fpr,
    mean_tpr,
    color='black',
    linestyle='--',
    lw=2,
    label=f'Combined ROC (AUC = {mean_auc:.2f})',
)

# Axis labels, title and legend
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Combined Receiver Operating Characteristic (ROC) Curve- J/P TF-IDF ')
plt.legend(loc="lower right")

# Show plot
plt.grid(True)
plt.show()


### ***EDA***

In [None]:
# The preprocessing cell already creates this column; recompute it only when
# it is missing so the slow tokenize/lemmatize/stem pass is not repeated.
if 'full_preprocessed_text' not in df.columns:
    df['full_preprocessed_text'] = df['posts'].apply(preprocess_text_full)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Filter the dataset for labels 'J' and 'P'
df['label_JP'] = df['type'].apply(lambda x: 'J' if 'J' in x else ('P' if 'P' in x else np.nan))
# .copy() so the new column below is written to an independent frame rather
# than a possible view of `df` (avoids SettingWithCopyWarning)
data_JP = df.dropna(subset=['label_JP']).copy()

# Text length analysis for labels 'J' and 'P'
# (`data_NS` does not exist in this notebook; the J/P frame is the intended source)
data_JP['post_length'] = data_JP['full_preprocessed_text'].apply(lambda x: len(x.split()))

# Plot the distribution of post length by type
sns.histplot(data=data_JP, x='post_length', hue='label_JP', bins=50, kde=True)
plt.title('Distribution of Post Length by Type (J vs P)')
plt.xlabel('Post Length')
plt.ylabel('Frequency')
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import spacy
# (A bare `!pip install` with no package name was removed here: pip only
# prints a usage error for it and installs nothing.)
import nltk
nltk.download('punkt')

# Load dataset into a separate frame (`data`) for the EDA cells
data = pd.read_csv("/content/drive/MyDrive/mbti_1.csv")

# Download stopwords
nltk.download('stopwords')

# Load Spacy's English language model
nlp = spacy.load('en_core_web_sm')

# Combine NLTK and sklearn stopwords into one set
stop_words = set(stopwords.words('english')).union(set(ENGLISH_STOP_WORDS))

# Filter posts whose MBTI type contains 'J' (Judging) or 'P' (Perceiving)
j_posts = data[data['type'].str.contains('J')]['posts']
p_posts = data[data['type'].str.contains('P')]['posts']

# Function to preprocess text
def preprocess_text(text):
    """Lowercase and tokenize `text`, keeping alphabetic tokens that are not stop words.

    Reads the module-level `stop_words` set; returns the kept tokens joined by spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token.isalpha() and token not in stop_words:
            kept.append(token)
    return " ".join(kept)

# Preprocess the posts
data['cleaned_posts'] = data['posts'].apply(preprocess_text)
# The J and P subsets are rows of the same frame, so take their cleaned text
# from the column just computed instead of tokenizing those posts again.
j_posts_cleaned = data.loc[j_posts.index, 'cleaned_posts']
p_posts_cleaned = data.loc[p_posts.index, 'cleaned_posts']

# Basic data inspection
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line.
data.info()
print(data.describe())

# Text length analysis (length = number of tokens left after cleaning)
data['post_length'] = data['cleaned_posts'].apply(lambda x: len(x.split()))
sns.histplot(data=data, x='post_length', hue='type', bins=50, kde=True)
plt.title('Distribution of Post Length by Type')
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download necessary resources (tokenizer models and stop-word lists)
nltk.download('punkt')
nltk.download('stopwords')

# Load dataset (re-read from CSV, so columns added to `data` earlier are gone)
data = pd.read_csv("/content/drive/MyDrive/mbti_1.csv")

# Use NLTK's English stop words only (sklearn's list is not merged in this cell)
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    """Return the lowercase, alphabetic, non-stop-word tokens of `text` as one string."""
    lowered_tokens = word_tokenize(text.lower())
    return " ".join(
        tok for tok in lowered_tokens
        if tok not in stop_words and tok.isalpha()
    )

# Clean every post once and keep the result next to the raw text
data['cleaned_posts'] = data['posts'].apply(preprocess_text)

# Boolean row masks for types containing 'J' / 'P', then the matching cleaned posts
is_judging = data['type'].str.contains('J')
is_perceiving = data['type'].str.contains('P')
j_posts_cleaned = data.loc[is_judging, 'cleaned_posts']
p_posts_cleaned = data.loc[is_perceiving, 'cleaned_posts']

# Generate word clouds
def create_wordcloud(text_data, title):
    """Draw a word cloud built from all strings in `text_data` under `title`."""
    joined_text = " ".join(text_data)
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud.generate(joined_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title, fontsize=18)
    plt.axis('off')
    plt.show()

# Create word clouds for 'J' and 'P' posts
# (`n_posts_cleaned` / `s_posts_cleaned` are not defined in this notebook;
# the J/P subsets built above are the intended inputs)
create_wordcloud(j_posts_cleaned, 'Word Cloud for Judging (J) Posts')
create_wordcloud(p_posts_cleaned, 'Word Cloud for Perceiving (P) Posts')


In [None]:
# Rows whose type contains 'J' or 'P', taken from the cleaned text column
jp_mask = data['type'].str.contains('J|P')
JP_posts_cleaned = data[jp_mask]['cleaned_posts']

# One long string holding every selected post
combined_text_jp = " ".join(JP_posts_cleaned)

# Build the word cloud from the combined text
wordcloud_jp = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(combined_text_jp)

# Display it without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_jp, interpolation='bilinear')
plt.title('Combined Word Cloud for Judging (J) and Perceiving (P) Posts', fontsize=18)
plt.axis('off')
plt.show()


### ***DL MODELS***

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec
from torchtext.vocab import GloVe
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer


In [None]:
import pandas as pd
import json
import re
import numpy as np
import spacy
import tqdm
import xgboost as xgb
import lightgbm as lgb
import nltk
!pip install emoji
import emoji

!pip install catboost
from catboost import CatBoostClassifier # This is where CatBoostClassifier is defined

# Download NLTK data for tokenization
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier # Remove CatBoostClassifier from here
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from nltk import pos_tag, word_tokenize
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Download NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, GRU, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, SimpleRNN
from tensorflow.keras.optimizers import Adam
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.fasttext import FastText
import transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, Dropout
from torchtext.vocab import GloVe
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer


In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

In [None]:
# Preprocess text (simple preprocessing considering only removal of URLs and lowercasing)
data['posts'] = data['posts'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
data['posts'] = data['posts'].str.lower()

# `data` was re-read from the CSV, so it has no 'class' column (that column
# was only ever added to `df`). Rebuild it from the 4th letter of the MBTI
# type with the same encoding: 0 = Judging, 1 = Perceiving.
data['class'] = (data['type'].str[3] != 'J').astype(int)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['posts'], data['class'], test_size=0.2, random_state=42)

In [None]:
# Train-test split for the deep-learning models.
# The tokenizer and Word2Vec cells below need raw text, but `X_jp` still held
# the TF-IDF matrix from the ML section, so rebind it to the post text and
# derive numeric J/P labels (0 = Judging, 1 = Perceiving) from the MBTI type.
X_jp = data['posts']
y_jp = (data['type'].str[3] != 'J').astype(int)
# `y_train_` was a typo: later cells read `y_train_jp`
X_train_jp, X_test_jp, y_train_jp, y_test_jp = train_test_split(X_jp, y_jp, test_size=0.2, random_state=42)

In [None]:
# Build the vocabulary from the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_jp)

# Texts -> integer id sequences
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train_jp, X_test_jp)
)

# Pad/truncate every sequence to 100 ids so the model input has a uniform size
X_train_pad, X_test_pad = (
    pad_sequences(seq, maxlen=100) for seq in (X_train_seq, X_test_seq)
)



In [None]:
# Train a Word2Vec model on whitespace-split training posts
sentences = [post.split() for post in X_train_jp]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Embedding matrix: row i holds the vector of the word with tokenizer index i;
# rows for words unknown to Word2Vec (and row 0) stay all-zero
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]



In [None]:
# Load the 100-dimensional GloVe 6B vectors
glove = GloVe(name='6B', dim=100)

# Embedding matrix indexed by the tokenizer's word ids (row 0 stays zero)
embedding_matrix_glove = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, row in tokenizer.word_index.items():
    vec = glove[word]
    if vec is None:
        continue
    embedding_matrix_glove[row] = vec



In [None]:
print(type(X_train_jp))
print(type(X_test_jp))
# list(...) accepts a pandas Series as well as a plain list, so this cell can
# be re-run safely (`.tolist()` fails once the variables are already lists).
X_train_jp = list(X_train_jp)
X_test_jp = list(X_test_jp)



In [None]:
# Sentence-BERT encoder used to turn each post into one fixed-size vector
sbert_model = SentenceTransformer('bert-large-nli-mean-tokens')

# Encode the training and test texts with the same model
X_train_embeddings, X_test_embeddings = (
    sbert_model.encode(split_texts, show_progress_bar=True)
    for split_texts in (X_train_jp, X_test_jp)
)

In [None]:
def build_model(embedding_matrix, lstm_type='lstm'):
    """Build and compile a binary classifier on top of frozen pretrained embeddings.

    Architecture: Embedding (non-trainable, initialized from `embedding_matrix`)
    -> LSTM(100) or Bidirectional(LSTM(100)) -> Dense(1, sigmoid).

    Args:
        embedding_matrix: 2-D array of shape (vocab_size, embedding_dim); row i
            is the vector for token id i.
        lstm_type: 'lstm' for a unidirectional layer, 'bilstm' for a
            bidirectional one.

    Returns:
        The compiled Keras model (Adam, binary cross-entropy, accuracy metric).

    Raises:
        ValueError: If `lstm_type` is not 'lstm' or 'bilstm'. Previously an
            unknown value silently produced a model with no recurrent layer.
    """
    # Local import: a constant initializer works in both Keras 2 and Keras 3,
    # whereas the `weights=[...]` constructor argument is rejected by Keras 3.
    from tensorflow.keras.initializers import Constant

    if lstm_type not in ('lstm', 'bilstm'):
        raise ValueError(f"lstm_type must be 'lstm' or 'bilstm', got {lstm_type!r}")

    # Take both dimensions from the matrix instead of hard-coding output_dim=100
    vocab_size, embedding_dim = embedding_matrix.shape

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                        embeddings_initializer=Constant(embedding_matrix),
                        trainable=False))
    if lstm_type == 'lstm':
        model.add(LSTM(100))
    else:
        model.add(Bidirectional(LSTM(100)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [None]:
# Check the data types of your target variable
print(y_train_jp.dtype)

# Inspect the distinct token ids present in the padded training features
print(np.unique(X_train_pad))

# The padded arrays are numeric, so the former comparison against the string
# '512' could never match anything; it only left the arrays unchanged before
# the cast. Keep just the cast to float32 that the next cells receive.
X_train_pad = np.asarray(X_train_pad).astype(np.float32)
X_test_pad = np.asarray(X_test_pad).astype(np.float32)

# If your target variable contains strings, you need to encode them numerically (e.g., using label encoding or one-hot encoding)

In [None]:
# Unidirectional LSTM on the Word2Vec embedding matrix
model_w2v_lstm = build_model(embedding_matrix, 'lstm')
model_w2v_lstm.fit(
    X_train_pad,
    y_train_jp,
    validation_data=(X_test_pad, y_test_jp),
    epochs=100,
    batch_size=64,
)

In [None]:
# `sklearn.metrics` is a module inside scikit-learn, not an installable pip
# package, so the former `!pip install sklearn.metrics` line could only fail.
from sklearn.metrics import classification_report

In [None]:
# Predict the positive-class probability for the test data using the trained LSTM model
y_pred_probs_lstm = model_w2v_lstm.predict(X_test_pad)

# Convert probabilities to class labels (flattened to 1-D to match the labels)
y_pred_lstm = (y_pred_probs_lstm > 0.5).astype(int).ravel()

# Generate and print the classification report against the same labels the
# model was validated on (`y_test_jp`, matching the `_jp` split used in fit)
classification_report_lstm = classification_report(y_test_jp, y_pred_lstm)
print("Classification Report for LSTM Model:")
print(classification_report_lstm)



In [None]:
# Bi-LSTM Model
model_w2v_bilstm = build_model(embedding_matrix, 'bilstm')
# X_train_pad / X_test_pad come from the `_jp` split, so train and validate
# against the labels of that same split rather than `y_train` / `y_test`.
model_w2v_bilstm.fit(X_train_pad, y_train_jp, epochs=100, batch_size=64, validation_data=(X_test_pad, y_test_jp))

In [None]:
# Predict the positive-class probability for the test data using the trained Bi-LSTM model
y_pred_probs_bilstm = model_w2v_bilstm.predict(X_test_pad)

# Convert probabilities to class labels (flattened to 1-D to match the labels)
y_pred_bilstm = (y_pred_probs_bilstm > 0.5).astype(int).ravel()

# Generate and print the classification report; X_test_pad belongs to the
# `_jp` split, so its labels are `y_test_jp`
classification_report_bilstm = classification_report(y_test_jp, y_pred_bilstm)
print("Classification Report for Bi-LSTM Model:")
print(classification_report_bilstm)



In [None]:

# lstm Model with GloVe
# `build_lstm_model` is not defined anywhere in this notebook; `build_model`
# with lstm_type='lstm' is the available constructor for this architecture.
lstm_model_glove = build_model(embedding_matrix_glove, 'lstm')
# Labels come from the same `_jp` split that produced X_train_pad / X_test_pad
lstm_model_glove.fit(X_train_pad, y_train_jp, epochs=100, batch_size=64, validation_data=(X_test_pad, y_test_jp))



In [None]:
from sklearn.metrics import classification_report

# Predict the positive-class probability for the test data using the GloVe LSTM model
y_pred_probs = lstm_model_glove.predict(X_test_pad)

# Convert probabilities to class labels (flattened to 1-D to match the labels)
y_pred = (y_pred_probs > 0.5).astype(int).ravel()

# Generate and print the classification report against the `_jp` test labels
classification_report_lstm = classification_report(y_test_jp, y_pred)
# The evaluated model is an LSTM, not a CNN, so the heading says so
print("Classification Report for LSTM Model with GloVe:")
print(classification_report_lstm)



In [None]:
# lstm Model with Sentence Embeddings
# Each post is already a single fixed-size vector, so no Embedding layer is
# used (`build_lstm_model(None)` referred to a function that does not exist).
# An LSTM expects (samples, timesteps, features); feed every vector as a
# sequence of length 1.
X_train_sentence = np.asarray(X_train_embeddings)[:, np.newaxis, :]
X_test_sentence = np.asarray(X_test_embeddings)[:, np.newaxis, :]

lstm_model_sentence = Sequential([
    tf.keras.Input(shape=X_train_sentence.shape[1:]),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
lstm_model_sentence.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Labels come from the `_jp` split that the embeddings were computed from.
# NOTE(review): epochs=199 is kept from the original; confirm it is intended.
lstm_model_sentence.fit(X_train_sentence, y_train_jp, epochs=199, batch_size=64, validation_data=(X_test_sentence, y_test_jp))

### ***LLM-based Model - BERT Large***

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Load Dataset (Replace with actual dataset)
# NOTE(review): this rebinds `df`, which earlier cells use for the MBTI frame;
# the file is read from the working directory and must provide 'text' and
# 'label' columns.
df = pd.read_csv("conscientiousness_dataset.csv")
texts = df['text'].tolist()
# Any label other than 'P' is mapped to 1
labels = [0 if label == 'P' else 1 for label in df['label'].tolist()]  # 0: Perceiving (P), 1: Judging (J)

# Load BERT-Large Tokenizer
# NOTE(review): this rebinds `tokenizer`, previously the Keras Tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Tokenize Data
class PersonalityDataset(Dataset):
    """Dataset that tokenizes one text per item, on access.

    Each item is a dict with 'input_ids' and 'attention_mask' (the tokenizer
    output with its leading batch dimension removed) and 'label' (a long tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize a single text, padded/truncated to max_length, as "pt" tensors
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop the batch dimension of size 1 that the tokenizer adds
        item = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Hold out 20% of the texts for validation (fixed seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a dataset that tokenizes on access
train_dataset = PersonalityDataset(train_texts, train_labels, tokenizer)
val_dataset = PersonalityDataset(val_texts, val_labels, tokenizer)

# Batches of 8; only the training data is shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=8)

# Define BERT-Large Model for J/P Prediction
class BertClassifier(nn.Module):
    """BERT-large encoder followed by dropout and a 2-way linear head."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-large-uncased')
        self.dropout = nn.Dropout(0.3)
        # 2 output logits (J/P), sized from the encoder's hidden width
        self.fc = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return the class logits computed from BERT's pooled output."""
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(bert_out.pooler_output))

# Pick the GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model, loss and optimizer used by train() / evaluate()
model = BertClassifier()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training Function
def train(model, dataloader):
    """Run one training epoch and return the mean loss per batch.

    Uses the module-level `device`, `optimizer` and `criterion`.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(dataloader)

# Evaluation Function
def evaluate(model, dataloader):
    """Return the accuracy of `model` over all batches of `dataloader`.

    Runs in eval mode with gradients disabled; uses the module-level `device`.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            # Highest logit per row is the predicted class
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())
    return accuracy_score(expected, predicted)

# Train for a fixed number of epochs, reporting loss and validation accuracy after each
epochs = 100
for epoch in range(epochs):
    train_loss = train(model, train_loader)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Validation Accuracy = {val_acc:.4f}")

# Persist the weights of the model as they are after the final epoch
final_state = model.state_dict()
torch.save(final_state, "bert_large_conscientiousness_JP_model.pth")
