In [1]:
%pip uninstall tqdm


Found existing installation: tqdm 4.64.0
Uninstalling tqdm-4.64.0:
  Would remove:
    /opt/anaconda3/bin/tqdm
    /opt/anaconda3/lib/python3.9/site-packages/tqdm-4.64.0.dist-info/*
    /opt/anaconda3/lib/python3.9/site-packages/tqdm/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
# for data analysis
import pandas as pd
import numpy as np


#For data Modeling
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from category_encoders import TargetEncoder
import torch
import torch.nn as nn
import torch.optim as optim



#For NLP
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


#Visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud


#Miscellaneous

# for progress  bars


#regular expressions
import re

#for .pkl file
import joblib


#for hyperparameter tuning
from hyperopt import fmin, tpe, hp

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

ImportError: cannot import name 'tqdm' from partially initialized module 'tqdm' (most likely due to a circular import) (/opt/anaconda3/lib/python3.9/site-packages/tqdm/__init__.py)

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/batfleck06/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df = pd.read_csv("mbti.csv") 
df.head(5)

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [4]:
# Calculate the target sample size per class
total_samples = df.shape[0]
num_classes = len(df['type'].unique())
target_samples_per_class = total_samples // num_classes

In [5]:
target_samples_per_class

542

In [6]:
total_samples

8675

In [7]:
num_classes

16

In [8]:
# Dictionary to store posts for each personality type
personality_posts = {ptype: [] for ptype in df['type'].unique()}

# Iterate through each row and populate the dictionary
for index, row in df.iterrows():
    personality_posts[row['type']].append(row['posts'])

# Lists to store balanced data
balanced_features = []
balanced_labels = []

In [9]:
# Iterate through each personality type
for personality_type, posts in personality_posts.items():
    num_samples = len(posts)
    
    if num_samples >= target_samples_per_class:
        # Sample random indices
        sampled_indices = np.random.choice(num_samples, target_samples_per_class, replace=False)
        
        # Add the sampled data to the balanced sets
        balanced_features.extend([posts[i] for i in sampled_indices])
        balanced_labels.extend([personality_type] * target_samples_per_class)
    else:
        # If fewer posts than target_samples_per_class, use all available posts
        balanced_features.extend(posts)
        balanced_labels.extend([personality_type] * num_samples)

In [10]:
# Shuffle the data
shuffled_indices = np.random.permutation(len(balanced_features))
balanced_features = [balanced_features[i] for i in shuffled_indices]
balanced_labels = [balanced_labels[i] for i in shuffled_indices]

# Split into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(
    balanced_features, balanced_labels, test_size=0.2, random_state=42
)

In [11]:
def clean_text(data):
    data_length = []
    lemmatizer = WordNetLemmatizer()
    cleaned_text = []
    for sentence in tqdm(data):
        sentence = sentence.lower()

        # Remove URLs
        sentence = re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+', ' ', sentence)

        # Remove non-alphanumeric characters
        sentence = re.sub('[^0-9a-z]', ' ', sentence)

        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text, data_length



In [12]:
# Clean train and test features
cleaned_train_features, train_data_lengths = clean_text(train_features)


100%|█████████████████████████████████████| 3896/3896 [00:02<00:00, 1432.05it/s]


In [13]:
cleaned_test_features, test_data_lengths = clean_text(test_features)

100%|███████████████████████████████████████| 974/974 [00:00<00:00, 1442.96it/s]


In [14]:
class Lemmatizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, sentence):
        return [self.lemmatizer.lemmatize(word) for word in sentence.split() if len(word) > 2]

# Initialize the Lemmatizer
lemmatizer = Lemmatizer()

In [15]:
# Lemmatize cleaned train and test features
lemmatized_train_features = [lemmatizer(sentence) for sentence in cleaned_train_features]
lemmatized_test_features = [lemmatizer(sentence) for sentence in cleaned_test_features]

# Convert lemmatized features back to sentences
lemmatized_train_sentences = [' '.join(sentence) for sentence in lemmatized_train_features]
lemmatized_test_sentences = [' '.join(sentence) for sentence in lemmatized_test_features]

# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

In [16]:
# Fit the vectorizer on lemmatized training data and transform training and testing data
train_post = vectorizer.fit_transform(lemmatized_train_sentences)
test_post = vectorizer.transform(lemmatized_test_sentences)
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [17]:
label_encoder = LabelEncoder()

# Fit and transform labels for both training and testing sets
train_target = label_encoder.fit_transform(train_labels)
test_target = label_encoder.transform(test_labels)
# Save the label encoder for later use
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [18]:
# Get the mapping between encoded labels and original labels
encoded_to_original_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
encoded_to_original_mapping

{'ENFJ': 0,
 'ENFP': 1,
 'ENTJ': 2,
 'ENTP': 3,
 'ESFJ': 4,
 'ESFP': 5,
 'ESTJ': 6,
 'ESTP': 7,
 'INFJ': 8,
 'INFP': 9,
 'INTJ': 10,
 'INTP': 11,
 'ISFJ': 12,
 'ISFP': 13,
 'ISTJ': 14,
 'ISTP': 15}

In [19]:
models_accuracy={}

In [20]:
model_xgb=XGBClassifier(max_depth=5, n_estimators=50, learning_rate=0.1)
model_xgb.fit(train_post,train_target)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=50, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)

In [21]:
print('train classification report \n ',classification_report(train_target,model_xgb.predict(train_post),target_names=label_encoder.inverse_transform([i for i in range(16)])))
print('test classification report \n ',classification_report(test_target,model_xgb.predict(test_post),target_names=label_encoder.inverse_transform([i for i in range(16)])))

train classification report 
                precision    recall  f1-score   support

        ENFJ       1.00      0.99      1.00       145
        ENFP       0.98      0.98      0.98       442
        ENTJ       1.00      1.00      1.00       190
        ENTP       1.00      0.98      0.99       433
        ESFJ       1.00      0.94      0.97        32
        ESFP       1.00      1.00      1.00        39
        ESTJ       1.00      1.00      1.00        31
        ESTP       1.00      1.00      1.00        74
        INFJ       0.98      0.99      0.98       419
        INFP       0.97      0.99      0.98       442
        INTJ       0.98      0.99      0.99       446
        INTP       0.99      0.99      0.99       426
        ISFJ       0.99      0.99      0.99       138
        ISFP       1.00      0.99      1.00       210
        ISTJ       1.00      1.00      1.00       156
        ISTP       0.99      0.99      0.99       273

    accuracy                           0.99      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
models_accuracy['XGBoost Classifier']=accuracy_score(test_target,model_xgb.predict(test_post))

In [23]:
joblib.dump(model_xgb, "xgb_model.pkl")

['xgb_model.pkl']

In [24]:
#Logistic Reg

In [25]:
model_log=LogisticRegression(max_iter=3000,C=0.5,n_jobs=-1)
model_log.fit(train_post,train_target)
joblib.dump(model_log, "logistic_reg_model.pkl")



['logistic_reg_model.pkl']

In [26]:
print('train classification report \n ',classification_report(train_target,model_log.predict(train_post),target_names=label_encoder.inverse_transform([i for i in range(16)])))

train classification report 
                precision    recall  f1-score   support

        ENFJ       0.85      0.35      0.50       145
        ENFP       0.73      0.85      0.79       442
        ENTJ       0.88      0.50      0.64       190
        ENTP       0.72      0.87      0.79       433
        ESFJ       0.00      0.00      0.00        32
        ESFP       0.00      0.00      0.00        39
        ESTJ       0.00      0.00      0.00        31
        ESTP       1.00      0.12      0.22        74
        INFJ       0.72      0.83      0.77       419
        INFP       0.67      0.86      0.76       442
        INTJ       0.71      0.87      0.78       446
        INTP       0.71      0.89      0.79       426
        ISFJ       0.88      0.44      0.59       138
        ISFP       0.88      0.58      0.70       210
        ISTJ       0.87      0.47      0.61       156
        ISTP       0.85      0.81      0.83       273

    accuracy                           0.74      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
models_accuracy['logistic regression']=accuracy_score(test_target,model_log.predict(test_post))


In [28]:
accuracy=pd.DataFrame(models_accuracy.items(),columns=['Models','Test accuracy'])
accuracy

Unnamed: 0,Models,Test accuracy
0,XGBoost Classifier,0.675565
1,logistic regression,0.616016


## MLP

In [24]:
# Create an MLP classifier
model_mlp = MLPClassifier(hidden_layer_sizes=(200, 100), max_iter=1000, alpha=1e-4,
                         solver='adam', verbose=10, random_state=42, learning_rate_init=0.001)

# Fit the MLP classifier on the training data
model_mlp.fit(train_post, train_target)



Iteration 1, loss = 2.66798203
Iteration 2, loss = 2.41894378
Iteration 3, loss = 2.19807920
Iteration 4, loss = 1.87780397
Iteration 5, loss = 1.50938358
Iteration 6, loss = 1.14694537
Iteration 7, loss = 0.82868617
Iteration 8, loss = 0.58325271
Iteration 9, loss = 0.39832961
Iteration 10, loss = 0.27044612
Iteration 11, loss = 0.18658638
Iteration 12, loss = 0.13114428
Iteration 13, loss = 0.09251532
Iteration 14, loss = 0.06755116
Iteration 15, loss = 0.05086311
Iteration 16, loss = 0.03898696
Iteration 17, loss = 0.03075298
Iteration 18, loss = 0.02502991
Iteration 19, loss = 0.02083341
Iteration 20, loss = 0.01762096
Iteration 21, loss = 0.01512534
Iteration 22, loss = 0.01325279
Iteration 23, loss = 0.01170937
Iteration 24, loss = 0.01043851
Iteration 25, loss = 0.00942500
Iteration 26, loss = 0.00856818
Iteration 27, loss = 0.00782015
Iteration 28, loss = 0.00718733
Iteration 29, loss = 0.00664802
Iteration 30, loss = 0.00618330
Iteration 31, loss = 0.00577977
Iteration 32, los

MLPClassifier(hidden_layer_sizes=(200, 100), max_iter=1000, random_state=42,
              verbose=10)

In [25]:
# Generate classification reports for the training and test data
print('Train classification report:\n', classification_report(train_target, model_mlp.predict(train_post), target_names=label_encoder.inverse_transform([i for i in range(16)])))
print('Test classification report:\n', classification_report(test_target, model_mlp.predict(test_post), target_names=label_encoder.inverse_transform([i for i in range(16)])))

Train classification report:
               precision    recall  f1-score   support

        ENFJ       1.00      1.00      1.00       147
        ENFP       1.00      1.00      1.00       441
        ENTJ       1.00      1.00      1.00       185
        ENTP       1.00      1.00      1.00       443
        ESFJ       1.00      1.00      1.00        33
        ESFP       1.00      1.00      1.00        36
        ESTJ       1.00      1.00      1.00        30
        ESTP       1.00      1.00      1.00        72
        INFJ       1.00      1.00      1.00       425
        INFP       1.00      1.00      1.00       439
        INTJ       1.00      1.00      1.00       443
        INTP       1.00      1.00      1.00       436
        ISFJ       1.00      1.00      1.00       133
        ISFP       1.00      1.00      1.00       217
        ISTJ       1.00      1.00      1.00       159
        ISTP       1.00      1.00      1.00       257

    accuracy                           1.00      3

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Assuming you already have the MLP model named model_mlp
mlp_accuracy = accuracy_score(test_target, model_mlp.predict(test_post))

# Store the accuracy score in the models_accuracy dictionary
models_accuracy['MLP'] = mlp_accuracy

# Save the trained MLP model to a file using joblib
joblib.dump(model_mlp, "mlp_model.pkl")

['mlp_model.pkl']

In [27]:
accuracy=pd.DataFrame(models_accuracy.items(),columns=['Models','Test accuracy'])
accuracy

Unnamed: 0,Models,Test accuracy
0,MLP,0.514374


In [25]:
# Load the saved model
ml_model = joblib.load("ml_model.pkl")

# Load the TF-IDF vectorizer
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Preprocess the random text
random_text = "That's another silly misconception. That approaching is logically is going to be the key to unlocking whatever it is you think you are entitled to.   Nobody wants to be approached with BS"


# Clean the random text
cleaned_random_text, _ = clean_text([random_text])

# Lemmatize the cleaned random text
lemmatized_random_text = lemmatizer(cleaned_random_text[0])

# Convert lemmatized random text back to a sentence
lemmatized_random_text_sentence = ' '.join(lemmatized_random_text)

# Transform the lemmatized random text using the TF-IDF vectorizer
text_features = tfidf_vectorizer.transform([lemmatized_random_text_sentence])

# Make predictions
ml_prediction = ml_model.predict(text_features)
ml_prediction_label = label_encoder.inverse_transform(ml_prediction)[0]

print("Predicted personality type:", ml_prediction_label)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 8774.69it/s]

Predicted personality type: INTJ





In [None]:
# Convert data to PyTorch tensors
train_data_tensor = torch.Tensor(train_post.toarray())
train_target_tensor = torch.LongTensor(train_target)
test_data_tensor = torch.Tensor(test_post.toarray())
test_target_tensor = torch.LongTensor(test_target)

In [33]:
# Define a simple RNN model
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])  # Assuming batch_first=True
        return out

# Hyperparameters
input_size = train_data_tensor.shape[1]
hidden_size = 64
output_size = len(label_encoder.classes_)
learning_rate = 0.001
num_epochs = 10
batch_size = 64

# Initialize the RNN model
rnn_model = SimpleRNN(input_size, hidden_size, output_size)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn_model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for i in range(0, len(train_data_tensor), batch_size):
        inputs = train_data_tensor[i:i+batch_size]
        targets = train_target_tensor[i:i+batch_size]
        
        # Forward pass
        outputs = rnn_model(inputs)
        loss = criterion(outputs, targets)
        
        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Save the trained RNN model
torch.save(rnn_model.state_dict(), "trained_rnn_model.pth")

# Load the trained RNN model
loaded_rnn_model = SimpleRNN(input_size, hidden_size, output_size)
loaded_rnn_model.load_state_dict(torch.load("trained_rnn_model.pth"))

# Make predictions using the trained RNN model
with torch.no_grad():
    test_outputs = loaded_rnn_model(test_data_tensor)
    _, predicted = torch.max(test_outputs, 1)
    predicted_labels = label_encoder.inverse_transform(predicted.numpy())

print("Predicted personality types:", predicted_labels)

IndexError: too many indices for tensor of dimension 2

In [None]:




# Define the objective function
def objective(params):
    model = XGBClassifier(**params)
    # Change the scoring metric as needed
    scores = cross_val_score(model, train_post, train_target, cv=5, scoring='accuracy')
    return -scores.mean()  # Minimize negative accuracy

# Define the search space for hyperparameters
param_space = {
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 50, 300, 1),
    'learning_rate': hp.loguniform('learning_rate', -3, -0.1)
}

# Perform Bayesian Optimization
best = fmin(fn=objective, space=param_space, algo=tpe.suggest, max_evals=50)

print("Best Hyperparameters:", best)

In [30]:
%pip install Pillow


Note: you may need to restart the kernel to use updated packages.
