In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [12]:
# Read the cleaned dataset from its CSV file into a DataFrame.
cleaned_data = pd.read_csv(filepath_or_buffer='data/cleaned.csv')

In [14]:
# Tally how many rows carry each value of the 'type' column; a label that
# never occurs is reported as 0 instead of raising a KeyError.
type_counts = cleaned_data['type'].value_counts()
print(f"Count of 'I': {type_counts.get('I', 0)}")
print(f"Count of 'E': {type_counts.get('E', 0)}")

# Preview the first ten posts from rows whose type is 'E'. Row filter and
# column pick are done in a single .loc selection.
cleaned_data.loc[cleaned_data['type'] == 'E', 'posts'].head(10)

Count of 'I': 80677
Count of 'E': 25390


53580    yeah quit like idea actual think could number ...
53581    idea thing thing valu other perceiv relev what...
53582    respect say year sound bite silli might soft r...
53583    interact unavoid wit student internship pro bo...
53584    core probabl see relationship work also sound ...
53585    golden retriev ever got ta take walk sometim t...
53586    agre hop someon alreadi type ye obvious differ...
53587    wish well pm need someon talk ask question hmm...
53588    tempera question find easi lay around day rela...
53589    associ high ti xntp lot intellectu willpow xst...
Name: posts, dtype: object

In [None]:
import pandas as pd
import textstat
from collections import Counter
import re

# Read the two CSV files that are being compared.
old_dataset = pd.read_csv(filepath_or_buffer='old_dataset.csv')  # earlier dataset
new_dataset = pd.read_csv(filepath_or_buffer='new_dataset.csv')  # current dataset

# Both frames are expected to expose a 'text' column. Missing entries are
# dropped and the remaining values are joined with single spaces, giving one
# long string per dataset.
old_text = " ".join(entry for entry in old_dataset['text'].dropna())
new_text = " ".join(entry for entry in new_dataset['text'].dropna())


In [6]:
# Encode the MBTI letter as a binary target: 'I' -> 0, 'E' -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without
# them, a second execution would look up the already-encoded integers in a
# letters-only mapping, find no match, and turn the whole column into NaN.
# Any value that is in neither form still becomes NaN, as before.
cleaned_data['type'] = cleaned_data['type'].map({'I': 0, 'E': 1, 0: 0, 1: 1})

In [7]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_data['posts'],
    cleaned_data['type'],
    random_state=42,
    test_size=0.2,
)

# Build TF-IDF features limited to 5000 terms with English stop words removed.
# The vectorizer is fitted on the training posts only and then applied
# unchanged to the test posts.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [8]:
# Fit a logistic regression classifier (default settings) on the TF-IDF
# training matrix.
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict labels for the held-out TF-IDF matrix.
y_pred = model.predict(X_test_tfidf)

# Report overall accuracy followed by the per-class classification report.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy: 0.9002
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.94     16134
           1       0.86      0.70      0.77      5080

    accuracy                           0.90     21214
   macro avg       0.88      0.83      0.85     21214
weighted avg       0.90      0.90      0.90     21214



In [9]:
def predict_mbti(post):
    """Classify a single post as 'E' or 'I'.

    The post is wrapped in a one-element list, transformed by the
    module-level ``vectorizer``, and passed to the module-level ``model``.

    Args:
        post: The post to classify (presumably a text string; it is handed
            to ``vectorizer.transform`` unchanged).

    Returns:
        'E' when the model's prediction equals 1, otherwise 'I'.
    """
    features = vectorizer.transform([post])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return 'E'
    return 'I'

# Example usage
# user_post = "i always go outside see how world is "
# print(f"Predicted MBTI type: {predict_mbti(user_post)}")

In [10]:
import streamlit as st

# Streamlit front end: a title, a short instruction, a text box and a button.
# NOTE(review): the output recorded for this cell warns that session state
# does not function without `streamlit run`, so these widgets presumably need
# to live in a script launched that way rather than in a notebook cell.
# Confirm before relying on this cell.
st.title("MBTI Personality Predictor")
st.write("Enter a post to predict whether the author is an Introvert (I) or Extrovert (E)")

user_input = st.text_area("Enter your post here:")
if st.button("Predict"):
    # Blank or whitespace-only input gets a prompt instead of a prediction.
    if not user_input.strip():
        st.write("Please enter a post to get a prediction.")
    else:
        result = predict_mbti(user_input)
        st.write(f"Predicted MBTI type: {result}")

2025-03-05 13:51:24.821 
  command:

    streamlit run C:\Users\24746\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-03-05 13:51:24.821 Session state does not function when running a script without `streamlit run`
