In [14]:
from google.colab import files
uploaded = files.upload()


Saving MBTI Dataset.zip to MBTI Dataset (1).zip


In [15]:
# Step 1: Unzip the File

import zipfile

# Unzip the uploaded file
with zipfile.ZipFile("MBTI Dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("mbti_data")


In [16]:
# Step 2: Locate and Read the CSV

import pandas as pd

# Load the CSV (adjust name if different)
df = pd.read_csv("mbti_data/mbti_1.csv")
df.head()


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [17]:
#  Step 3: Text Cleaning Function

import string
import re

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

# Apply to 'posts' column
df['cleaned_posts'] = df['posts'].apply(clean_text)
df[['type', 'cleaned_posts']].head()


Unnamed: 0,type,cleaned_posts
0,INFJ,and intj moments sportscenter not top ten ...
1,ENTP,im finding the lack of me in these posts very ...
2,INTP,good one course to which i say i know tha...
3,INTJ,dear intp i enjoyed our conversation the oth...
4,ENTJ,youre firedthats another silly misconception t...


In [18]:
# Step 2: Apply TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Limit to 1000 features for speed

# Fit & transform
X = vectorizer.fit_transform(df['cleaned_posts'])

# Labels (Y)
y = df['type']

print("TF-IDF Matrix Shape:", X.shape)


TF-IDF Matrix Shape: (8675, 1000)


In [19]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['type'])  # Now y will be numeric


In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [22]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6414985590778098
              precision    recall  f1-score   support

           0       0.62      0.20      0.30        41
           1       0.65      0.58      0.61       125
           2       0.71      0.45      0.56        44
           3       0.66      0.57      0.61       135
           4       0.00      0.00      0.00         7
           5       0.00      0.00      0.00         8
           6       0.00      0.00      0.00         7
           7       0.50      0.13      0.21        15
           8       0.64      0.68      0.66       288
           9       0.62      0.82      0.71       370
          10       0.58      0.68      0.63       193
          11       0.67      0.77      0.71       293
          12       0.93      0.29      0.44        45
          13       0.77      0.32      0.45        53
          14       0.76      0.36      0.49        44
          15       0.70      0.48      0.57        67

    accuracy                           0.64      17

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
# Model Improvement Using SVM

# 1. Import & Train SVM

from sklearn.svm import LinearSVC

# Train SVM model
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)


In [24]:
# 2. Make Predictions

y_pred_svm = svm_model.predict(X_test)


In [25]:
#  3. Evaluate the Model

from sklearn.metrics import classification_report, accuracy_score

# Print accuracy
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", svm_accuracy)

# Print detailed classification report
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.6155619596541787
              precision    recall  f1-score   support

           0       0.58      0.34      0.43        41
           1       0.56      0.58      0.57       125
           2       0.62      0.57      0.60        44
           3       0.60      0.58      0.59       135
           4       0.00      0.00      0.00         7
           5       0.00      0.00      0.00         8
           6       0.50      0.29      0.36         7
           7       0.50      0.27      0.35        15
           8       0.61      0.65      0.63       288
           9       0.65      0.74      0.69       370
          10       0.53      0.62      0.57       193
          11       0.65      0.69      0.67       293
          12       0.74      0.44      0.56        45
          13       0.69      0.38      0.49        53
          14       0.63      0.43      0.51        44
          15       0.58      0.45      0.50        67

    accuracy                           0.62    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
# Train SVM with RBF Kernel

from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Train SVM with RBF kernel
svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_rbf.fit(X_train, y_train)

# Predict
y_pred_rbf = svm_rbf.predict(X_test)

# Evaluate
print("SVM RBF Accuracy:", accuracy_score(y_test, y_pred_rbf))
print(classification_report(y_test, y_pred_rbf))


SVM RBF Accuracy: 0.6403458213256484
              precision    recall  f1-score   support

           0       0.71      0.29      0.41        41
           1       0.70      0.58      0.64       125
           2       0.77      0.45      0.57        44
           3       0.65      0.56      0.60       135
           4       0.00      0.00      0.00         7
           5       0.00      0.00      0.00         8
           6       0.00      0.00      0.00         7
           7       0.80      0.27      0.40        15
           8       0.62      0.63      0.62       288
           9       0.61      0.80      0.69       370
          10       0.61      0.68      0.64       193
          11       0.66      0.78      0.71       293
          12       0.94      0.38      0.54        45
          13       0.72      0.40      0.51        53
          14       0.70      0.36      0.48        44
          15       0.69      0.54      0.61        67

    accuracy                           0.64

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
