In [1]:
import numpy as np 
import os 
import pandas as pd 
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt 
import pickle


In [2]:
raw_df= pd.read_csv(r"C:\Users\USER\Desktop\mbti_1.csv", delimiter=',')
raw_df.dataframeName = 'mbti_1.csv'
nRow, nCol = raw_df.shape
print(f'There are {nRow} rows and {nCol} columns')

raw_df.head()

There are 8675 rows and 2 columns


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [3]:
split_df = raw_df[['type']].copy()

split_df['E-I'] = raw_df['type'].str.extract('(.)[N,S]',1)
split_df['N-S'] = raw_df['type'].str.extract('[E,I](.)[F,T]',1)
split_df['T-F'] = raw_df['type'].str.extract('[N,S](.)[J,P]',1)
split_df['J-P'] = raw_df['type'].str.extract('[F,T](.)',1)

# Encode letters to numeric values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
encoded_df = split_df[['type']].copy()
encoded_df['E0-I1'] = le.fit_transform(split_df['E-I'])
encoded_df['N0-S1'] = le.fit_transform(split_df['N-S'])
encoded_df['F0-T1'] = le.fit_transform(split_df['T-F'])
encoded_df['J0-P1'] = le.fit_transform(split_df['J-P'])

binary_type = encoded_df.drop(columns='type')


corrMatrix = binary_type.corr()
corrMatrix.style.background_gradient(cmap='coolwarm')

Unnamed: 0,E0-I1,N0-S1,F0-T1,J0-P1
E0-I1,1.0,0.045899,-0.069573,-0.161939
N0-S1,0.045899,1.0,0.080954,0.014922
F0-T1,-0.069573,0.080954,1.0,0.004673
J0-P1,-0.161939,0.014922,0.004673,1.0


In [4]:
# create a column without http sting
p = "(http.*?\s)"
raw_df['no_url']=raw_df['posts'].replace(p," ",regex=True)
raw_df.head()

Unnamed: 0,type,posts,no_url
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,' and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'Good one _____ course, to which I say I k..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...,'You're fired.|||That's another silly misconce...


In [5]:
# create a column without ||| sting
p = "(\|\|\|)"
raw_df['text']=raw_df['no_url'].replace(p," ",regex=True)
raw_df.head()

Unnamed: 0,type,posts,no_url,text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,' and intj moments sportscenter not top ten...,' and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'Good one _____ course, to which I say I k...","'Good one _____ course, to which I say I k..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...,'You're fired.|||That's another silly misconce...,'You're fired. That's another silly misconcept...


In [6]:
# remove all punctuations
p = "[^\w\s]"
raw_df['text']=raw_df['text'].replace(p," ",regex=True)
raw_df.head()

Unnamed: 0,type,posts,no_url,text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,' and intj moments sportscenter not top ten...,and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...,I m finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'Good one _____ course, to which I say I k...",Good one _____ course to which I say I k...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o...",Dear INTP I enjoyed our conversation the o...
4,ENTJ,'You're fired.|||That's another silly misconce...,'You're fired.|||That's another silly misconce...,You re fired That s another silly misconcept...


In [7]:
# remove underscore
p = "\_"
raw_df['text']=raw_df['text'].replace(p," ",regex=True)
raw_df.head()

Unnamed: 0,type,posts,no_url,text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,' and intj moments sportscenter not top ten...,and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...,I m finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'Good one _____ course, to which I say I k...",Good one course to which I say I k...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o...",Dear INTP I enjoyed our conversation the o...
4,ENTJ,'You're fired.|||That's another silly misconce...,'You're fired.|||That's another silly misconce...,You re fired That s another silly misconcept...


In [8]:
# remove all numbers
p = "\d+"
raw_df['text']=raw_df['text'].replace(p," ",regex=True)
raw_df.head()

Unnamed: 0,type,posts,no_url,text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,' and intj moments sportscenter not top ten...,and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...,I m finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'Good one _____ course, to which I say I k...",Good one course to which I say I k...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o...",Dear INTP I enjoyed our conversation the o...
4,ENTJ,'You're fired.|||That's another silly misconce...,'You're fired.|||That's another silly misconce...,You re fired That s another silly misconcept...


In [9]:
# remove one letter words
p = "\W*\b\w\b"
raw_df['text']=raw_df['text'].replace(p," ",regex=True)
raw_df.head()

Unnamed: 0,type,posts,no_url,text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,' and intj moments sportscenter not top ten...,and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...,I m finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'Good one _____ course, to which I say I k...",Good one course to which I say I k...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o...",Dear INTP I enjoyed our conversation the o...
4,ENTJ,'You're fired.|||That's another silly misconce...,'You're fired.|||That's another silly misconce...,You re fired That s another silly misconcept...


In [10]:
# make everything lowercase
raw_df['text'] = raw_df['text'].str.lower()
raw_df.head()

Unnamed: 0,type,posts,no_url,text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,' and intj moments sportscenter not top ten...,and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...,i m finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'Good one _____ course, to which I say I k...",good one course to which i say i k...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o...",dear intp i enjoyed our conversation the o...
4,ENTJ,'You're fired.|||That's another silly misconce...,'You're fired.|||That's another silly misconce...,you re fired that s another silly misconcept...


In [11]:
# save the cleaned df
cleaned_df = raw_df[['type','text']]
cleaned_df.head()

Unnamed: 0,type,text
0,INFJ,and intj moments sportscenter not top ten...
1,ENTP,i m finding the lack of me in these posts ver...
2,INTP,good one course to which i say i k...
3,INTJ,dear intp i enjoyed our conversation the o...
4,ENTJ,you re fired that s another silly misconcept...


In [12]:
# Encode letters to numeric values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

encoded_df = cleaned_df[['type','text']].copy()
encoded_df['E0-I1'] = le.fit_transform(split_df['E-I'])
encoded_df['N0-S1'] = le.fit_transform(split_df['N-S'])
encoded_df['F0-T1'] = le.fit_transform(split_df['T-F'])
encoded_df['J0-P1'] = le.fit_transform(split_df['J-P'])

encoded_df.head()

Unnamed: 0,type,text,E0-I1,N0-S1,F0-T1,J0-P1
0,INFJ,and intj moments sportscenter not top ten...,1,0,0,0
1,ENTP,i m finding the lack of me in these posts ver...,0,0,1,1
2,INTP,good one course to which i say i k...,1,0,1,1
3,INTJ,dear intp i enjoyed our conversation the o...,1,0,1,0
4,ENTJ,you re fired that s another silly misconcept...,0,0,1,0


In [13]:
# Define X and y
X = encoded_df["text"].values
y_all = encoded_df.drop(columns=['type', 'text'])
# Split training and testing dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_all_train, y_all_test = train_test_split(X, y_all, random_state=0)

In [14]:
# Define TFIDF verctorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=17000,
    min_df=7,
    max_df=0.8,
    stop_words="english",
    ngram_range=(1,3),
)


In [15]:
# create vectors for X
X_train = vectorizer.fit_transform(X_train)
pickle.dump(vectorizer, open(r"C:\Users\USER\Desktop\vectorizer.pkl.txt", 'wb'))
X_test = vectorizer.transform(X_test)

In [16]:
# create log reg model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

In [17]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)

y_EI_train = y_all_train['E0-I1']
y_EI_test = y_all_test['E0-I1']

X_resampled_ros, y_EI_resampled_ros = ros.fit_resample(X_train, y_EI_train)

from collections import Counter
Counter(y_EI_resampled_ros)

Counter({1: 5008, 0: 5008})

In [18]:
classifier.fit(X_resampled_ros, y_EI_resampled_ros)

In [19]:
y_EI_pred_ros = classifier.predict(X_test)
EI_result = pd.DataFrame({"Prediction": y_EI_pred_ros, "Actual": y_EI_test})

In [20]:
pickle.dump(classifier, open(r"C:\Users\USER\Desktop\E-I.pkl.txt",'wb'))

In [21]:
y_NS_train = y_all_train['N0-S1']
y_NS_test = y_all_test['N0-S1']

X_resampled_ros, y_NS_resampled_ros = ros.fit_resample(X_train, y_NS_train)

Counter(y_NS_resampled_ros)

Counter({1: 5592, 0: 5592})

In [22]:
# Fit N-S combination with oversampled x_train and y_NS_train
classifier.fit(X_resampled_ros, y_NS_resampled_ros)

In [23]:
# Predict outcomes for test data set
y_NS_pred_ros = classifier.predict(X_test)
NS_result = pd.DataFrame({"Prediction": y_NS_pred_ros, "Actual": y_NS_test})
NS_result.head(5)

Unnamed: 0,Prediction,Actual
4587,0,1
2786,0,0
2813,0,0
3705,0,0
5957,1,1


In [24]:
pickle.dump(classifier, open(r"C:\Users\USER\Desktop\N-S.pkl.txt", 'wb'))

In [25]:
y_FT_train = y_all_train['F0-T1']
y_FT_test = y_all_test['F0-T1']

X_resampled_ros, y_FT_resampled_ros = ros.fit_resample(X_train, y_FT_train)

Counter(y_FT_resampled_ros)

Counter({1: 3521, 0: 3521})

In [26]:
# Fit F-T combination with oversampled x_train and y_FT_train
classifier.fit(X_resampled_ros, y_FT_resampled_ros)

In [27]:
# Predict outcomes for test data set
y_FT_pred_ros = classifier.predict(X_test)
FT_result = pd.DataFrame({"Prediction": y_FT_pred_ros, "Actual": y_FT_test})
FT_result.head(5)

Unnamed: 0,Prediction,Actual
4587,0,0
2786,0,0
2813,0,0
3705,1,1
5957,0,0


In [28]:
pickle.dump(classifier, open(r"C:\Users\USER\Desktop\F-T.pkl.txt", 'wb'))

In [29]:
# resample J-P combination
y_JP_train = y_all_train['J0-P1']
y_JP_test = y_all_test['J0-P1']

X_resampled_ros, y_JP_resampled_ros = ros.fit_resample(X_train, y_JP_train)

Counter(y_JP_resampled_ros)

Counter({1: 3948, 0: 3948})

In [30]:
classifier.fit(X_resampled_ros, y_JP_resampled_ros)

In [31]:
# Predict outcomes for test data set
y_JP_pred_ros = classifier.predict(X_test)
JP_result = pd.DataFrame({"Prediction": y_JP_pred_ros, "Actual": y_JP_test})
JP_result.head(5)

Unnamed: 0,Prediction,Actual
4587,1,1
2786,1,0
2813,0,1
3705,1,1
5957,1,1


In [32]:
pickle.dump(classifier, open(r"C:\Users\USER\Desktop\J-P.pkl.txt", 'wb'))

In [33]:
loaded_model = pickle.load(open(r"C:\Users\USER\Desktop\E-I.pkl.txt", 'rb'))
result = loaded_model.predict(X_test)

In [34]:
print(result)

[0 1 1 ... 1 1 1]


In [35]:
from sklearn.metrics import accuracy_score
print(f" Logistic regression model accuracy for E-I: {accuracy_score(y_EI_test, y_EI_pred_ros):.3f}")
print(f" Logistic regression model accuracy for N-S: {accuracy_score(y_NS_test, y_NS_pred_ros):.3f}")
print(f" Logistic regression model accuracy for F-T: {accuracy_score(y_FT_test, y_FT_pred_ros):.3f}")
print(f" Logistic regression model accuracy for J-P: {accuracy_score(y_JP_test, y_JP_pred_ros):.3f}")

 Logistic regression model accuracy for E-I: 0.841
 Logistic regression model accuracy for N-S: 0.902
 Logistic regression model accuracy for F-T: 0.857
 Logistic regression model accuracy for J-P: 0.811


In [36]:
# check out classigication report
from sklearn.metrics import confusion_matrix, classification_report

report_EI = classification_report(y_EI_test, y_EI_pred_ros)
print(f"Classification report for E0-I1 group:")
print(report_EI)
print(f"Accuracy score: {accuracy_score(y_EI_test, y_EI_pred_ros):.3f}")
print("--------------------------")

report_NS = classification_report(y_NS_test, y_NS_pred_ros)
print(f"Classification report for N0-S1 group:")
print(report_NS)
print(f"Accuracy score: {accuracy_score(y_NS_test, y_NS_pred_ros):.3f}")
print("--------------------------")

report_FT = classification_report(y_FT_test, y_FT_pred_ros)
print(f"Classification report for F0-T1 group:")
print(report_FT)
print(f"Accuracy score: {accuracy_score(y_FT_test, y_FT_pred_ros):.3f}")
print("--------------------------")

report_JP = classification_report(y_JP_test, y_JP_pred_ros)
print(f"Classification report for J0-P1 group:")
print(report_JP)
print(f"Accuracy score: {accuracy_score(y_JP_test, y_JP_pred_ros):.3f}")

Classification report for E0-I1 group:
              precision    recall  f1-score   support

           0       0.66      0.64      0.65       501
           1       0.89      0.90      0.90      1668

    accuracy                           0.84      2169
   macro avg       0.78      0.77      0.77      2169
weighted avg       0.84      0.84      0.84      2169

Accuracy score: 0.841
--------------------------
Classification report for N0-S1 group:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      1886
           1       0.61      0.68      0.65       283

    accuracy                           0.90      2169
   macro avg       0.78      0.81      0.79      2169
weighted avg       0.91      0.90      0.90      2169

Accuracy score: 0.902
--------------------------
Classification report for F0-T1 group:
              precision    recall  f1-score   support

           0       0.88      0.85      0.86      1173
           1       0.8

In [37]:
X_test.shape

(2169, 17000)

In [38]:
message = ["Hello How are you"]
message = vectorizer.transform(message)

In [39]:
message.shape

(1, 17000)

In [40]:
vectorizer = pickle.load(open(r"C:\Users\USER\Desktop\vectorizer.pkl.txt", 'rb'))


In [41]:
message = ["Hello How are you"]
message = vectorizer.transform(message)