## Importing the Necessary Libraries

In [1]:
from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, LSTM
from keras.layers import Conv1D, Flatten, MaxPooling1D
from keras.utils import pad_sequences

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

2023-04-04 11:00:04.689565: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Setting the Hyperparameters
These will be required for building the neural network. We can play around with these and they will largely affect the accuracy of our model

In [40]:
vocab_size = 1000
max_len =1000
batch_size = 32
embedding_dims =10
filters = 16
ker_size = 3 # kernel size
hidden_dims = 250
epochs_i = 10

## Reading the Dataset from CSV file

In [41]:
data = pd.read_csv('mbti_cleaned.csv')
data.dropna(inplace=True)  # ignoring the NaN values

In [42]:
data.head(100)

Unnamed: 0.1,Unnamed: 0,type,Number of posts,Posts
0,0,INFJ,50,intj moments sportscenter plays pra...
1,1,ENTP,50,finding lack these posts very alarmingsex...
2,2,INTP,50,good course which know thats blessi...
3,3,INTJ,50,dear intp enjoyed conversation other es...
4,4,ENTJ,50,youre firedthats another silly misconception t...
...,...,...,...,...
95,109,INTJ,50,that even anatomically possibleu
96,111,INFP,50,have this toothough theyre usually almost p...
97,112,INFP,50,feel like everyone this thread just needs c...
98,113,ESTP,50,splinter cell blacklist xbox generally well...


In [43]:
# add empty columns for trait pairs
data = data.reindex(columns = data.columns.tolist() + ['E_I','N_S','F_T','J_P'])
# split MBTI into trait pair column values
for i in range(data.shape[0]):
    data.loc[i,'E_I'] = data.iloc[i,1][0]
    data.loc[i,'N_S'] = data.iloc[i,1][1]
    data.loc[i,'F_T'] = data.iloc[i,1][2]
    data.loc[i,'J_P'] = data.iloc[i,1][3]
# create dictionary of trait pair binary encoding values
mbti_binary_values = {'E': 1, 'I': 0, 'N': 1, 'S': 0, 'F': 1, 'T': 0, 'J': 1, 'P': 0}
# create columns of trait pairs as binary encoded values
for col in data.columns[-4:]:
    data[f'{col}_code'] = data[col].map(mbti_binary_values)
# create column of type as binary code
data['type_code'] = data.iloc[:,-4:].apply(lambda x: ''.join(x.values.astype(str)), axis=1)  

data.head()

Unnamed: 0.1,Unnamed: 0,type,Number of posts,Posts,E_I,N_S,F_T,J_P,E_I_code,N_S_code,F_T_code,J_P_code,type_code
0,0.0,INFJ,50.0,intj moments sportscenter plays pra...,I,N,F,J,0.0,1.0,1.0,1.0,0.01.01.01.0
1,1.0,ENTP,50.0,finding lack these posts very alarmingsex...,E,N,T,P,1.0,1.0,0.0,0.0,1.01.00.00.0
2,2.0,INTP,50.0,good course which know thats blessi...,I,N,T,P,0.0,1.0,0.0,0.0,0.01.00.00.0
3,3.0,INTJ,50.0,dear intp enjoyed conversation other es...,I,N,T,J,0.0,1.0,0.0,1.0,0.01.00.01.0
4,4.0,ENTJ,50.0,youre firedthats another silly misconception t...,E,N,T,J,1.0,1.0,0.0,1.0,1.01.00.01.0


In [57]:
# One Hot encoding on the dataset output classes 
y = pd.DataFrame(data[['E_I_code','N_S_code','F_T_code', 'J_P_code']])


In [58]:
x_train,x_test,y_train,y_test = train_test_split(data['Posts'], y,random_state=0)

In [59]:
x_train = x_train.fillna('')
x_test = x_test.fillna('')

In [60]:
tokenizer = text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_train)

In [61]:
x_train = tokenizer.texts_to_matrix(x_train)
x_test = tokenizer.texts_to_matrix(x_test)

In [62]:
x_train = pad_sequences(x_train,maxlen=max_len)
x_test = pad_sequences(x_test,maxlen=max_len)

## Building the Sequential Neural Network using Keras

In [63]:
model = Sequential()
# First we add an embedding layer 
model.add(Embedding(vocab_size,embedding_dims,input_length=max_len)) 
# Adding a 1D convolutional Layer
model.add(Conv1D(filters, ker_size, padding='valid', activation='relu'))
# Max Pooling the Convolutions
model.add(MaxPooling1D())
# Again Computing the Convolutions
model.add(Flatten())
model.add(Dense(hidden_dims, activation='relu'))
model.add(Dense(4, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [64]:
# Training the modeL
model.fit(x_train,y_train, batch_size=batch_size, epochs=5, validation_data=(x_test, y_test),)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f85d23688b0>

## Evaluating the Performance of the Model

In [77]:
model.evaluate(x_test,y_test)[1]*100  # Accuracy of the model



26.9899845123291

In [None]:
# Saving the model into a pickle file 
import pickle
pickle.dump(model,open('cnn_model.pkl','wb'))

In [None]:
pickle.dump(tokenizer,open('tokenizer','wb'))

# For Single Input 

In [69]:
s ='live or not live thats a problem'
s = pd.Series(s)
s= tokenizer.texts_to_matrix(s)
s = pad_sequences(s)
l = model.predict(s)



In [37]:
a,b= l[0][0]*(1/1999), l[0][1]*(1/1197)
a = a/(1/1999)+(1/1197)
b = b/(1/1999)+(1/1197)

In [38]:
l = [a,b,l[0][2],l[0][3]]

In [39]:
s=''
if l[0] >0.5:
    s +='E'
else:
    s+='I'
if l[1] >0.5:
    s+='S'
else:
    s+='N'
if l[2] >0.5:
    s+='T'
else:
    s+='F'
if l[3] >0.5:
    s+='J'
else:
    s+='P'
print('Your Personality is:',s)

Your Personality is: INFP
