In [2]:
from google.colab import drive
drive.mount('/content/drive')

!ls '/content/drive'

KeyboardInterrupt: ignored

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import time
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import os
import plotly.express as px
import plotly
import seaborn as sns

filePath = "../data/"

df = pd.read_csv(filePath,header=0)
df.head(10)

In [None]:
fig = plt.gcf()
fig.set_size_inches(50, 20)
sns.catplot(x="type", kind="count", data=df,height=8.27, aspect=11.7/8.27)

In [None]:
#
# Just striping the string incase of any whitespace before or after the string
df["type"] = df["type"].str.strip()
# Seperate the the label into four different parts
target_multi_label = df["type"].str.split("" , expand=True)
target_multi_label = target_multi_label.iloc[: , 1:-1]
target_multi_label.columns = ["Personality-1","Personality-2","Personality-3","Personality-4"]

df = pd.concat([df,target_multi_label] , axis=1)

df.head()

In [None]:
fig = plt.gcf()
fig.set_size_inches(50, 20)
sns.catplot(x="Personality-1", kind="count", data=df,height=5, aspect=4/5)
sns.catplot(x="Personality-2", kind="count", data=df,height=5, aspect=4/5)
sns.catplot(x="Personality-3", kind="count", data=df,height=5, aspect=4/5)
sns.catplot(x="Personality-4", kind="count", data=df,height=5, aspect=4/5)

In [None]:
#version1 of text pre-processing

#source:https://towardsdatascience.com/nlp-text-preprocessing-a-practical-guide-and-template-d80874676e79
!pip install Unidecode
!pip install contractions
import nltk
nltk.download('wordnet')

from bs4 import BeautifulSoup
import spacy
import unidecode 
#from word2number import w2n
import contractions
from nltk.stem import WordNetLemmatizer 
import re

def preprocessing_v1(text):
    #remove html information
    soup = BeautifulSoup(text, "html.parser")
    processed = soup.get_text(separator=" ")
    
    #remove http// 
    processed = re.sub(r"http\S+", "", processed)

    #remove ||| seperate
    processed = re.sub(r'\|\|\|', r' ', processed)

    #lower case
    processed = processed.lower()

    #expand shortened words, e.g. don't to do not
    processed = contractions.fix(processed)

    #remove accented char
    processed = unidecode.unidecode(processed)

    #remove white space
    #processed = processed.strip()
    #processed = " ".join(processed.split())

    # Lemmatizing 
    lemmatizer = WordNetLemmatizer() 
    processed=lemmatizer.lemmatize(processed)


    return processed

In [None]:
df['posts'] = df['posts'].apply(preprocessing_v1)
posts = df['posts']
df.head()

In [None]:
posts = df['posts']
for i in range(10):
    print(posts[i])

In [None]:
# Labels for introversion vs extraversion
first_dimension = df['Personality-1']
# The model can't predict a float from a char, so we replace the I with 0 and E with 1
# I couldn't do this in place for some reason so i made a new array. This runtime is pretty shit cuz of append

# Make this np.zeros_lke first dim to fix runtime (?)
first_dim = []

for idx, letter in enumerate(first_dimension):
    if letter is 'I':
        first_dim.append(0)
    else:
        first_dim.append(1)

first_dim = np.array(first_dim)
print(first_dim)

In [None]:
full_type_labels = df['type']
# Enumerating the personality types so that our model can work with numbers
personality_dict = {"ENTJ" : 0, "INTJ" : 1, "ENTP" : 2, "INTP" : 3, "INFJ" : 4, "INFP" : 5, "ENFJ" : 6 , 
                    "ENFP" : 7, "ESTP" : 8, "ESTJ" : 9, "ISTP" : 10, "ISTJ" : 11, "ISFJ" : 12, "ISFP" : 13, 
                    "ESFJ" : 14, "ESFP" : 15}

type_labels = []

# Go through the array and turn the personality type into its corresponding number
for idx, personality in enumerate(full_type_labels):
    type_labels.append(personality_dict[personality])

type_labels = np.array(type_labels)

print(type_labels)
# print(full_type_labels)


In [None]:
###
#   Commenting out this whole block for now because it only predicts one letter
#
####

# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Only considering the top 10000 most common words
# vocab_size = 10000
# max_length = 2016
# # We have 8675 rows
# training_size = 8675//2
# training_posts = posts[0:training_size]
# testing_posts = posts[training_size:]
# # Right now is only predicting introversion or extroversion
# training_labels = first_dim[0:training_size]
# testing_labels = first_dim[training_size:]

# # We only want to fit the tokenizer on the training, not the testing
# tokenizer = Tokenizer(num_words = vocab_size, oov_token = "<OOV>")
# tokenizer.fit_on_texts(training_posts)

# word_index = tokenizer.word_index

# # Puts the padding (which are 0) at the end of the vectorized sentence.
# # The longest post in our dataset is 2016, but we should truncate='post earlier than 2016 words
# training_sequences = tokenizer.texts_to_sequences(training_posts)
# training_padded = pad_sequences(training_sequences, padding = 'post', maxlen = max_length)
# # training_sequences = np.array(training_sequences)
# training_padded = np.array(training_padded)

# testing_sequences = tokenizer.texts_to_sequences(testing_posts)
# testing_padded = pad_sequences(testing_sequences, padding = 'post', maxlen=max_length)
# # testing_sequences = np.array(testing_sequences)
# training_padded = np.array(training_padded)


# print(word_index)
# print(training_padded[0])
# print(training_padded.shape)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Only considering the top 10000 most common words
vocab_size = 10000
max_length = 2016
# We have 8675 rows
training_size = 8675//2
training_posts = posts[0:training_size]
testing_posts = posts[training_size:]
# Right now is only predicting introversion or extroversion
training_labels = type_labels[0:training_size]
testing_labels = type_labels[training_size:]

# We only want to fit the tokenizer on the training, not the testing
tokenizer = Tokenizer(num_words = vocab_size, oov_token = "<OOV>")
tokenizer.fit_on_texts(training_posts)

word_index = tokenizer.word_index

# Puts the padding (which are 0) at the end of the vectorized sentence.
# The longest post in our dataset is 2016, but we should truncate='post' earlier than 2016 words
training_sequences = tokenizer.texts_to_sequences(training_posts)
training_padded = pad_sequences(training_sequences, padding = 'post', maxlen = max_length)
# training_sequences = np.array(training_sequences)
training_padded = np.array(training_padded)

testing_sequences = tokenizer.texts_to_sequences(testing_posts)
testing_padded = pad_sequences(testing_sequences, padding = 'post', maxlen=max_length)
# testing_sequences = np.array(testing_sequences)
training_padded = np.array(training_padded)


print(word_index)
print(training_padded[0])
print(training_padded.shape)

In [None]:
# # Commenting out for similar reasons

# #Second parameter is the output dimension. Therefore, when we are changing this to predict 4 dimensions of personality we should change it to 4
# embedding_dim = 1

# model = tf.keras.Sequential([ 
#                              tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#                              tf.keras.layers.GlobalAveragePooling1D(),
#                              tf.keras.layers.Dense(24, activation='relu'),
#                              tf.keras.layers.Dense(1, activation='sigmoid')
# ])

# model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
#Second parameter is the output dimension. Therefore, when we are changing this to predict 4 dimensions of personality we should change it to 4
# ^^ actually i dont know if that is true
embedding_dim = 256

model = tf.keras.Sequential([ 
                             tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
                             tf.keras.layers.GlobalAveragePooling1D(),
                             tf.keras.layers.LSTM(128),
                            #  tf.keras.layers.Dense(1000, activation='relu'),
                            #  tf.keras.layers.Dense(400, activation='relu'),
                             tf.keras.layers.Dense(128, activation='relu'),
                             tf.keras.layers.Dense(48, activation='relu'),
                             tf.keras.layers.Dense(16, activation='softmax')
])

model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
model.summary()

In [None]:
num_epochs = 30

history = model.fit(training_padded, training_labels, epochs = num_epochs, validation_data=(testing_padded, testing_labels), verbose = 1)

^^ This is super overfit. Probably should have stopped at epoch 10 because the validation accuracy starts decreasing. Also this isn't even using an LSTM or batching, so those need to be implemented.