In [188]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import time
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import os
import plotly.express as px
import plotly
import seaborn as sns
import csv
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from bs4 import BeautifulSoup
import spacy
from copy import copy, deepcopy
import unidecode 
import contractions
from nltk.stem import WordNetLemmatizer 
import re
# Fetch the WordNet corpus that WordNetLemmatizer needs.
nltk.download('wordnet')
# Column names shared by the data-loading and modelling cells.
POSTS = "posts"
TYPE = "type"

[nltk_data] Downloading package wordnet to /home/ivan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [156]:
'''
This will load the csv
'''

class CsvToDf:
    '''
    Load a csv file into pandas, either all at once (getWholeCsv) or in
    fixed-size batches (getNextBatchCsv).

    Batches can optionally be cleaned with preprocessing_v1 and have a list of
    words removed from the text column with removeWord.
    '''
    def __init__(self,filename,batchSize=None,cols=None,preProc=False,postCol=False,toReplace=None):
        '''
        filename  : path of the csv file; its header row is read immediately
        batchSize : rows per batch for getNextBatchCsv. This is for data that is
                    too big to fit into memory
        cols      : optional subset of columns to load (forwarded as usecols)
        preProc   : when True, every batch has preprocessing_v1 applied to postCol
        postCol   : name of the text column preProc / toReplace operate on
        toReplace : optional collection of words to drop from postCol
        '''
        self._toReplace = toReplace
        self._preProc = preProc
        self._postCol = postCol
        self._cols = cols
        self._header = None
        self._filename = filename
        self._curIndex = 0     #this will be the current index that we are in the csv
        self._isRead = False   #True once the batch reader has been opened
        self._df = None        #cache of the whole file, filled by getWholeCsv
        self._reader = None    #chunk iterator, opened lazily by _checkIfRead
        self._storeHeader()
        self._batchSize = batchSize
    def preprocessing_v1(self,text):
        '''
        Clean one post: strip html and urls, replace the "|||" separator with a
        space, lower-case, expand contractions, drop accents and lemmatize every
        whitespace-separated token. Non-string values are returned unchanged.
        '''
        if not(isinstance(text,str)):
            return text
        #remove html information
        soup = BeautifulSoup(text, "html.parser")
        processed = soup.get_text(separator=" ")

        #remove http//
        processed = re.sub(r"http\S+", "", processed)

        #remove ||| separator
        processed = re.sub(r'\|\|\|', r' ', processed)

        #lower case
        processed = processed.lower()

        #expand shortened words, e.g. don't to do not
        processed = contractions.fix(processed)

        #remove accented char
        processed = unidecode.unidecode(processed)

        # Lemmatizing: lemmatize() works on a single word, so it has to be called
        # per token. Handing it the whole document leaves the text unchanged.
        # Joining the tokens also collapses runs of whitespace.
        lemmatizer = WordNetLemmatizer()
        return " ".join(lemmatizer.lemmatize(word) for word in processed.split())
    def _storeHeader(self):
        '''Read the first csv row and keep it as the header.'''
        # newline='' is what the csv module asks for when opening files
        with open(self._filename, newline='') as csvFile:
            f = csv.reader(csvFile)
            self._header = next(f)
    def getWholeCsv(self):
        '''Return the whole file as one DataFrame (read once, then cached).'''
        if self._df is None:
            self._df = pd.read_csv(self._filename,usecols=self._cols)
        return self._df
    def getHeader(self):
        '''Return the header row as a list of column names.'''
        return self._header
    def _checkIfRead(self):
        '''
        Open the chunked reader on first use.
        Returns False when the reader was opened by this call, True otherwise.
        '''
        if not(self._isRead):
            # The reader lives in its own attribute so that getWholeCsv and
            # getNextBatchCsv can be mixed on the same object.
            self._reader = pd.read_csv(self._filename,iterator=True,chunksize=self._batchSize,usecols=self._cols)
            self._isRead = True
            return False
        return True
    def removeWord(self,text):
        '''Drop every whitespace-separated word of text that is in toReplace.'''
        if not(isinstance(text,str)):
            return text
        words = text.split()
        cleanWords = [i for i in words if i not in self._toReplace]
        return " ".join(cleanWords)
    def getNextBatchCsv(self):
        '''
        Return the next batch as a DataFrame, or None when the file is exhausted.
        Preprocessing and word removal are applied to postCol when configured.
        '''
        self._checkIfRead()
        out = next(self._reader,None)
        if not isinstance(out,pd.DataFrame):
            return out
        if self._preProc:
            out[self._postCol] = out[self._postCol].apply(self.preprocessing_v1)
        if self._toReplace is not None:
            out[self._postCol] = out[self._postCol].apply(self.removeWord)
        return out

In [157]:
#================ counting the smallest number of data
TEST = "test"
TRAIN = "train"
class Combiner:
    '''
    - Given multiple CsvToDf-like sources, combine their current batches into a
      single dataframe
    - the returned dataframe only has "type" and "posts" as its columns
    '''
    def __init__(self,dataList,columnList):
        '''
        dataList is a list of objects exposing getNextBatchCsv(). columnList holds,
        for the matching entry of dataList, the names of its [type, post] columns.
        The first batch of every source is fetched immediately.
        '''
        assert len(dataList) == len(columnList),"incorrect sizes for data"
        self._dataList = dataList
        self._data = [None for i in range(len(dataList))]
        self._necessaryCol = columnList
        self._typeCol = "type"
        self._postCol = "posts"
        self._incrementData()
    def getNextBatch(self):
        '''
        Return one dataframe with the current batch of every source stacked on
        top of each other (columns renamed to type/posts, index renumbered), then
        advance every source. Returns None when no source had rows left.
        '''
        frames = []
        for data,colList in zip(self._data,self._necessaryCol):
            if isinstance(data,pd.DataFrame):
                # copy so that renaming never touches the source's own frame
                renamedData = data[[colList[0],colList[1]]].copy()
                renamedData.columns = [self._typeCol,self._postCol]
                frames.append(renamedData)
        self._incrementData()
        if not frames:
            return None
        # DataFrame.append no longer exists in pandas 2.x; one concat also avoids
        # re-copying the accumulated frame for every source.
        outData = pd.concat(frames,ignore_index=True)
        if len(outData.index) == 0:
            return None
        return outData
    def _incrementData(self):
        '''Pull the next batch from every source (None once a source is exhausted).'''
        for idx,i in enumerate(self._dataList):
            self._data[idx] = i.getNextBatchCsv()
class Balancer:
    '''
    - Balance the number of posts per personality type
    - Pull batches from a combiner and fill a training list and a testing list
      with at most trainFreq / testFreq rows per type
    - getTrainSet / getTestSet return the two lists as dataframes
    '''
    # the 16 personality types, in the order used by the rest of the notebook
    _PERSONALITIES = ("ENTJ","INTJ","ENTP","INTP","INFJ","INFP","ENFJ","ENFP",
                      "ESTP","ESTJ","ISTP","ISTJ","ISFJ","ISFP","ESFJ","ESFP")
    def __init__(self,combiner,trainFreq,testFreq):
        '''
        combiner  : object exposing getNextBatch() -> dataframe with type/posts columns, or None
        trainFreq : number of training rows wanted for every personality type
        testFreq  : number of testing rows wanted for every personality type
        '''
        self._combiner = combiner
        self._typeCol = "type"
        self._postCol = "posts"
        self._trainFreq = trainFreq
        self._testFreq = testFreq
        self.reset()
    def _freshCounts(self):
        '''Return a new per-type counter with train and test counts at zero.'''
        return {personality:{TRAIN:0,TEST:0} for personality in self._PERSONALITIES}
    def createDataSets(self):
        '''
        Fill the train and test lists. Stops when every type reached both quotas
        or when the combiner has no more data. Always returns True.
        '''
        self.reset()
        while not(self._trainIsUniform()) or not(self._testIsUniform()):
            batch = self._combiner.getNextBatch()
            if not(isinstance(batch,pd.DataFrame)):
                break
            # walk the two columns directly instead of building a Series per row
            for label,post in zip(batch[self._typeCol],batch[self._postCol]):
                if not isinstance(label,str):
                    continue
                personality = label.upper()
                counts = self._personality_count.get(personality)
                if counts is None:
                    continue
                # the training quota is filled first, the overflow goes to testing
                if counts[TRAIN] < self._trainFreq:
                    self._training.append({self._typeCol:personality,self._postCol:post})
                    counts[TRAIN] += 1
                elif counts[TEST] < self._testFreq:
                    self._testing.append({self._typeCol:personality,self._postCol:post})
                    counts[TEST] += 1
        return True
    def reset(self):
        '''Forget everything collected so far.'''
        self._training = []
        self._testing = []
        self._personality_count = self._freshCounts()
    def getTrainSet(self):
        '''Return the collected training rows as a dataframe.'''
        return pd.DataFrame(self._training)
    def getTestSet(self):
        '''Return the collected testing rows as a dataframe.'''
        return pd.DataFrame(self._testing)
    def _trainIsUniform(self):
        #checks if every personality reached its training quota
        return all(counts[TRAIN] >= self._trainFreq for counts in self._personality_count.values())
    def _testIsUniform(self):
        #checks if every personality reached its testing quota
        return all(counts[TEST] >= self._testFreq for counts in self._personality_count.values())
#======================================================

In [158]:
def getPersonalityDict():
    '''
    Map each of the 16 personality type names to a one-hot numpy vector of
    length 16. The position of the 1 follows the order of the names below.
    '''
    names = ("ENTJ", "INTJ", "ENTP", "INTP", "INFJ", "INFP", "ENFJ", "ENFP",
             "ESTP", "ESTJ", "ISTP", "ISTJ", "ISFJ", "ISFP", "ESFJ", "ESFP")
    # each row of the identity matrix is the one-hot code of one type
    identity = np.eye(16)
    return {name: identity[position].copy() for position, name in enumerate(names)}

In [159]:
def counterDf(df):
    '''
    Count how many rows of df belong to each of the 16 personality types.
    The "type" value is upper-cased first; non-string and unknown values are ignored.
    '''
    tally = dict.fromkeys(
        ("ENTJ", "INTJ", "ENTP", "INTP", "INFJ", "INFP", "ENFJ", "ENFP",
         "ESTP", "ESTJ", "ISTP", "ISTJ", "ISFJ", "ISFP", "ESFJ", "ESFP"), 0)
    for _, record in df.iterrows():
        label = record["type"]
        if not isinstance(label, str):
            continue
        key = label.upper()
        if key in tally:
            tally[key] += 1
    return tally

In [160]:
TYPE = "type"
def convertLabels(labelDf):
    '''
    Turn an iterable of personality strings into a numpy array holding one
    one-hot vector (see getPersonalityDict) per string entry.
    Entries that are not strings are skipped, so the result can be shorter
    than the input.
    '''
    lookup = getPersonalityDict()
    one_hots = [lookup[label.upper()] for label in labelDf if isinstance(label, str)]
    return np.array(one_hots)

In [161]:
TYPE = "type"
def get4Dim(df):
    '''
    Split personality labels into four binary label lists, one per dimension.

    df is a sequence of type strings (e.g. the "type" column). The result is a
    list of four lists, each as long as df:
        out[0][i] == 1 when type i starts with "E" (else 0)
        out[1][i] == 1 when its second letter is "S"
        out[2][i] == 1 when its third letter is "T"
        out[3][i] == 1 when its fourth letter is "J"
    Labels are matched case-insensitively. Entries that are not one of the 16
    known types (or not strings) keep 0 in all four lists.
    '''
    known = {"ENTJ", "INTJ", "ENTP", "INTP", "INFJ", "INFP", "ENFJ", "ENFP",
             "ESTP", "ESTJ", "ISTP", "ISTJ", "ISFJ", "ISFP", "ESFJ", "ESFP"}
    poles = "ESTJ"  # the letter that maps to 1 in each of the four positions
    values = list(df)
    out = [[0] * len(values) for _ in poles]
    for idx, personality in enumerate(values):
        if not isinstance(personality, str):
            continue
        # upper-case BEFORE the membership test, otherwise "intj" is rejected
        personality = personality.upper()
        if personality not in known:
            continue
        for dim, pole in enumerate(poles):
            if personality[dim] == pole:
                out[dim][idx] = 1
    return out

In [162]:
def createDimModel(vocab_size=10000,max_length=2016,embedding_dim=256):
    '''
    Build the (uncompiled) binary classifier used for one personality dimension:
    Embedding -> GRU(64, returns the full sequence) -> SimpleRNN(64) -> Dense(1, sigmoid).

    vocab_size    : number of distinct token ids the embedding accepts
    max_length    : length of the padded input sequences
    embedding_dim : size of the vector learned for every token
    The defaults are the values that used to be hard-coded here.
    '''
    return tf.keras.Sequential([
                            tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
                            tf.keras.layers.GRU(64, return_sequences=True),
                            tf.keras.layers.SimpleRNN(64),
                            tf.keras.layers.Dense(1, activation='sigmoid'),])
def train4Dim(trainPost,trainLabels,num_epochs):
    '''
    Train one binary model per entry of trainLabels and return the models in
    the same order.

    trainPost   : padded token sequences shared by every model
    trainLabels : one list of 0/1 labels per personality dimension
    num_epochs  : epochs to train each model for
    '''
    models = []
    # one model per label list, so the function no longer assumes exactly four
    for idx,dimLabels in enumerate(trainLabels):
        print(f"training dim {idx+1}")
        model = createDimModel()
        model.compile(loss = tf.keras.losses.BinaryCrossentropy(), optimizer = 'adam', metrics = ["accuracy"])
        model.fit(trainPost, np.array(dimLabels), epochs = num_epochs, verbose = 1)
        models.append(model)
    return models
def test4Dim(models,testPost,testLabels):
    out = np.array([])
    for idx,model in enumerate(models):
        if len(out) == 0:
            out = np.array([np.squeeze(model.predict(testPost))])
        else:
            print(out)
            out = np.append(out,np.squeeze(model.predict(testPost)),axis=1)
    label = np.array([[]])
    for i in testPost:
        if len(out) == 0:
            label = np.array([label])
        else:
            label = np.append(label,i,axis=1)
    print(f"total accuracy for personality classification = {np.mean((np.sum(label-out,axis=1)) == 0)}")

In [163]:
def removeWord(se,text):
    '''
    Stand-alone version of CsvToDf.removeWord.

    se   : object whose _toReplace attribute holds the words to drop
    text : the text to clean; non-string values are returned unchanged
    Returns text without the whitespace-separated words found in se._toReplace.
    '''
    if not(isinstance(text,str)):
        return text
    # the body used to read an undefined `self` and never returned the result
    return " ".join(word for word in text.split() if word not in se._toReplace)

In [164]:
def dfFactory(fileList,columns,replaceWords,personTrain,batchSize=400,personTest=0):
    '''
    Build a Balancer over the given csv files and fill its data sets.

    fileList     : paths of the csv files to read
    columns      : per file, the names of its [type, post] columns
    replaceWords : words to remove from every post (None to remove nothing)
    personTrain  : training rows wanted per personality type
    batchSize    : rows read from each file per batch (default 400)
    personTest   : testing rows wanted per personality type (default 0)
    Returns the Balancer; use getTrainSet() / getTestSet() on it.
    '''
    files = [CsvToDf(path,batchSize=batchSize,preProc=True,postCol=cols[1],toReplace=replaceWords)
             for path,cols in zip(fileList,columns)]
    combine = Combiner(files,columns)
    balancer = Balancer(combine,personTrain,personTest)
    balancer.createDataSets()
    return balancer

In [165]:
vocab_size = 10000
max_length = 2016
def tokenize(postsSet,tokenizer=None):
    '''
    Turn posts into a padded array of token ids with max_length columns.

    postsSet  : iterable of post strings
    tokenizer : optional, already fitted Tokenizer. Pass the one fitted on the
                training posts when encoding other data, so both share the same
                word index. When omitted, a new Tokenizer limited to vocab_size
                words is fitted on postsSet itself.
    Padding (zeros) is added at the end of each sequence.
    '''
    if tokenizer is None:
        tokenizer = Tokenizer(num_words = vocab_size, oov_token = "<OOV>")
        tokenizer.fit_on_texts(postsSet)
    sequences = tokenizer.texts_to_sequences(postsSet)
    # use the shared max_length constant instead of repeating the literal 2016
    padded = pad_sequences(sequences, padding = 'post', maxlen = max_length)
    return np.array(padded)

In [166]:
def createTrain(fileList,columns,replaceWords,personTrain):
    '''
    Build one balanced training dataframe per entry of replaceWords.
    Each entry is the list of words removed from the posts of that training set.
    '''
    return [dfFactory(fileList,columns,words,personTrain).getTrainSet()
            for words in replaceWords]

In [153]:

def createModels(trainSets,toReplace,num_epochs=1):
    '''
    Train the per-dimension models once for every training set.

    trainSets  : list of dataframes with columns posts and type
    toReplace  : for each training set, the words that were removed from it
                 (only used for the progress message)
    num_epochs : epochs used for every model (default 1, the former fixed value)
    Returns a list with one list of models per training set.
    '''
    out = []
    for idx,trainSet in enumerate(trainSets):
        print(f"words removed: {toReplace[idx]}")
        out.append(train4Dim(tokenize(trainSet[POSTS]),get4Dim(trainSet[TYPE]),num_epochs))
    return out

SyntaxError: EOL while scanning string literal (<ipython-input-153-1f8764a73df4>, line 7)

In [154]:
def testModels(models,testPost=None,testLabels=None):
    '''
    Evaluate every group of dimension models with test4Dim.

    models     : list of lists; each inner list holds one model per dimension
                 (one inner list per set of removed words)
    testPost   : padded test posts handed to test4Dim
    testLabels : per-dimension test labels handed to test4Dim
    Raises ValueError when the test data is not supplied, because test4Dim
    cannot run without it.
    '''
    if testPost is None or testLabels is None:
        raise ValueError("testModels needs testPost and testLabels to evaluate the models")
    for dimModels in models:
        test4Dim(dimModels,testPost,testLabels)

In [73]:
# Source csv and, per file, the names of its [type, post] columns.
filesList = ["../data/mbti_1.csv"]
columns = [["type","posts"]]
# One training set per entry: nothing removed, "suspect" removed, "teaching" removed.
toReplace = [[],["suspect"],["teaching"]]
# 30 is the number of training posts requested for every personality type.
trainSets = createTrain(filesList,columns,toReplace,30)

In [79]:
# Train the per-dimension models once for every training set in trainSets.
models = createModels(trainSets,toReplace)



KeyboardInterrupt: 

In [None]:
# Print the raw outputs of models[0][0] on the posts of the first training set.
print(models[0][0].predict(tokenize(trainSets[0]["posts"])))

In [76]:
# Print the four binary label lists (one per dimension) of the first training set.
print(get4Dim(trainSets[0]["type"]))

[[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,

In [213]:
# Load the whole csv and split it by position: the first training_size rows
# are used for training, the remaining rows for testing.
file1 = CsvToDf("../data/mbti_1.csv")
training_size = 6675
df = file1.getWholeCsv()
# Labels are four binary lists (one per personality dimension), see get4Dim.
training_labels = get4Dim(df[0:training_size]["type"])
testing_labels = get4Dim(df[training_size:]["type"])
testing_posts = df[training_size:]["posts"]
training_posts = df[0:training_size]["posts"]

In [214]:
# Only considering the top 10000 most common words
vocab_size = 10000
max_length = 2016
# We only want to fit the tokenizer on the training, not the testing
tokenizer = Tokenizer(num_words = vocab_size, oov_token = "<OOV>")
tokenizer.fit_on_texts(training_posts)

word_index = tokenizer.word_index

# padding='post' puts the padding (zeros) at the end of the vectorized post.
# No truncating argument is passed, so posts longer than max_length are cut with
# pad_sequences' default setting. NOTE(review): the Keras default is
# truncating='pre', which drops tokens from the START of the post; pass
# truncating='post' if the end should be dropped instead.
training_sequences = tokenizer.texts_to_sequences(training_posts)
training_padded = pad_sequences(training_sequences, padding = 'post', maxlen = max_length)
# training_sequences = np.array(training_sequences)
training_padded = np.array(training_padded)


# The test posts are encoded with the tokenizer fitted on the training posts.
testing_sequences = tokenizer.texts_to_sequences(testing_posts)
testing_padded = pad_sequences(testing_sequences, padding = 'post', maxlen=max_length)
# testing_sequences = np.array(testing_sequences)
testing_padded = np.array(testing_padded)



In [215]:
def shuffleLabel(labels):
    '''
    Return a deep copy of labels in which every inner label list has been
    shuffled in place with np.random.shuffle. The argument itself is untouched.
    '''
    shuffled = deepcopy(labels)
    for column in shuffled:
        # each dimension's labels are permuted independently
        np.random.shuffle(column)
    return shuffled

In [216]:
def permutationTest(training_padded,training_labels,testing_labels,testing_padded,permutation):
    '''
    Permutation test: train the dimension models on shuffled labels and print
    their accuracies, repeated `permutation` times.

    training_padded / testing_padded : padded token sequences
    training_labels / testing_labels : one label list per personality dimension
    permutation                      : number of shuffled rounds to run
    Every round trains for one epoch. Results are printed by getAccuracy and
    getTotalAccuracy; nothing is returned.
    '''
    for _ in range(permutation):
        # shuffleLabel works on a copy, so the real labels stay intact
        shuffled_labels = shuffleLabel(training_labels)
        models = train4Dim(training_padded,shuffled_labels,1)
        getAccuracy(models,testing_labels,testing_padded)
        getTotalAccuracy(models,testing_labels,testing_padded)

In [None]:
# Run 10 rounds of the shuffled-label permutation test.
permutationTest(training_padded,training_labels,testing_labels,testing_padded,10)

training dim 1
training dim 2
training dim 3
training dim 4
accuracy for dim 1 personality classification = 0.771
accuracy for dim 2 personality classification = 0.86
accuracy for dim 3 personality classification = 0.5345
accuracy for dim 4 personality classification = 0.599
total accuracy for personality classification = 0.197
training dim 1
training dim 2
training dim 3
training dim 4
accuracy for dim 1 personality classification = 0.771
accuracy for dim 2 personality classification = 0.86
accuracy for dim 3 personality classification = 0.5405
accuracy for dim 4 personality classification = 0.599
total accuracy for personality classification = 0.202
training dim 1
training dim 2
training dim 3
training dim 4
accuracy for dim 1 personality classification = 0.771
accuracy for dim 2 personality classification = 0.86
accuracy for dim 3 personality classification = 0.5205
accuracy for dim 4 personality classification = 0.599
total accuracy for personality classification = 0.194
training d

In [137]:
# Train the per-dimension models on the real training labels for 5 epochs.
models = train4Dim(training_padded,training_labels,5)

training dim 1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
training dim 2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
training dim 3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
training dim 4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [168]:
def getAccuracy(models,testing_labels,testing_padded):
    '''
    Print, for every dimension model, the fraction of test posts whose rounded
    prediction equals the label of that dimension.
    '''
    for position,dimModel in enumerate(models):
        truth = np.squeeze(testing_labels[position])
        guess = np.squeeze(np.round(dimModel.predict(testing_padded)))
        # a prediction is correct when it does not differ from the label
        mismatch = abs(guess-truth)
        print(f"accuracy for dim {position+1} personality classification = {np.mean(mismatch == 0)}")
#getAccuracy(models,testing_labels)

In [169]:
def getTotalAccuracy(models,testing_labels,testing_padded):
    '''
    Print the fraction of test posts for which every dimension model predicts
    the right label.

    models         : one fitted model per dimension (each exposes predict)
    testing_labels : one list of 0/1 labels per dimension, in the order of models
    testing_padded : model input, forwarded to every model's predict
    '''
    # column_stack always yields a 2-D (posts x dimensions) array, so this also
    # works when there is only a single model
    predicted = np.column_stack(
        [np.round(np.asarray(model.predict(testing_padded)).reshape(-1)) for model in models])
    labels = np.column_stack([np.asarray(col).reshape(-1) for col in testing_labels])
    # a post is correct when no dimension differs from its label
    print(f"total accuracy for personality classification = {np.mean(np.sum(abs(predicted-labels),axis=1) == 0)}")
#getTotalAccuracy(models,testing_labels)

In [None]:
# Evaluate the dimension models on the test split (test4Dim prints the total accuracy).
test4Dim(models,testing_padded,testing_labels)

In [9]:
# embedding_dim is the second argument of Embedding: the length of the vector
# learned for every token. It is unrelated to how many classes the model
# predicts; that is set by the last Dense layer (16 units, one per type).
embedding_dim = 256
'''
Embedding layer will always have vocab_size*embedding_dim parameters. Since vocab_size is 10,000 the number of parameters on this layer will always be large
'''
# Single model that classifies a post directly into one of the 16 types.
model = tf.keras.Sequential([ 
                            tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
                            tf.keras.layers.GRU(256, return_sequences=True),
                            tf.keras.layers.SimpleRNN(128),
                            tf.keras.layers.Dense(16, activation='softmax'),
])

# Categorical cross-entropy: the labels fed to fit must be one-hot vectors of length 16.
model.compile(loss = tf.keras.losses.CategoricalCrossentropy(), optimizer = 'sgd', metrics = ["accuracy"])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [10]:
# Print the layers, output shapes and parameter counts of the 16-class model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2016, 256)         2560000   
_________________________________________________________________
gru (GRU)                    (None, 2016, 256)         393984    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               49280     
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
Total params: 3,005,328
Trainable params: 3,005,328
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs = 10
# `model` ends in a 16-way softmax trained with categorical cross-entropy, so it
# needs one-hot type labels. training_labels holds the four binary lists from
# get4Dim and does not fit this model; build the one-hot labels from the same
# training rows instead.
history = model.fit(training_padded, convertLabels(df[0:training_size]["type"]), epochs = num_epochs, verbose = 1)

In [None]:
# One more epoch for the 16-class model. It is trained on one-hot type labels
# (convertLabels), not on the four binary lists stored in training_labels.
history = model.fit(training_padded, convertLabels(df[0:training_size]["type"]), epochs = 1, verbose = 1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
 128/1600 [=>............................] - ETA: 3:03 - loss: 2.8088 - acc: 0.0547

In [None]:
# Accuracy of the 16-class model: compare the most likely predicted class with
# the class index of the true type. testing_labels holds the four binary lists
# from get4Dim, so the class indices are taken from the one-hot encoding of the
# same test rows instead.
res = np.argmax(model.predict(testing_padded),axis=1)
label = np.argmax(convertLabels(df[training_size:]["type"]),axis=1)
print(f"accuracy = {np.mean((label-res) == 0)}")