In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import time
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import os
import plotly.express as px
import plotly
import seaborn as sns
import csv


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
5,INTJ,'18/37 @.@|||Science is not perfect. No scien...
6,INFJ,"'No, I can't draw on my own nails (haha). Thos..."
7,INTJ,'I tend to build up a collection of things on ...
8,INFJ,"I'm not sure, that's a good question. The dist..."
9,INTP,'https://www.youtube.com/watch?v=w8-egj0y8Qs||...


In [7]:
'''
This will load the csv
'''
class CsvToDf:
    '''
    This class turns the given csv file into a dataframe, either all at once
    (getWholeCsv) or in successive batches (getNextBatchCsv).

    The header row is read when the object is created; the data itself is only read
    when one of the two getters is first called. The two getters keep separate state,
    so they can both be used on the same instance in any order.
    '''
    def __init__(self,filename,batchSize=None,cols=None):
        '''
        filename  -- path of the csv file
        batchSize -- size of data to be read incrementally by getNextBatchCsv. This is for
                     data that is too big to fit into memory. It is handed to pandas as chunksize.
        cols      -- optional columns to load; handed to pandas as usecols (None loads every column)
        '''
        self._cols = cols
        self._header = None
        self._filename = filename
        self._curIndex = 0     # reserved for the current index in the csv; nothing in this class updates it
        self._isRead = False   # True once the batch reader has been opened
        self._df = None        # cached result of getWholeCsv()
        self._reader = None    # chunk iterator used by getNextBatchCsv()
        self._batchSize = batchSize
        self._storeHeader()
    def _storeHeader(self):
        '''Read the first csv row and remember it as the header (None if the file is empty).'''
        # newline="" lets the csv module do its own newline handling
        with open(self._filename,newline="") as csvFile:
            self._header = next(csv.reader(csvFile),None)
    def getWholeCsv(self):
        '''Return the whole file as one dataframe. The file is read once and then cached.'''
        if self._df is None:
            self._df = pd.read_csv(self._filename,usecols=self._cols)
        return self._df
    def getHeader(self):
        '''Return the header row as a list of column names.'''
        return self._header
    def _checkIfRead(self):
        '''
        Open the batch reader on first use.
        Returns False when this call opened the reader and True when it was already open.
        '''
        if self._reader is None:
            self._reader = pd.read_csv(self._filename,iterator=True,chunksize=self._batchSize,usecols=self._cols)
            self._isRead = True
            return False
        return True
    def getNextBatchCsv(self):
        '''Return the next batch as a dataframe, or None once the reader is exhausted.'''
        self._checkIfRead()
        return next(self._reader,None)

In [6]:
#================ combining the datasets and balancing the personality counts
# Keys of the per-personality counters kept by Balancer ({TRAIN: n, TEST: m})
TEST = "test"
TRAIN = "train"
class Combiner:
    '''
    - Given multiple CsvToDf that correspond to a dataset combine them to a single dataframe
    - Each call to getNextBatch() stacks one batch from every source and returns it
    - The returned dataframe only has type and posts as its columns
    '''
    def __init__(self,dataList,columnList):
        '''
        dataList is the list of CsvToDf objects that supply the data and columnList is a list whose
        i-th entry holds the [type column, post column] names of the i-th entry in dataList.
        '''
        assert len(dataList) == len(columnList),"incorrect sizes for data"
        self._dataList = dataList
        self._data = [None for i in range(len(dataList))]
        self._necessaryCol = columnList
        self._typeCol = "type"
        self._postCol = "posts"
        # pre-load the first batch of every source so getNextBatch() has data to return
        self._incrementData()
    def getNextBatch(self):
        '''
        return a dataframe that contains the current batch of every source stacked together,
        or None when no source produced any rows
        '''
        frames = []
        for data,colList in zip(self._data,self._necessaryCol):
            # an exhausted source hands back a non-dataframe value and is skipped
            if isinstance(data,pd.DataFrame):
                # copy so that renaming the columns never touches the source batch
                renamedData = data[[colList[0],colList[1]]].copy()
                renamedData.columns = [self._typeCol,self._postCol]
                frames.append(renamedData)
        self._incrementData()
        if not frames:
            return None
        # a single concat replaces the repeated DataFrame.append calls
        outData = pd.concat(frames,ignore_index=True)
        if len(outData.index) == 0:
            return None
        return outData
    def _incrementData(self):
        '''Fetch the next batch from every source into self._data.'''
        for idx,source in enumerate(self._dataList):
            self._data[idx] = source.getNextBatchCsv()
class Balancer:
    '''
    - Balance the count of every personality type
    - Decide what the training and test data will be
    - Outputs two data frames: the train set and the test set
    - Rows are assigned in the order they are read: a personality's training quota is filled
      first, then its testing quota; rows that arrive after both quotas are full are dropped
    '''
    # the 16 personality types that are counted; any other label is ignored
    _PERSONALITIES = ("ENTJ","INTJ","ENTP","INTP","INFJ","INFP","ENFJ","ENFP",
                      "ESTP","ESTJ","ISTP","ISTJ","ISFJ","ISFP","ESFJ","ESFP")
    def __init__(self,combiner,trainFreq,testFreq):
        '''
        combiner  -- object whose getNextBatch() returns a dataframe with "type" and "posts"
                     columns, or a non-dataframe value when there is no more data
        trainFreq -- number of training rows wanted for every personality type
        testFreq  -- number of testing rows wanted for every personality type
        '''
        self._combiner = combiner
        self._typeCol = "type"
        self._postCol = "posts"
        self._trainFreq = trainFreq
        self._testFreq = testFreq
        self.reset()
    def createDataSets(self):
        '''
        Pull batches from the combiner until every personality type has trainFreq training rows
        and testFreq testing rows, or until the combiner has no more data.

        Returns True when both sets reached their quotas and False when the data ran out first.
        '''
        self.reset()
        # keep reading while either the train or the test set is still short for some personality
        while not(self._trainIsUniform()) or not(self._testIsUniform()):
            batch = self._combiner.getNextBatch()
            if not(isinstance(batch,pd.DataFrame)):
                break
            for rawType,post in zip(batch[self._typeCol],batch[self._postCol]):
                # missing labels are not strings and are skipped
                if not isinstance(rawType,str):
                    continue
                personality = rawType.upper()
                counts = self._personality_count.get(personality)
                if counts is None:
                    continue
                if counts[TRAIN] < self._trainFreq:
                    self._training.append({self._typeCol:personality,self._postCol:post})
                    counts[TRAIN] += 1
                elif counts[TEST] < self._testFreq:
                    self._testing.append({self._typeCol:personality,self._postCol:post})
                    counts[TEST] += 1
        return self._trainIsUniform() and self._testIsUniform()
    def reset(self):
        '''Drop all collected rows and zero every counter. The combiner itself is not rewound.'''
        self._training = []
        self._testing = []
        self._personality_count = {name:{TRAIN:0,TEST:0} for name in self._PERSONALITIES}
    def getTrainSet(self):
        '''Return the collected training rows as a dataframe.'''
        return pd.DataFrame(self._training)
    def getTestSet(self):
        '''Return the collected testing rows as a dataframe.'''
        return pd.DataFrame(self._testing)
    def _trainIsUniform(self):
        #checks if every personality has reached its training quota
        return all(counts[TRAIN] >= self._trainFreq for counts in self._personality_count.values())
    def _testIsUniform(self):
        #checks if every personality has reached its testing quota
        return all(counts[TEST] >= self._testFreq for counts in self._personality_count.values())
#======================================================

In [None]:
def counter(ctd):
    '''
    Count how many rows of each personality type a batched reader yields.

    ctd must offer getNextBatchCsv(); batches are pulled until it returns something
    that is not a dataframe. Labels are upper-cased before counting, and rows whose
    "type" is not a string or not one of the 16 known types are ignored.
    '''
    knownTypes = ("ENTJ", "INTJ", "ENTP", "INTP", "INFJ", "INFP", "ENFJ", "ENFP",
                  "ESTP", "ESTJ", "ISTP", "ISTJ", "ISFJ", "ISFP", "ESFJ", "ESFP")
    tally = dict.fromkeys(knownTypes, 0)
    while True:
        batch = ctd.getNextBatchCsv()
        if not isinstance(batch, pd.DataFrame):
            break
        for _, record in batch.iterrows():
            rawType = record["type"]
            if not isinstance(rawType, str):
                continue
            key = rawType.upper()
            if key in tally:
                tally[key] += 1
    return tally
# NOTE: file1..file4 are the CsvToDf readers created in the In [54] cell further down, so this
# cell only runs after that one. counter() keeps pulling batches until a reader is exhausted,
# so each reader passed here has no batches left afterwards.
print(counter(file1))
print(counter(file2))
print(counter(file3))
print(counter(file4))

{'ENTJ': 231, 'INTJ': 1091, 'ENTP': 685, 'INTP': 1304, 'INFJ': 1470, 'INFP': 1832, 'ENFJ': 190, 'ENFP': 675, 'ESTP': 89, 'ESTJ': 39, 'ISTP': 337, 'ISTJ': 205, 'ISFJ': 166, 'ISFP': 271, 'ESFJ': 42, 'ESFP': 48}
{'ENTJ': 358, 'INTJ': 1837, 'ENTP': 624, 'INTP': 2313, 'INFJ': 1023, 'INFP': 1070, 'ENFJ': 206, 'ENFP': 605, 'ESTP': 88, 'ESTJ': 53, 'ISTP': 445, 'ISTJ': 236, 'ISFJ': 134, 'ISFP': 161, 'ESFJ': 34, 'ESFP': 65}
{'ENTJ': 13886, 'INTJ': 79680, 'ENTP': 29330, 'INTP': 102763, 'INFJ': 31243, 'INFP': 43876, 'ENFJ': 4460, 'ENFP': 22744, 'ESTP': 3808, 'ESTJ': 1354, 'ISTP': 11679, 'ISTJ': 11470, 'ISFJ': 4036, 'ISFP': 3773, 'ESFJ': 400, 'ESFP': 1583}


In [54]:
# One batched reader per dataset
file1 = CsvToDf("../data/mbti_1.csv", batchSize=400)
file2 = CsvToDf("../data/mbti9k_comments.csv", batchSize=100)
file3 = CsvToDf("../data/typed_posts.csv", batchSize=100)
file4 = CsvToDf("../data/typed_comments.csv", batchSize=100)

# For every reader: [label column, text column]
combine = Combiner(
    [file1, file2, file3, file4],
    [
        ["type", "posts"],
        ["type", "comment"],
        ["type", "title"],
        ["type", "comment"],
    ],
)
# 100 training rows and 10 testing rows per personality type
balancer = Balancer(combine, 100, 10)
balancer.createDataSets()

True

In [3]:
def getPersonalityDict():
    '''
    Build the label encoding: each of the 16 personality types maps to its own
    one-hot numpy vector of length 16, in the order the types are listed below.
    '''
    personalities = ("ENTJ", "INTJ", "ENTP", "INTP", "INFJ", "INFP", "ENFJ", "ENFP",
                     "ESTP", "ESTJ", "ISTP", "ISTJ", "ISFJ", "ISFP", "ESFJ", "ESFP")
    encoding = {}
    for position, name in enumerate(personalities):
        oneHot = np.zeros((16,))
        oneHot[position] = 1
        encoding[name] = oneHot
    return encoding
print(getPersonalityDict())

{'ENTJ': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'INTJ': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'ENTP': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'INTP': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'INFJ': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'INFP': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'ENFJ': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'ENFP': array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]), 'ESTP': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]), 'ESTJ': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]), 'ISTP': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), 'ISTJ': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]), 'ISFJ': array([0., 0., 0., 

In [None]:
def counterDf(df):
    '''
    Count the rows of each personality type in a dataframe.

    The label is read from the "type" column and upper-cased; rows whose label is
    not a string or not one of the 16 known types are ignored.
    '''
    knownTypes = ("ENTJ", "INTJ", "ENTP", "INTP", "INFJ", "INFP", "ENFJ", "ENFP",
                  "ESTP", "ESTJ", "ISTP", "ISTJ", "ISFJ", "ISFP", "ESFJ", "ESFP")
    tally = dict.fromkeys(knownTypes, 0)
    for _, record in df.iterrows():
        rawType = record["type"]
        if not isinstance(rawType, str):
            continue
        key = rawType.upper()
        if key in tally:
            tally[key] += 1
    return tally
# Show the class distribution of the balanced training set (uses `balancer` from the In [54] cell)
counterDf(balancer.getTrainSet())

In [55]:
TYPE = "type"
def convertLabels(labelDf):
    '''
    Turn string personality labels into their one-hot float vectors.

    labelDf is any iterable of labels. Each string label is upper-cased and looked up in
    getPersonalityDict(); entries that are not strings are skipped, so the result can have
    fewer rows than the input. A string that is not a known type raises KeyError.
    Returns a numpy array with one row per converted label.
    '''
    oneHotByType = getPersonalityDict()
    vectors = [oneHotByType[label.upper()] for label in labelDf if isinstance(label, str)]
    return np.array(vectors)

In [56]:
# Build each split once, then separate the post text from the one-hot labels
train_set = balancer.getTrainSet()
test_set = balancer.getTestSet()

training_posts, training_labels = train_set["posts"], convertLabels(train_set["type"])
testing_posts, testing_labels = test_set["posts"], convertLabels(test_set["type"])
print(testing_labels)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [57]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary is limited to the 10000 most common words
vocab_size = 10000
# Every vectorized post is padded to this many tokens. 2016 is the length of the longest post
# noted for the dataset. TODO: consider a smaller max_length together with truncating='post'.
max_length = 2016

# The tokenizer is fitted on the training posts only, never on the testing posts
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(training_posts)

word_index = tokenizer.word_index

# padding='post' puts the padding (zeros) at the end of the vectorized sentence.
# `truncating` is not passed, so posts longer than max_length are cut by the Keras default.
training_sequences = tokenizer.texts_to_sequences(training_posts)
training_padded = np.array(
    pad_sequences(training_sequences, maxlen=max_length, padding='post')
)

testing_sequences = tokenizer.texts_to_sequences(testing_posts)
testing_padded = np.array(
    pad_sequences(testing_sequences, maxlen=max_length, padding='post')
)



In [58]:
# The second argument of Embedding is the length of the vector learned for each word (its output
# dimension), not the number of predicted classes. The number of outputs is set by the final Dense
# layer (16 here), so that is the layer to change when predicting the 4 personality dimensions.
embedding_dim = 256
'''
Embedding layer will always have vocab_size*embedding_dim parameters. Since vocab_size is 10,000 the number of parameters on this layer will always be large
'''
model = tf.keras.Sequential([ 
                            tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
                            # return_sequences=True so the SimpleRNN below receives the whole sequence
                            tf.keras.layers.GRU(256, return_sequences=True),
                            tf.keras.layers.SimpleRNN(128),
                            # one softmax output per personality type
                            tf.keras.layers.Dense(16, activation='softmax'),
])

# The labels are one-hot vectors (see convertLabels), which is what categorical cross-entropy expects
model.compile(loss = tf.keras.losses.CategoricalCrossentropy(), optimizer = 'sgd', metrics = ["accuracy"])

In [59]:
# Print the output shape and parameter count of every layer
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 2016, 256)         2560000   
_________________________________________________________________
gru_8 (GRU)                  (None, 2016, 256)         393984    
_________________________________________________________________
simple_rnn_8 (SimpleRNN)     (None, 128)               49280     
_________________________________________________________________
dense_8 (Dense)              (None, 16)                2064      
Total params: 3,005,328
Trainable params: 3,005,328
Non-trainable params: 0
_________________________________________________________________


In [60]:
# Train on the padded training posts. No validation data is passed to fit().
num_epochs = 10
history = model.fit(training_padded, training_labels, epochs = num_epochs, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

In [None]:
# Calls fit() again on the same `model` object for a single epoch and overwrites `history`
# from the previous cell.
# NOTE(review): the output recorded below shows "Epoch 1/10", which does not match epochs = 1 —
# presumably it is stale output from an earlier run; confirm.
history = model.fit(training_padded, training_labels, epochs = 1, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
 192/1600 [==>...........................] - ETA: 3:03 - loss: 2.7852 - acc: 0.0521

In [None]:
# Reduce the softmax outputs and the one-hot labels to class indices, then report the share that match
probabilities = model.predict(testing_padded)
res = np.argmax(probabilities, axis=1)
label = np.argmax(testing_labels, axis=1)
accuracy = np.mean(label == res)
print(f"accuracy = {accuracy}")