In [4]:
DataFilePath = "../../personality_dataset"

In [5]:
import datasets
from Config import *


def data_load(path):
    train = datasets.load_from_disk(path + "/train")
    valid = datasets.load_from_disk(path + "/valid")
    return train, valid

In [6]:
train, valid = data_load(DataFilePath)

In [7]:
train

Dataset({
    features: ['content', 'personality'],
    num_rows: 58972
})

In [8]:
import numpy as np

def data2np(data):
    words = []
    labels = []
    for content, label in zip(data["content"], data["personality"]):
        word = content.split("|||")
        word.remove("")
        words.append(word)
        labels.append(label)
    return words, labels

In [9]:
data_train = data2np(train)
data_valid = data2np(valid)

In [10]:
data_train[0]

[['just watched the american in line in front of me search "barking ramz remix" on apple music',
  "@user that one's hot summer, hot hot summer",
  "girls aloud were right, it's gonna be a long hot summer",
  "for those that don't know i produce wank pop here you go link",
  'late to party but that spoken word lipsync was great',
  'lady in subway asked for "chipottle" sauce',
  'hey they gotta isaac emoji \u200d',
  'boyfriend said "girl in front of fire" and meant this pic link',
  '@user my tweet refers exclusively to abortion and not sterilisation tho, i feel that this is a separate t… link',
  "what i mean to say is it's explicitly a women's rights issue - they are trying to regain total control over women's… link",
  'yes some men can get pregnant but restricting abortions is an act of oppression targeted at women specifically, and… link',
  'purchased hatsune miku game see you in a month',
  'enough link',
  'just said to myself "fishy fingers for the girly wirlies". i have a 39

In [11]:
import spacy


nlp = spacy.load("en_core_web_sm", exclude=["tok2vec", "tagger", "senter", "attribute_ruler", "lemmatizer"])

In [12]:
import tqdm

def document_creator(words):
    document = ""
    for word in tqdm.tqdm(words, position=0):
        document = "".join([document, " ".join(word)])
    return document


def word_counter(nlp, document):
    document = nlp(document)
    
    word_count = {}
    for token in tqdm.tqdm(document):
        if token.text in word_count.keys():
            word_count[token.text] += 1
        else:
            word_count[token.text] = 1

    return word_count

In [13]:
document = document_creator(data_train[0][0 : 100])

100%|██████████| 100/100 [00:00<00:00, 50123.14it/s]


In [14]:
document



In [15]:
word_count = word_counter(nlp, document)

100%|██████████| 35233/35233 [00:00<00:00, 748534.43it/s]


In [16]:
word_count

{'just': 131,
 'watched': 5,
 'the': 686,
 'american': 1,
 'in': 295,
 'line': 2,
 'front': 3,
 'of': 334,
 'me': 217,
 'search': 2,
 '"': 57,
 'barking': 1,
 'ramz': 1,
 'remix': 2,
 'on': 170,
 'apple': 1,
 'music': 10,
 '@user': 857,
 'that': 321,
 'one': 79,
 "'s": 203,
 'hot': 8,
 'summer': 8,
 ',': 788,
 'girls': 8,
 'aloud': 2,
 'were': 30,
 'right': 24,
 'it': 354,
 'gon': 22,
 'na': 25,
 'be': 184,
 'a': 527,
 'long': 10,
 'for': 203,
 'those': 29,
 'do': 225,
 "n't": 190,
 'know': 88,
 'i': 292,
 'produce': 1,
 'wank': 1,
 'pop': 3,
 'here': 31,
 'you': 370,
 'go': 37,
 'link': 654,
 'late': 3,
 'to': 688,
 'party': 2,
 'but': 169,
 'spoken': 3,
 'word': 6,
 'lipsync': 1,
 'was': 125,
 'great': 9,
 'lady': 3,
 'subway': 1,
 'asked': 5,
 'chipottle': 1,
 'sauce': 2,
 'hey': 2,
 'they': 92,
 'got': 38,
 'ta': 5,
 'isaac': 1,
 'emoji': 2,
 '\u200d': 14,
 'boyfriend': 2,
 'said': 28,
 'girl': 11,
 'fire': 3,
 'and': 507,
 'meant': 6,
 'this': 188,
 'pic': 7,
 'my': 253,
 'tweet':

In [17]:
def classifier_data(train):
    classification = [[] for i in range(8)]
    true_labels = ["I", "S", "T", "J"]
    
    for words, label in zip(train[0][0 : 100], train[1][0 : 100]):
        for i in range(4):
            if label[i] == true_labels[i]:
                classification[i * 2].append(words)
            else:
                classification[i * 2 + 1].append(words)
                
    return classification

In [18]:
classification = classifier_data(data_train)

In [19]:
classification

[[['just watched the american in line in front of me search "barking ramz remix" on apple music',
   "@user that one's hot summer, hot hot summer",
   "girls aloud were right, it's gonna be a long hot summer",
   "for those that don't know i produce wank pop here you go link",
   'late to party but that spoken word lipsync was great',
   'lady in subway asked for "chipottle" sauce',
   'hey they gotta isaac emoji \u200d',
   'boyfriend said "girl in front of fire" and meant this pic link',
   '@user my tweet refers exclusively to abortion and not sterilisation tho, i feel that this is a separate t… link',
   "what i mean to say is it's explicitly a women's rights issue - they are trying to regain total control over women's… link",
   'yes some men can get pregnant but restricting abortions is an act of oppression targeted at women specifically, and… link',
   'purchased hatsune miku game see you in a month',
   'enough link',
   'just said to myself "fishy fingers for the girly wirlies

In [20]:
def count_one(nlp, classification):
    classification_count_every = [[] for i in range(8)]
    for utype in range(8):
        for one in classification[utype]:
            document = " ".join(one)
            classification_count_every[utype].append(word_counter(nlp, document))
    return classification_count_every

In [21]:
classification_count_every = count_one(nlp, classification)

100%|██████████| 323/323 [00:00<00:00, 323408.97it/s]
100%|██████████| 394/394 [00:00<00:00, 395348.27it/s]
100%|██████████| 360/360 [00:00<?, ?it/s]
100%|██████████| 354/354 [00:00<?, ?it/s]
100%|██████████| 327/327 [00:00<00:00, 328040.52it/s]
100%|██████████| 354/354 [00:00<00:00, 355466.51it/s]
100%|██████████| 352/352 [00:00<00:00, 353542.87it/s]
100%|██████████| 329/329 [00:00<00:00, 329337.95it/s]
100%|██████████| 349/349 [00:00<00:00, 349859.49it/s]
100%|██████████| 339/339 [00:00<00:00, 339267.25it/s]
100%|██████████| 346/346 [00:00<?, ?it/s]
100%|██████████| 323/323 [00:00<?, ?it/s]
100%|██████████| 355/355 [00:00<00:00, 355959.34it/s]
100%|██████████| 346/346 [00:00<?, ?it/s]
100%|██████████| 354/354 [00:00<?, ?it/s]
100%|██████████| 384/384 [00:00<00:00, 385498.50it/s]
100%|██████████| 378/378 [00:00<00:00, 379112.13it/s]
100%|██████████| 330/330 [00:00<00:00, 331050.06it/s]
100%|██████████| 320/320 [00:00<00:00, 320941.48it/s]
100%|██████████| 381/381 [00:00<00:00, 382303.

In [22]:
classification_count_every[0][0]

{'just': 4,
 'watched': 1,
 'the': 5,
 'american': 1,
 'in': 6,
 'line': 1,
 'front': 2,
 'of': 4,
 'me': 1,
 'search': 1,
 '"': 8,
 'barking': 1,
 'ramz': 1,
 'remix': 1,
 'on': 1,
 'apple': 1,
 'music': 1,
 '@user': 4,
 'that': 5,
 'one': 2,
 "'s": 5,
 'hot': 4,
 'summer': 3,
 ',': 6,
 'girls': 1,
 'aloud': 1,
 'were': 2,
 'right': 1,
 'it': 3,
 'gon': 1,
 'na': 1,
 'be': 2,
 'a': 7,
 'long': 1,
 'for': 3,
 'those': 1,
 'do': 5,
 "n't": 2,
 'know': 1,
 'i': 5,
 'produce': 1,
 'wank': 1,
 'pop': 1,
 'here': 1,
 'you': 3,
 'go': 1,
 'link': 10,
 'late': 1,
 'to': 6,
 'party': 1,
 'but': 2,
 'spoken': 1,
 'word': 1,
 'lipsync': 1,
 'was': 1,
 'great': 1,
 'lady': 1,
 'subway': 1,
 'asked': 1,
 'chipottle': 1,
 'sauce': 1,
 'hey': 1,
 'they': 3,
 'got': 2,
 'ta': 1,
 'isaac': 1,
 'emoji': 1,
 '\u200d': 1,
 'boyfriend': 1,
 'said': 2,
 'girl': 2,
 'fire': 1,
 'and': 3,
 'meant': 1,
 'this': 3,
 'pic': 1,
 'my': 2,
 'tweet': 1,
 'refers': 1,
 'exclusively': 1,
 'abortion': 1,
 'not': 1,
 '

In [23]:
def count_four(classification):
    counts = []
    for i in range(8):
        document = document_creator(classification[i])
        word_count = word_counter(nlp, document)
        counts.append(word_count)
    return counts

In [24]:
I_count, E_count, S_count, N_count, T_count, F_count, J_count, P_count = count_four(classification)

100%|██████████| 74/74 [00:00<00:00, 74271.00it/s]
100%|██████████| 26177/26177 [00:00<00:00, 687688.58it/s]
100%|██████████| 26/26 [00:00<00:00, 26120.22it/s]
100%|██████████| 9056/9056 [00:00<00:00, 825425.76it/s]
100%|██████████| 14/14 [00:00<00:00, 14017.73it/s]
100%|██████████| 4883/4883 [00:00<00:00, 819953.02it/s]
100%|██████████| 86/86 [00:00<00:00, 86273.65it/s]
100%|██████████| 30352/30352 [00:00<00:00, 811209.34it/s]
100%|██████████| 47/47 [00:00<00:00, 47127.01it/s]
100%|██████████| 16531/16531 [00:00<00:00, 637543.46it/s]
100%|██████████| 53/53 [00:00<?, ?it/s]
100%|██████████| 18708/18708 [00:00<00:00, 815606.34it/s]
100%|██████████| 48/48 [00:00<?, ?it/s]
100%|██████████| 17006/17006 [00:00<00:00, 763956.58it/s]
100%|██████████| 52/52 [00:00<00:00, 50486.99it/s]
100%|██████████| 18231/18231 [00:00<00:00, 870469.08it/s]


In [25]:
I_count

{'just': 100,
 'watched': 4,
 'the': 531,
 'american': 1,
 'in': 223,
 'line': 1,
 'front': 2,
 'of': 273,
 'me': 171,
 'search': 1,
 '"': 49,
 'barking': 1,
 'ramz': 1,
 'remix': 1,
 'on': 117,
 'apple': 1,
 'music': 9,
 '@user': 545,
 'that': 255,
 'one': 61,
 "'s": 171,
 'hot': 6,
 'summer': 8,
 ',': 651,
 'girls': 5,
 'aloud': 2,
 'were': 22,
 'right': 20,
 'it': 277,
 'gon': 16,
 'na': 18,
 'be': 141,
 'a': 407,
 'long': 9,
 'for': 164,
 'those': 23,
 'do': 167,
 "n't": 158,
 'know': 71,
 'i': 179,
 'produce': 1,
 'wank': 1,
 'pop': 3,
 'here': 26,
 'you': 275,
 'go': 26,
 'link': 414,
 'late': 3,
 'to': 536,
 'party': 2,
 'but': 132,
 'spoken': 3,
 'word': 5,
 'lipsync': 1,
 'was': 97,
 'great': 7,
 'lady': 2,
 'subway': 1,
 'asked': 4,
 'chipottle': 1,
 'sauce': 2,
 'hey': 2,
 'they': 67,
 'got': 32,
 'ta': 5,
 'isaac': 1,
 'emoji': 2,
 '\u200d': 7,
 'boyfriend': 2,
 'said': 24,
 'girl': 5,
 'fire': 3,
 'and': 392,
 'meant': 4,
 'this': 143,
 'pic': 6,
 'my': 202,
 'tweet': 3,
 

In [26]:
def remove_special_word(count):
    total_num = len(count)
    min_limited = total_num * 0.001
    del_list = []
    for key, val in count.items():
        if val < min_limited:
            del_list.append(key)
    
    for key in del_list:
        count.pop(key)
    return count

In [27]:
new_dict = remove_special_word(I_count)

In [28]:
new_dict

{'just': 100,
 'the': 531,
 'in': 223,
 'of': 273,
 'me': 171,
 '"': 49,
 'on': 117,
 'music': 9,
 '@user': 545,
 'that': 255,
 'one': 61,
 "'s": 171,
 'hot': 6,
 'summer': 8,
 ',': 651,
 'were': 22,
 'right': 20,
 'it': 277,
 'gon': 16,
 'na': 18,
 'be': 141,
 'a': 407,
 'long': 9,
 'for': 164,
 'those': 23,
 'do': 167,
 "n't": 158,
 'know': 71,
 'i': 179,
 'here': 26,
 'you': 275,
 'go': 26,
 'link': 414,
 'to': 536,
 'but': 132,
 'was': 97,
 'great': 7,
 'they': 67,
 'got': 32,
 '\u200d': 7,
 'said': 24,
 'and': 392,
 'this': 143,
 'pic': 6,
 'my': 202,
 'not': 115,
 'feel': 52,
 'is': 252,
 '…': 137,
 'what': 79,
 'mean': 24,
 'say': 36,
 'women': 11,
 '-': 82,
 'are': 99,
 'over': 25,
 'yes': 8,
 'some': 38,
 'can': 73,
 'get': 65,
 'an': 78,
 'at': 84,
 'see': 35,
 'enough': 8,
 'myself': 19,
 '.': 814,
 'have': 150,
 'will': 21,
 'never': 28,
 'understand': 9,
 'people': 74,
 'who': 40,
 'we': 49,
 'same': 14,
 'school': 13,
 'man': 12,
 'with': 128,
 'like': 126,
 'ME': 13,
 'S

In [29]:
I_count_r = remove_special_word(I_count)
E_count_r = remove_special_word(E_count)
S_count_r = remove_special_word(S_count)
N_count_r = remove_special_word(N_count)
T_count_r = remove_special_word(T_count)
F_count_r = remove_special_word(F_count)
J_count_r = remove_special_word(J_count)
P_count_r = remove_special_word(P_count)

In [30]:
words_IE = list(set(list(I_count_r.keys())+list(E_count_r.keys())))
words_SN = list(set(list(S_count_r.keys())+list(N_count_r.keys())))
words_TF = list(set(list(T_count_r.keys())+list(F_count_r.keys())))
words_JP = list(set(list(J_count_r.keys())+list(P_count_r.keys())))

In [31]:
words_IE

['nice',
 'Sometimes',
 'should',
 'talk',
 'gay',
 '10',
 'anyway',
 '@user_marvels',
 'OMG',
 'doing',
 'day',
 'were',
 'Maybe',
 'join',
 'game',
 'oh',
 'TO',
 'poshmark',
 'me',
 'completely',
 'argument',
 '_',
 'DO',
 'family',
 'tend',
 'hurt',
 'tomorrow',
 'test',
 'It',
 'MUCH',
 'see',
 '!',
 'or',
 'side',
 'ME',
 'from',
 'room',
 'know',
 'gone',
 'agree',
 'sure',
 'Oh',
 "'re",
 'work',
 'VA',
 'social',
 'idea',
 '—',
 'any',
 ':D',
 'we',
 'only',
 'taking',
 '-',
 'make',
 'LOOK',
 'OF',
 'lot',
 'weeks',
 'going',
 'if',
 'sorry',
 'There',
 '>',
 'school',
 'IM',
 'omg',
 'remember',
 'link',
 'our',
 'human',
 'girl',
 'learn',
 '5',
 'each',
 "'d",
 'RN',
 'dating',
 'still',
 '"',
 'makes',
 'Thanks',
 'your',
 'you',
 'which',
 'find',
 'hope',
 'and',
 '6',
 'sense',
 'Yeah',
 'yes',
 'feel',
 'seems',
 'HELP',
 'wars',
 'UP',
 'literally',
 'listen',
 'did',
 'done',
 'Manifesting',
 'single',
 'took',
 '3',
 'You',
 'would',
 'AND',
 '..',
 'last',
 '^^',


In [32]:
def normalize(classification_count_every, valid_words):
    nml_count = [[] for i in range(8)]
    for utype in range(8):
        for one in classification_count_every[utype]:
            vec = []
            for word in valid_words[utype // 2]:
                if word in one.keys():
                    vec.append(one[word])
                else:
                    vec.append(0)
            nml_count[utype].append(vec)
    return nml_count

In [33]:
nml_count = normalize(classification_count_every, [words_IE, words_SN, words_TF, words_JP])

In [34]:
def get_feature_and_labels(nml_count):
    features = []
    labels = []
    for i in range(4):
        features.append(np.append(np.array(nml_count[2 * i]), np.array(nml_count[2 * i + 1]), axis=0))
        labels.append(np.append(np.array([1 for i in range(len(nml_count[2 * i]))]), np.array([0 for i in range(len(nml_count[2 * i + 1]))])))
    
    return features, labels

In [35]:
features, labels = get_feature_and_labels(nml_count)

In [36]:
from sklearn.neighbors import KNeighborsClassifier


knn0 = KNeighborsClassifier(n_neighbors=3, weights="distance").fit(features[0], labels[0])
knn1 = KNeighborsClassifier(n_neighbors=3, weights="distance").fit(features[1], labels[1])
knn2 = KNeighborsClassifier(n_neighbors=3, weights="distance").fit(features[2], labels[2])
knn3 = KNeighborsClassifier(n_neighbors=3, weights="distance").fit(features[3], labels[3])

In [38]:
"I" if knn0.predict([features[0][85]]) == 1 else "E"

'E'