# Follow these steps 

1. Download the dataset essays.csv from this link: https://raw.githubusercontent.com/addy1997/Task9-personality-prediction/main/essays.csv

2. Clean the data: the `build_data` and `clean_str` functions below tokenize and normalize the essay text.

3. Download GoogleNews-vectors-negative300.bin file from Link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/view?usp=sharing

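4. Alternatively to step 3, download the smaller GoogleNews-vectors-negative300-SLIM.bin file from Link: https://github.com/eyaler/word2vec-slim/blob/master/GoogleNews-vectors-negative300-SLIM.bin.gz (recommended if running without GPU). If you use this file, set `w2v_file` in the last cell to its name.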

5. After running all the cells below, the processed dataset is saved as '__trait__.pickle' in the current working directory.

In [None]:
#import dependencies

import joblib
import numpy as np
import pandas as pd
import re
import gensim
from collections import defaultdict
import pickle

In [None]:
# Load the essays dataset into a DataFrame.
# encoding='latin' makes pandas decode the file as Latin-1 ('latin' is a
# Python codec alias for latin_1) instead of the default UTF-8.
# build_data() below reads the 'TEXT' and 'cNEU' columns of this frame.

data = pd.read_csv('essays.csv', encoding='latin')
# NOTE(review): a notebook cell only displays its last expression, so this
# bare `data.shape` presumably shows nothing; wrap it in print() if needed.
data.shape
# Last expression of the cell: displays the DataFrame in a notebook.
data

In [None]:
# data processing

def build_data(data, train_ratio=0.8, clean_string=True):
    """
    Convert the essays DataFrame into per-essay records and a vocabulary.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'TEXT' column (essay text) and a 'cNEU' column
        (label, copied unchanged into each record).
    train_ratio : float
        Probability that a record is flagged as training data
        (``split == 1``); the remaining records get ``split == 0``.
        The assignment is random per row, so the realised proportion
        only approximates ``train_ratio``.
    clean_string : bool
        If True the text is normalised with ``clean_str``; otherwise it
        is only stripped and lower-cased.

    Returns
    -------
    revs : list of dict
        One dict per row with keys 'y', 'text', 'num_words' and 'split'.
    vocab : collections.defaultdict(float)
        Maps each word to the number of essays it appears in (document
        frequency: a word is counted once per essay).
    """
    revs = []
    vocab = defaultdict(float)
    # Iterate over the column values positionally. Label-based lookups such
    # as data['TEXT'][i] fail (or pick the wrong row) when the DataFrame does
    # not carry the default 0..n-1 index, e.g. after filtering or shuffling.
    for line, y in zip(data['TEXT'].values, data['cNEU'].values):
        text = line.strip()
        orig_rev = clean_str(text) if clean_string else text.lower()
        tokens = orig_rev.split()
        # set(): count each word at most once per essay.
        for word in set(tokens):
            vocab[word] += 1
        revs.append({
            'y': y,
            'text': orig_rev,
            'num_words': len(tokens),
            'split': int(np.random.rand() < train_ratio),
        })

    return revs, vocab

def get_W(word_vecs, k=300):
    """
    Build the embedding matrix and the word -> row-index lookup.

    Row 0 of the returned matrix is left as all zeros; the words of
    ``word_vecs`` occupy rows 1..len(word_vecs) in iteration order.
    ``k`` is the number of columns and must match the length of each vector.

    Returns a tuple ``(W, word_idx_map)`` where ``W`` is a float64 array of
    shape ``(len(word_vecs) + 1, k)`` and ``word_idx_map[word]`` is the row
    of ``W`` holding that word's vector.
    """
    W = np.zeros(shape=(len(word_vecs) + 1, k), dtype="float64")
    word_idx_map = {}
    # Start numbering at 1 so that row 0 stays the all-zero vector.
    for row, (word, vector) in enumerate(word_vecs.items(), start=1):
        W[row] = vector
        word_idx_map[word] = row
    return W, word_idx_map

def load_bin_vec(fname, vocab):
    """
    Look up pre-trained word2vec vectors for the words in ``vocab``.

    ``fname`` is a binary word2vec file (loaded with gensim's
    ``KeyedVectors.load_word2vec_format(..., binary=True)``).

    Returns a dict mapping each vocabulary word found in the model to its
    vector; words for which the model raises KeyError are left out.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
    word_vecs = {}
    for word in vocab:
        try:
            vector = model.get_vector(word)
        except KeyError:
            # Word is not in the pre-trained model: skip it here.
            continue
        word_vecs[word] = vector
    return word_vecs

def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    Add random vectors for vocabulary words that have no vector yet.

    For every word in ``vocab`` that is missing from ``word_vecs`` and whose
    count ``vocab[word]`` is at least ``min_df``, a new vector of length ``k``
    is drawn uniformly from [-0.25, 0.25) and stored in ``word_vecs``
    (the dict is modified in place; nothing is returned).
    0.25 is chosen so the unknown vectors have (approximately) the same
    variance as the pre-trained ones.

    Each added word is printed, followed by the percentage of the vocabulary
    that had to be added. An empty ``vocab`` prints 0.0 instead of raising
    ZeroDivisionError.
    """
    added = 0.0
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            added += 1
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
            print(word)

    total = len(vocab)
    # Guard the percentage: an empty vocabulary would divide by zero.
    print(added * 100 / total if total else 0.0)
    
def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.

    Characters other than letters, digits, parentheses, comma, '!', '?',
    apostrophe and backtick are replaced by spaces; common English
    contractions are expanded ("n't" -> "not", "'ve" -> "have", ...);
    punctuation marks are separated from words by spaces; runs of
    whitespace are collapsed to a single space.

    The result is stripped, and lower-cased unless ``TREC`` is True
    (every dataset is lower cased except for TREC).
    """
    # Rules are applied in order; the order matters (e.g. contractions must be
    # expanded before whitespace is collapsed).
    rules = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's "),
        (r"\'ve", " have "),
        (r"n\'t", " not "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        # The replacement must be a plain "?": a backslash-escaped question
        # mark in the replacement template is emitted literally by re.sub,
        # which turned every "?" into a backslash + "?" token.
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)
    string = string.strip()
    return string if TREC else string.lower()

In [None]:
# save the data in .pickle format


# Path of the pre-trained word2vec binary to read vectors from.
w2v_file = 'GoogleNews-vectors-negative300.bin'

# Build the per-essay records and the vocabulary from the loaded DataFrame.
revs, vocab = build_data(data, train_ratio=0.8, clean_string=True)
num_words = pd.DataFrame(revs)["num_words"]
max_l = np.max(num_words)
print('data loaded!!!')
print(f'number of sentences:{len(revs)}')
print(f"vocab size: {len(vocab)}")
print(f"max sentence length: {max_l}")

# Fetch pre-trained vectors for the vocabulary words.
print("loading word2vec vectors...")
w2v = load_bin_vec(w2v_file, vocab)
print('word2vec loaded!')
print(f'num words already in word2vec:{len(w2v)}')

# Words without a pre-trained vector get a random one, then everything is
# packed into the embedding matrix W and its word -> row lookup.
add_unknown_words(w2v, vocab)
W, word_idx_map = get_W(w2v)

# Persist the records, embedding matrix, lookup and vocabulary together.
file = '__trait__.pickle'
with open(file, 'wb') as handle:
    pickle.dump([revs, W, word_idx_map, vocab], handle, protocol=None)
print("dataset created")