<a href="https://colab.research.google.com/github/addy1997/Task9-personality-prediction/blob/main/Training_improved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#import packages
import pickle
import numpy as np

from keras import backend
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.constraints import unitnorm
from keras.regularizers import l2
from keras.initializers import random_uniform
from keras.callbacks import TensorBoard

from sklearn.metrics import roc_auc_score

In [7]:
def get_idx_from_sent(sent, word_idx_map, max_l=51, kernel_size=5):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    pad = kernel_size - 1
    for i in range(pad):
        x.append(0)
    words = sent.split()
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
    while len(x) < max_l+2*pad:
        x.append(0)
    return x

def make_idx_data(revs, word_idx_map, max_l=51, kernel_size=5):
    """
    Transforms sentences into a 2-d matrix.
    """
    train, val, test = [], [], []
    for rev in revs:
        sent = get_idx_from_sent(rev['text'], word_idx_map, max_l, kernel_size)
        sent.append(rev['y'])
        if rev['split'] == 1:
            train.append(sent)
        elif rev['split'] == 0:
            val.append(sent)
    train = np.array(train, dtype=np.int)
    val = np.array(val, dtype=np.int)
    return [train, val]


from google.colab import files
file = files.upload()


print ("loading data...")

with open("imdb-train-val-testN.pickle", 'rb') as f:
    x = pickle.load(f, encoding='latin')
revs, W, word_idx_map, vocab = x[0], x[1], x[2], x[3]
print ("data loaded!")

datasets = make_idx_data(revs, word_idx_map, max_l=2721,kernel_size=5)

Saving imdb-train-val-testN.pickle to imdb-train-val-testN.pickle
loading data...
data loaded!


In [8]:
# Train data preparation
N = datasets[0].shape[0]
conv_input_width = W.shape[1]
conv_input_height = int(datasets[0].shape[1]-1)

# For each word write a word index (not vector) to X tensor
train_X = np.zeros((N, conv_input_height), dtype=np.int)
train_Y = np.zeros((N, 2), dtype=np.int)
for i in range(N):
    for j in range(conv_input_height):
        train_X[i, j] = datasets[0][i, j]
    
print ('train_X.shape = {}'.format(train_X.shape))
print ('train_Y.shape = {}'.format(train_Y.shape))

train_X.shape = (1997, 2729)
train_Y.shape = (1997, 2)


In [9]:
import pandas as pd
url = 'https://raw.githubusercontent.com/addy1997/Task9-personality-prediction/main/essays.csv'
data_train = pd.read_csv(url, encoding='latin')
for i in range(N):
    train_Y[i,data_train.iloc[i,3]] = 1

In [10]:
print(train_X.shape)
print(train_Y)

(1997, 2729)
[[0 1]
 [1 0]
 [0 1]
 ...
 [1 0]
 [1 0]
 [0 1]]


In [11]:
# Validation data preparation
Nv = datasets[1].shape[0]

# For each word write a word index (not vector) to X tensor
val_X = np.zeros((Nv, conv_input_height), dtype=np.int)
val_Y = np.zeros((Nv, 2), dtype=np.int)
for i in range(Nv):
    for j in range(conv_input_height):
        val_X[i, j] = datasets[1][i, j]
    
print ('val_X.shape = {}'.format(val_X.shape))
print ('val_Y.shape = {}'.format(val_Y.shape))
for i in range(Nv):
    val_Y[i,data_train.iloc[i,3]] = 1


val_X.shape = (470, 2729)
val_Y.shape = (470, 2)


In [12]:
import pandas as pd

url = 'https://raw.githubusercontent.com/Atin17/Personality_Prediction_using_Twitter/master/essays.csv'
data_train = pd.read_csv(url, encoding='latin')
for i in range(N):
    train_Y[i,data_train.iloc[i,3]] = 1


In [13]:
print(train_X.shape)
print(train_Y)

(1997, 2729)
[[0 1]
 [1 0]
 [0 1]
 ...
 [1 0]
 [1 0]
 [0 1]]


In [21]:
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Reshape
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Convolution2D, MaxPooling2D
from tensorflow.keras.constraints import UnitNorm
from tensorflow.keras.regularizers import l2
from tensorflow.keras.initializers import random_uniform
from tensorflow.keras.optimizers import Adadelta
backend.set_image_data_format('channels_first')

# Number of feature maps (outputs of convolutional layer)
N_fm = 300
# kernel size of convolutional layer
kernel_size = 8



In [24]:
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Reshape
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Convolution2D, MaxPooling2D
from tensorflow.keras.constraints import UnitNorm
from tensorflow.keras.regularizers import l2
from tensorflow.keras.initializers import random_uniform
from tensorflow.keras.optimizers import Adadelta
backend.set_image_data_format('channels_first')

# Number of feature maps (outputs of convolutional layer)
N_fm = 300
# kernel size of convolutional layer
kernel_size = 8

model = Sequential()
# Embedding layer (lookup table of trainable word vectors)
model.add(Embedding(input_dim=W.shape[0], 
                    output_dim=W.shape[1], 
                    input_length=conv_input_height,
                    weights=[W]))
# Reshape word vectors from Embedding to tensor format suitable for Convolutional layer
model.add(Reshape((1, conv_input_height, conv_input_width)))

# first convolutional layer
model.add(Convolution2D(N_fm, 
                        kernel_size, 
                        conv_input_width, 
                        padding='same', 
                        kernel_regularizer=l2(0.0001)))
# ReLU activation
model.add(Activation('relu'))

# aggregate data in every feature map to scalar using MAX operation
model.add(MaxPooling2D(pool_size=(conv_input_height-kernel_size+1, 1), padding='same'))

model.add(Flatten())
model.add(Dropout(0.5))
# Inner Product layer (as in regular neural network, but without non-linear activation function)
model.add(Dense(2))
# SoftMax activation; actually, Dense+SoftMax works as Multinomial Logistic Regression
model.add(Activation('softmax'))

# Custom optimizers could be used, though right now standard adadelta is employed
opt = Adadelta(lr=1.0, rho=0.95, epsilon=1e-6)
model.compile(loss='categorical_crossentropy', 
              optimizer=opt,
              metrics=['accuracy'])

In [25]:
model.weights

[<tf.Variable 'embedding_2/embeddings:0' shape=(30395, 300) dtype=float32, numpy=
 array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.00714111,  0.00448608,  0.02062988, ...,  0.01416016,
         -0.06689453, -0.15136719],
        [ 0.16894531,  0.390625  ,  0.08642578, ...,  0.01123047,
         -0.02941895,  0.15722656],
        ...,
        [-0.18699868,  0.16601674,  0.0797045 , ...,  0.21439357,
         -0.18920986, -0.02926444],
        [ 0.14319344, -0.17904061,  0.16860904, ..., -0.00733663,
          0.17451356,  0.11404853],
        [ 0.07610573, -0.07467093,  0.11883443, ..., -0.11819206,
         -0.13819139,  0.05161416]], dtype=float32)>,
 <tf.Variable 'conv2d_2/kernel:0' shape=(8, 8, 1, 300) dtype=float32, numpy=
 array([[[[ 1.41785853e-02,  1.17775686e-02, -4.67845425e-03, ...,
           -1.56073961e-02,  8.14453699e-03,  1.87661499e-04]],
 
         [[-2.76058167e-03,  1.74114518e-02, -2.02888716e-03, ...

In [29]:
model.fit(x=train_X,y=train_Y,batch_size=32,epochs=15,verbose=1, validation_data=(val_X,val_Y))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f6c4e31c240>