In [1]:
import pandas as pd
import numpy as np

try:
    import gensim
except:
    !pip install gensim
    import gensim
    
try:
    import keras
except:
    !pip install tensorflow
    !pip install keras
    import keras

try:
    from sklearn.model_selection import train_test_split
except:
    !pip install sklearn
    from sklearn.model_selection import train_test_split

try:
    import matplotlib.pyplot as plt
except:
    !pip install matplotlib
    import matplotlib.pyplot as plt
    
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.backend.tensorflow_backend import set_session
from keras.backend.tensorflow_backend import clear_session
from keras.backend.tensorflow_backend import get_session
import tensorflow as tf
import gc

Using TensorFlow backend.


Collecting matplotlib
  Downloading matplotlib-3.3.4-cp37-cp37m-win_amd64.whl (8.5 MB)
Collecting pillow>=6.2.0
  Downloading Pillow-8.1.1-cp37-cp37m-win_amd64.whl (2.2 MB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.1-cp37-cp37m-win_amd64.whl (51 kB)
Collecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Installing collected packages: pillow, kiwisolver, cycler, matplotlib
Successfully installed cycler-0.10.0 kiwisolver-1.3.1 matplotlib-3.3.4 pillow-8.1.1


In [2]:
def file_reader(file_location):
    """Load a delimited text file into a pandas DataFrame.

    The parser is chosen from the end of the path (case-insensitive):

    * ``...csv`` -> comma separated, first column used as the index
    * ``...tsv`` -> tab separated, default integer index

    Parameters
    ----------
    file_location : str or path-like
        Location of the file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    ValueError
        If the path ends in neither ``csv`` nor ``tsv``. Previously this case
        silently returned ``None`` and only failed later in the caller.
    """
    # Compare on a lower-cased copy so 'DATA.CSV' is handled like 'data.csv';
    # the original path is still what gets passed to pandas.
    lowered = str(file_location).lower()
    if lowered.endswith('csv'):
        return pd.read_csv(file_location, engine='python', index_col=0)
    if lowered.endswith('tsv'):
        return pd.read_csv(file_location, engine='python', sep='\t')
    raise ValueError(
        "Unsupported file type for %r: expected a path ending in 'csv' or 'tsv'"
        % (file_location,)
    )

def read_dataset( file_location ):
    """Read a dataset and add a whitespace-tokenised copy of its text column.

    Parameters
    ----------
    file_location : str
        Path handed to ``file_reader``. The resulting frame must contain a
        ``TEXT`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with an extra ``TOKENS`` column; each entry is the
        list produced by ``str.split()`` on the matching ``TEXT`` value.
    """
    df = file_reader(file_location)
    # Walk the column by position. The earlier ``df['TEXT'][i]`` was a *label*
    # lookup, which fails (or pairs the wrong rows) whenever the index read
    # from the file is not exactly 0..n-1.
    df['TOKENS'] = [text.split() for text in df['TEXT']]
    return df

In [3]:
def tokens_to_sequence( tokenizer , texts , max_length ):
    """Encode texts as integer sequences of a fixed length.

    Parameters
    ----------
    tokenizer : object
        A fitted tokenizer exposing ``texts_to_sequences``.
    texts : iterable of str
        Raw texts to encode.
    max_length : int
        Target length passed to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    The result of ``pad_sequences`` with ``padding='post'`` (padding is
    appended after the tokens).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences( texts ),
        maxlen=max_length,
        padding='post',
    )
    
def tokenize_dataset(df, max_length):
    """Fit a Keras ``Tokenizer`` on the ``TEXT`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``TEXT`` (raw strings) and ``TOKENS`` (lists of tokens).
    max_length : int
        Accepted for call-site symmetry; not used by this function.

    Returns
    -------
    tuple
        ``(tokenizer, num_words)`` where ``num_words`` is the number of
        distinct entries found across all ``TOKENS`` lists.
    """
    # Count distinct raw tokens; this count caps the tokenizer's vocabulary.
    vocabulary = set()
    for tokens in df['TOKENS']:
        vocabulary.update(tokens)
    num_words = len(vocabulary)

    tokenizer = Tokenizer(
        num_words=num_words,
        lower=True,
        char_level=False,
        oov_token="<OOV>",
    )
    tokenizer.fit_on_texts(df['TEXT'].tolist())
    return (tokenizer, num_words)

def get_embeddings(word_index, word2vec_path='input_files/GoogleNews-vectors-negative300.bin'):
    """Build an embedding matrix for a tokenizer vocabulary from word2vec vectors.

    Parameters
    ----------
    word_index : dict
        Mapping ``word -> integer index`` (e.g. ``tokenizer.word_index``).
    word2vec_path : str, optional
        Location of a binary word2vec file. Defaults to the path the notebook
        has always used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, vector_size)``. Row ``index``
        holds the pretrained vector for the word mapped to ``index``; rows for
        words missing from the pretrained vocabulary (and row 0) stay zero.
    """
    word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    # Take the width from the loaded vectors rather than from a notebook-level
    # constant, so the matrix always matches the vectors copied into it.
    embeddings = np.zeros((len(word_index) + 1, word2vec.vector_size))
    for word, index in word_index.items():
        if word in word2vec:
            embeddings[index, :] = word2vec[word]
    return embeddings

def embedded_values(data, embeddings):
    """Mean-pool embedding rows for every index sequence in ``data``.

    Parameters
    ----------
    data : iterable of iterable of int
        Sequences of row indices into ``embeddings``.
    embeddings : array-like, shape (vocab_size, dim)
        Embedding matrix.

    Returns
    -------
    numpy.ndarray
        One row per sequence holding the mean of the selected embedding rows.
        Indices ``>= len(embeddings)`` are skipped. A sequence with no usable
        index yields a zero vector of width ``dim``.
    """
    embeddings = np.asarray(embeddings)
    vocab_size = len(embeddings)
    # Size the fallback from the matrix itself; a fixed width of 300 produced
    # ragged output for any other embedding dimensionality.
    fallback = np.zeros(embeddings.shape[1])

    pooled = []
    for sequence in data:
        vectors = [embeddings[w] for w in sequence if w < vocab_size]
        pooled.append(np.mean(vectors, axis=0) if vectors else fallback)
    return np.array(pooled)

In [4]:
def reset_keras():
    """Clear Keras' global state and run a garbage-collection pass.

    Calls ``tf.keras.backend.clear_session()`` and then prints the value
    returned by ``gc.collect()``.

    Notes
    -----
    Earlier revisions also opened a ``tf.compat.v1.Session`` that was never
    closed, and tried to ``del model1`` / ``del model0``. Those names are not
    defined inside this function, so the ``del`` statements always raised and
    were swallowed by a bare ``except``. To release models, drop the caller's
    own references before calling this function.
    """
    tf.keras.backend.clear_session()
    print(gc.collect())


In [5]:
def generate_model(input_len, embeddings):
    """Build and compile the 1-D convolutional binary classifier.

    Architecture: frozen Embedding initialised from ``embeddings`` ->
    three Conv1D layers (128, 8, 256 filters, kernel size 4, ReLU) ->
    global average pooling -> Dropout(0.4) -> Dense(4, sigmoid) ->
    Dense(1, sigmoid).

    Parameters
    ----------
    input_len : int
        Length of each input sequence.
    embeddings : numpy.ndarray, shape (vocab_size, dim)
        Pretrained embedding matrix used as the (non-trainable) weights of the
        Embedding layer.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with binary cross-entropy loss, the Adam optimizer and
        the ``acc`` metric.
    """
    # Both Embedding dimensions come from the matrix that supplies its weights,
    # so they cannot drift apart from a separately maintained constant.
    vocab_size, vector_dim = embeddings.shape[0], embeddings.shape[1]

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(
                                   input_dim = vocab_size,
                                   output_dim = vector_dim,
                                   weights = [embeddings],
                                   input_length = input_len ,
                                   trainable = False
                                  ),
        tf.keras.layers.Conv1D(
                                filters=128,
                                kernel_size=4,
                                activation='relu',
                              ),
        tf.keras.layers.Conv1D(
                                filters=8,
                                kernel_size=4,
                                activation='relu',
                              ),
        # No kernel/bias/activity regularisation is applied to this layer.
        tf.keras.layers.Conv1D(
                                filters=256,
                                kernel_size=4,
                                activation='relu',
                              ),
        tf.keras.layers.GlobalAvgPool1D(),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(4,activation='sigmoid'),
        tf.keras.layers.Dense(1,activation='sigmoid')
    ])
    model.compile( loss='binary_crossentropy' , optimizer = 'adam', metrics=['acc'] )
    return model

In [6]:
def train_model(X, Y, embeddings, num_epoch, batch_size=200):
    """Train one detector per class value of a binary label.

    ``model1`` is fitted on the target ``Y == 1`` and ``model0`` on the target
    ``Y == 0``; both share the same inputs and architecture.

    Parameters
    ----------
    X : array-like, shape (n_samples, sequence_length)
        Model inputs; ``X.shape[1]`` is used as the model's input length.
    Y : array-like
        Labels, compared against 1 and 0 to build the two targets.
    embeddings : numpy.ndarray
        Embedding matrix forwarded to ``generate_model``.
    num_epoch : int
        Number of training epochs for each model.
    batch_size : int, optional
        Mini-batch size for both fits. Defaults to 200, the value that used to
        be fixed inside this function.

    Returns
    -------
    tuple
        ``(model1, model0, hist1, hist0)`` - the two fitted models followed by
        the objects returned by their ``fit`` calls.
    """
    fit_options = dict(epochs=num_epoch, shuffle=True, batch_size=batch_size, verbose=1)

    model1 = generate_model(X.shape[1], embeddings)
    model0 = generate_model(X.shape[1], embeddings)
    hist1 = model1.fit(X, (Y == 1).astype(int), **fit_options)
    hist0 = model0.fit(X, (Y == 0).astype(int), **fit_options)
    return (model1, model0, hist1, hist0)

def predict_model(model1, model0, X):
    """Predict a 0/1 label per sample by comparing the two models' scores.

    Parameters
    ----------
    model1, model0 : objects with a ``predict`` method
        Scorers for class 1 and class 0 respectively.
    X : array-like
        Inputs passed unchanged to both ``predict`` calls.

    Returns
    -------
    numpy.ndarray of int
        1 where ``model1``'s score is strictly greater than ``model0``'s,
        otherwise 0 (ties resolve to 0).
    """
    positive_scores = model1.predict(X).T[0]
    negative_scores = model0.predict(X).T[0]
    is_positive = positive_scores > negative_scores
    return is_positive.astype(int)

def test_model(model1, model0, X_test, Y_test, num_epoch=None, X_train=None, Y_train=None):
    """Compute train and test accuracy for a ``(model1, model0)`` pair.

    Parameters
    ----------
    model1, model0 : fitted models
        Passed to ``predict_model``.
    X_test, Y_test : array-like
        Held-out inputs and their 0/1 labels.
    num_epoch : optional
        Unused; kept so existing positional calls keep working.
    X_train, Y_train : array-like, optional
        Training inputs and labels. When omitted, module-level variables named
        ``X_train`` / ``Y_train`` are used if they exist, which is what this
        function implicitly relied on before.

    Returns
    -------
    tuple
        ``(train_score, test_score)``. Each score is the fraction of
        predictions equal to the label. ``train_score`` is ``None`` when no
        training data is available.
    """
    # Explicit arguments win; otherwise fall back to notebook-level variables
    # of the same name instead of raising NameError when they are absent.
    if X_train is None:
        X_train = globals().get('X_train')
    if Y_train is None:
        Y_train = globals().get('Y_train')

    train_score = None
    if X_train is not None and Y_train is not None:
        ptrain = predict_model(model1, model0, X_train)
        train_score = np.mean( (ptrain == Y_train).astype(int) )

    ptest = predict_model(model1, model0, X_test)
    test_score = np.mean( (ptest == Y_test).astype(int) )
    return train_score, test_score


def gen_LOOCV_sets(X, Y, i):
    """Split ``X`` and ``Y`` for one leave-one-out cross-validation fold.

    Parameters
    ----------
    X : array-like, shape (n_samples, ...)
        Sample inputs.
    Y : pandas Series/DataFrame or array-like, length n_samples
        Labels aligned with ``X`` by position.
    i : int
        Position of the sample to hold out.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``: ``X_train`` / ``Y_train`` omit
        position ``i``; ``X_test`` is a length-1 array holding ``X[i]``;
        ``Y_test`` is the single label at position ``i``.
    """
    X_train = np.delete(X, i, axis = 0)
    X_test  = np.array( [X[i]] )

    # Labels come from the Y argument (the previous code read an unrelated
    # notebook-level frame). Everything is positional so that it stays aligned
    # with the rows of X regardless of Y's index labels.
    if hasattr(Y, 'iloc'):
        kept_positions = np.delete(np.arange(len(Y)), i)
        Y_train = Y.iloc[kept_positions]
        Y_test  = Y.iloc[i]
    else:
        labels  = np.asarray(Y)
        Y_train = np.delete(labels, i, axis = 0)
        Y_test  = labels[i]
    return (X_train, X_test, Y_train, Y_test)

In [7]:
# Notebook-wide settings.
# Label columns of the essays frame; one pair of models is trained per entry.
trait_names   = ['cEXT' , 'cNEU' , 'cAGR' , 'cCON' , 'cOPN']
# Length every token sequence is padded to by tokens_to_sequence.
max_length    = 300
# Width of the word vectors; presumably must match the pretrained word2vec file (300-d) - verify.
embedding_dim = 300
# NOTE(review): the training loop further down passes a literal epoch count (20), not this value - confirm which is intended.
num_epochs    = 80
# NOTE(review): the training loop further down does not pass this value to train_model - confirm which batch size is intended.
batch_size    = 100

In [8]:
# Load the essays, fit the tokenizer on them and build the embedding matrix.
df_essay             = read_dataset('processed_datasets/essays.csv')
tokenizer, num_words = tokenize_dataset(df_essay, max_length)
embeddings           = get_embeddings(tokenizer.word_index)
essay_sequences      = tokens_to_sequence(tokenizer, df_essay['TEXT'], max_length)
# Train on the padded token-id sequences. The network built by generate_model
# starts with an Embedding layer, which looks vectors up by integer token id,
# and the tweets are later scored as token-id sequences as well. Feeding it
# mean-pooled float vectors (embedded_values) made training and prediction use
# different kinds of input.
X_Train              = essay_sequences

In [9]:
# Load the tweets and encode both text columns with the essay-fitted tokenizer.
df_tweets = file_reader('processed_datasets/tweets_processed.csv')
pre_tweet_seq, post_tweet_seq = (
    tokens_to_sequence(tokenizer, df_tweets[text_column], max_length)
    for text_column in ('PRETEXT', 'POSTTEXT')
)

In [None]:
# Collect per-person predictions: one 'pre_<trait>' and one 'post_<trait>'
# column for every entry in trait_names.
tweets_preds = pd.DataFrame()
tweets_preds['PERSON'] = df_tweets.index
# trait -> [history of the class-0 model, history of the class-1 model]
hists = {}
for trait in trait_names:
    # Train the two models for this trait on the essays (20 epochs each).
    m1, m0, h1, h0    = train_model(X_Train, df_essay[trait], embeddings, 20)
    # Score the PRETEXT and POSTTEXT tweet sequences with the trained pair.
    tweets_preds_pre  = predict_model(m1, m0, pre_tweet_seq)
    tweets_preds_post = predict_model(m1, m0, post_tweet_seq)
    
    # Note the order: class-0 history first, class-1 history second.
    hists[trait] = [h0,h1]
    tweets_preds['pre_'+trait]  = tweets_preds_pre
    tweets_preds['post_'+trait] = tweets_preds_post
    print('Done   :', trait)

Train on 2468 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 2468 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Done   : cEXT
Train on 2468 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 2468 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20


Epoch 18/20
Epoch 19/20
Epoch 20/20
Done   : cNEU
Train on 2468 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
 200/2468 [=>............................] - ETA: 0s - loss: 0.6907 - acc: 0.5350

In [19]:
# Persist the per-person trait predictions.
# NOTE(review): assumes the 'result/' directory already exists - confirm, or create it before running this cell.
tweets_preds.to_csv('result/preds_CNN.csv')