In [1]:
import pandas as pd
import numpy as np

try:
    import gensim
except:
    !pip install gensim
    import gensim
    
try:
    import keras
except:
    !pip install tensorflow
    !pip install keras
    import keras
    
try:
    from sklearn.model_selection import train_test_split
except:
    !pip install sklearn
    from sklearn.model_selection import train_test_split

try:
    import matplotlib.pyplot as plt
except:
    !pip install matplotlib
    import matplotlib.pyplot as plt
    
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
def file_reader(file_location):
    """Load a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    file_location : str
        Path of the file to read. A name ending in ``csv`` is parsed as
        comma-separated with its first column used as the index; a name
        ending in ``tsv`` is parsed as tab-separated with the default index.
        The suffix check is case-insensitive.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    ValueError
        If the name ends in neither ``csv`` nor ``tsv``. Failing here keeps
        an unsupported file from silently producing ``None`` and breaking
        the caller later with an unrelated error.
    """
    suffix = file_location.lower()
    if suffix.endswith('csv'):
        return pd.read_csv(file_location, engine='python', index_col=0)
    if suffix.endswith('tsv'):
        return pd.read_csv(file_location, engine='python', sep='\t')
    raise ValueError(
        "Unsupported file type (expected a csv or tsv file): %r" % (file_location,)
    )

In [3]:
def read_dataset( file_location ):
    """Read a dataset file and add a whitespace-tokenised ``TOKENS`` column.

    Parameters
    ----------
    file_location : str
        Path handed to ``file_reader``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with an extra ``TOKENS`` column; each entry is the
        result of ``str.split()`` on the ``TEXT`` value of the same row.
    """
    df = file_reader(file_location)
    # Iterate over the values positionally. Looking rows up with
    # df['TEXT'][i] is label-based and breaks whenever the index (taken from
    # the file's first column for csv input) is not exactly 0..n-1.
    df['TOKENS'] = [text.split() for text in df['TEXT']]
    return df

In [4]:
def tokens_to_sequence( tokenizer , texts , max_length ):
    """Turn texts into fixed-length integer sequences.

    ``tokenizer.texts_to_sequences`` converts ``texts`` to integer sequences,
    which are then passed to ``pad_sequences`` with ``maxlen=max_length`` and
    ``padding='post'``. The value returned by ``pad_sequences`` is returned
    unchanged.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences( texts ),
        maxlen=max_length,
        padding='post',
    )
    
def tokenize_dataset(df, max_length):
    """Fit a Keras ``Tokenizer`` on the ``TEXT`` column of ``df``.

    The tokenizer's ``num_words`` is the number of distinct entries found
    across all lists in the ``TOKENS`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``TOKENS`` column (iterables of words) and a ``TEXT``
        column (raw strings).
    max_length : int
        Accepted but not used by this function.

    Returns
    -------
    tuple
        ``(tokenizer, num_words)``.
    """
    vocabulary = {word for tokens in df['TOKENS'] for word in tokens}
    num_words = len(vocabulary)
    fitted = Tokenizer(
        num_words=num_words,
        lower=True,
        char_level=False,
        oov_token="<OOV>",
    )
    fitted.fit_on_texts(df['TEXT'].tolist())
    return fitted, num_words

def get_embeddings(word_index, word2vec_path=None):
    """Build an embedding matrix whose rows follow a word -> index mapping.

    Parameters
    ----------
    word_index : dict
        Maps each word to its integer row in the returned matrix.
        Assumes indices lie in ``1..len(word_index)`` (row 0 is never
        written) -- TODO confirm against the tokenizer that produces it.
    word2vec_path : str, optional
        Location of a binary word2vec file. Defaults to
        ``0_additional_files/GoogleNews-vectors-negative300.bin``, built with
        the platform's path separator.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, vector_size)``. Rows of words
        missing from the word2vec vocabulary stay all-zero.
    """
    import os

    if word2vec_path is None:
        # os.path.join avoids hard-coding a Windows-only backslash separator.
        word2vec_path = os.path.join(
            '0_additional_files', 'GoogleNews-vectors-negative300.bin'
        )
    word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    # Take the width from the loaded vectors rather than from a notebook
    # global, so the matrix always matches the model being used.
    embeddings = np.zeros((len(word_index) + 1, word2vec.vector_size))
    for word, index in word_index.items():
        if word in word2vec:
            embeddings[index, :] = word2vec[word]
    return embeddings

def embedded_values(data, embeddings):
    """Average the embedding rows referenced by each index sequence.

    Parameters
    ----------
    data : iterable of iterables of int
        One sequence of row indices per sample.
    embeddings : array-like, 2-D
        Embedding matrix; row ``w`` is the vector for index ``w``.

    Returns
    -------
    numpy.ndarray
        One mean vector per sequence. Indices ``>= len(embeddings)`` are
        skipped; a sequence with no usable index yields a zero vector of the
        same width as the embedding matrix.
    """
    embeddings = np.asarray(embeddings)
    vocab_size, dim = embeddings.shape
    # Fallback sized from the matrix itself (not a fixed 300) so the output
    # stays rectangular for any embedding width.
    fallback = np.zeros(dim)

    emb = []
    for value in data:
        # NOTE(review): every index below vocab_size is averaged in, which
        # presumably includes padding positions -- confirm this is intended.
        rows = [embeddings[w] for w in value if w < vocab_size]
        emb.append(np.mean(rows, axis=0) if rows else fallback)
    return np.array(emb)

In [5]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import LeaveOneOut,KFold
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [6]:
# --- Run configuration -------------------------------------------------------

# Length every token sequence is padded to, and the word-embedding width.
max_length = 300
embedding_dim = 300

# Directory prefix of the preprocessed input files.
inp = '3_data_preprocessed/'

# Label columns evaluated one at a time.
trait_names = ['cEXT' , 'cNEU' , 'cAGR' , 'cCON' , 'cOPN']

# Estimators to cross-validate. Other candidates are left disabled:
#   GaussianNB(priors=[0.5,0.5])
#   LogisticRegression()
#   RandomForestClassifier(max_depth=2, random_state=0)
sklearn_models = [
    svm.SVC(kernel='rbf', gamma = 1.0 , C = 10),
]

In [7]:
# Load the essays, fit the tokenizer on their text, then build the embedding
# matrix for the tokenizer's vocabulary.
# num_words comes straight from tokenize_dataset; computing it separately
# beforehand only repeated the same pass over every token.
df_essay   = read_dataset( inp + 'essays.csv' )
tokenizer, num_words = tokenize_dataset(df_essay, max_length)
embeddings = get_embeddings(tokenizer.word_index)

In [8]:
# Encode each essay's text as a padded sequence of tokenizer indices.
essay_sequences  = tokens_to_sequence(tokenizer, df_essay['TEXT'], max_length)
# Feature matrix: one vector per essay, the mean of the embedding rows its
# sequence refers to.
X = embedded_values(essay_sequences, embeddings)

In [9]:
# Raw cross_validate results, keyed as csval[model class name][trait].
csval = {}

# The splitter does not depend on the model or the trait, so build it once.
cv = LeaveOneOut()

for model in sklearn_models:

    model_name = type(model).__name__
    csval[model_name] = {}
    print(model_name)

    for trait in trait_names:
        # No newline: the accuracy summary printed after the run ends this line.
        print('-----', trait, end = "  : ")

        y      = df_essay[trait]
        scores = cross_validate(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1,return_train_score=True, verbose=10)
        csval[model_name][trait] = scores

        # cross_validate returns a dict of per-split arrays; summarise the
        # held-out scores as mean (standard deviation).
        test_scores = scores['test_score']
        print('    Accuracy: %.3f (%.3f)' % (np.mean(test_scores), np.std(test_scores)))

SVC
----- cEXT  : 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  5

----- cNEU  : 

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.8min
[Paralle

----- cAGR  : 

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.9min
[Paralle

----- cCON  : 

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  6.0min
[Paralle

----- cOPN  : 

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.7min
[Paralle

In [10]:
# Mean train/test accuracy per model and trait, taken over all CV splits:
# accuracies[model class name][trait] -> {'train_score': ..., 'test_score': ...}
accuracies = {}
for model in sklearn_models:
    model_name = type(model).__name__
    print(model_name)
    accuracies[model_name] = {
        trait: {
            split: np.mean( csval[model_name][trait][split] )
            for split in ('train_score', 'test_score')
        }
        for trait in trait_names
    }
        

SVC


In [11]:
import json
import os

# Create the output directory first so the summary is not lost to a
# FileNotFoundError after the long cross-validation run.
os.makedirs("Cross Validation", exist_ok=True)

# Persist the mean accuracies as JSON.
with open("Cross Validation/acc_s.json", 'w') as fp:
    json.dump(accuracies, fp)

In [12]:
import os
import pickle

# Create the output directory first so the raw results are not lost to a
# FileNotFoundError after the long cross-validation run.
os.makedirs("Cross Validation", exist_ok=True)

# Persist the full per-split cross-validation results.
# NOTE: only load this file back from a trusted location; unpickling
# untrusted data can execute arbitrary code.
with open('Cross Validation/data_s', 'wb') as fp:
    pickle.dump(csval, fp, protocol=pickle.HIGHEST_PROTOCOL)