<a href="https://colab.research.google.com/github/banooZahra/essay/blob/main/shenas_capsNet_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Set up

### 1.1. Import Libraries

In [None]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk
nltk.download("all")
import matplotlib.pyplot as plt
import torch

%matplotlib inline

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

### 1.2. Download Datasets

In [None]:
import numpy as np
import pandas as pd
import re
import csv
import math
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 

# Lower-cased MBTI type labels; `lemmatize` strips these substrings from the text.
types = [label.lower() for label in (
    'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
    'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ')]
lemmatizer = WordNetLemmatizer()
# English stop words with apostrophes removed (clean_text strips punctuation
# before the stop-word filter runs), plus the extra word "type".
stop_words = [entry.replace("'", '') for entry in stopwords.words("english")] + ["type"]
print(stop_words)

def lemmatize(text):
  """Strip MBTI labels, drop stop words and lemmatize the remaining words.

  Every entry of the module-level `types` list is removed wherever it occurs
  as a substring. The text is then split on single spaces; words found in
  `stop_words` are skipped and the rest go through `lemmatizer.lemmatize`.

  Returns:
      str: the surviving lemmas joined by single spaces.
  """
  for mbti_label in types:
    text = text.replace(mbti_label, '')
  kept_lemmas = []
  for token in text.split(' '):
    if token in stop_words:
      continue
    kept_lemmas.append(lemmatizer.lemmatize(token))
  return ' '.join(kept_lemmas)


def clean_text(text):
  """Lower-case the text and drop pipes, URL-like tokens and punctuation.

  Steps, in order: replace every '|' with a space, split on whitespace,
  lower-case each token, discard tokens containing "http", delete all
  `string.punctuation` characters and collapse runs of spaces.

  Returns:
      str: the cleaned text. A non-empty result ends with one trailing space
      because each token is emitted with a space appended.
  """
  text = re.sub('[%s]' % re.escape('|'), " ", text)
  spaced_tokens = []
  for raw_token in str(text).split():
    candidate = raw_token.lower() + " "
    if "http" in candidate:
      continue
    spaced_tokens.append(candidate)
  joined = " ".join(spaced_tokens)
  without_punct = joined.translate(str.maketrans('', '', string.punctuation))
  return re.sub(' +', ' ', without_punct)


def preprocess_text(sentence):
    """Clean, lemmatize and normalise one document.

    Runs `clean_text` then `lemmatize`, and afterwards applies a fixed sequence
    of regex substitutions (each match is replaced by a single space).

    Returns:
        str: the normalised text.
    """
    sentence = lemmatize(clean_text(sentence))

    # Disabled alternatives kept from the original for reference:
    #   re.sub('[^a-zA-Z]', ' ', sentence)          # drop punctuation and numbers
    #   re.sub('[^a-zA-Z.?!,]', ' ', sentence)
    #   re.sub(r"\s+[a-zA-HJ-Z]\s+", ' ', sentence) # single characters except I
    substitutions = (
        # Words containing the same character three times in a row.
        '\\S*(\\S)\\1\\1\\S*\\s?',
        # Hyperlinks.
        r'http\S+',
        # Runs of whitespace.
        r'\s+',
        # Literal "|||" separators.
        r'\|\|\|',
    )
    for pattern in substitutions:
        sentence = re.sub(pattern, ' ', sentence)

    return sentence


def load_essays_df(datafile):
    """Load the essays CSV into a DataFrame of preprocessed text and labels.

    The first CSV row is treated as a header and skipped. Column 0 is the user
    id, column 1 the essay text (passed through `preprocess_text`) and columns
    2-6 the EXT/NEU/AGR/CON/OPN flags, where 'y' (any case) becomes 1 and
    anything else 0. `token_len` is initialised to 0 for the caller to fill in.

    Rows are collected in a list and the frame is built once: growing a frame
    with `DataFrame.append` inside the loop is quadratic, and that method no
    longer exists in pandas 2.0.

    Args:
        datafile (str): path to the essays CSV file.

    Returns:
        pd.DataFrame: columns user, text, token_len, EXT, NEU, AGR, CON, OPN.
    """
    columns = ["user", "text", "token_len", "EXT", "NEU", "AGR", "CON", "OPN"]
    traits = columns[3:]
    records = []

    with open(datafile, "rt") as csvf:
        csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
        next(csvreader, None)  # skip the header row (no-op on an empty file)
        for line in csvreader:
            record = {"user": line[0],
                      "text": preprocess_text(line[1]),
                      "token_len": 0}
            # Trait flags sit in CSV columns 2..6, in the same order as `traits`.
            for column_no, trait in enumerate(traits, start=2):
                record[trait] = 1 if line[column_no].lower() == 'y' else 0
            records.append(record)

    # Passing `columns` keeps the schema even when the file has no data rows.
    df = pd.DataFrame(records, columns=columns)

    for trait in traits:
        print(trait + ' : ', df[trait].value_counts())

    return df


def essays_embeddings(datafile, tokenizer, token_length, mode):
    """Tokenize the essays dataset and encode each essay as token ids.

    Essays are sorted by ascending token count (ties broken by user) and the
    resulting user order is written to `author_id_order.csv`. The three
    returned sequences are aligned: element k of each one refers to the k-th
    essay in that sorted order.

    Args:
        datafile (str): path to the essays CSV (see `load_essays_df`).
        tokenizer: object exposing `tokenize(text)` and `encode(tokens, ...)`;
            presumably a Hugging Face tokenizer — verify against the caller.
        token_length (int): `max_length` passed to `tokenizer.encode`.
        mode (str): one of 'normal', '512_head', '512_tail', '256_head_tail',
            'docbert'.

    Returns:
        tuple: (author_ids, input_ids, targets) where author_ids is a NumPy
        array of the frame's row labels, input_ids a list of encodings and
        targets a list of [EXT, NEU, AGR, CON, OPN] label lists.

    Raises:
        ValueError: if `mode` is not one of the supported values.
    """
    supported_modes = ('normal', '512_head', '512_tail', '256_head_tail', 'docbert')
    if mode not in supported_modes:
        # Previously an unknown mode appended nothing and then failed with an
        # IndexError on `input_ids[-1]`.
        raise ValueError(f"Unsupported mode {mode!r}; expected one of {supported_modes}")

    targets = []
    input_ids = []

    df = load_essays_df(datafile)

    # Record each essay's token count so the essays can be sorted by length.
    for ind in df.index:
        df.at[ind, 'token_len'] = len(tokenizer.tokenize(df['text'][ind]))

    df = df.sort_values(by=['token_len', 'user'], ascending=True)
    df['user'].to_csv('/content/personality-prediction/data/essays/author_id_order.csv', index_label='order')
    print(df['token_len'].mean())

    # Walk the frame in its sorted order. Sorting keeps the original row labels,
    # so indexing with range(len(df)) would read rows in file order and leave
    # input_ids/targets misaligned with author_ids (taken from df.index below).
    for cnt, ind in enumerate(df.index):
        # NOTE(review): the text was already preprocessed in load_essays_df;
        # the second pass is kept from the original.
        text = preprocess_text(df.at[ind, 'text'])
        tokens = tokenizer.tokenize(text)

        if mode == 'normal' or mode == '512_head':
            input_ids.append(
                tokenizer.encode(tokens, add_special_tokens=True, max_length=token_length, pad_to_max_length=True))
        elif mode == '512_tail':
            input_ids.append(
                tokenizer.encode(tokens[-(token_length - 2):], add_special_tokens=True, max_length=token_length,
                                 pad_to_max_length=True))
        elif mode == '256_head_tail':
            # NOTE(review): head + tail can hold up to 2 * (token_length - 1)
            # tokens while max_length is token_length — confirm this is intended.
            input_ids.append(
                tokenizer.encode(tokens[:(token_length - 1)] + tokens[-(token_length - 1):], add_special_tokens=True,
                                 max_length=token_length, pad_to_max_length=True))
        else:  # mode == 'docbert'
            docmax_len = 2048
            subdoc_len = 512
            max_subdoc_num = docmax_len // subdoc_len
            # Split the document into consecutive chunks, keeping at most max_subdoc_num.
            subdoc_tokens = [tokens[i:i + subdoc_len] for i in range(0, len(tokens), subdoc_len)][:max_subdoc_num]
            token_ids = [tokenizer.encode(x, add_special_tokens=True, max_length=token_length, pad_to_max_length=True)
                         for x in subdoc_tokens]
            token_ids = np.array(token_ids).astype(int)

            # Pad with all-zero rows so every document has max_subdoc_num chunks.
            buffer_len = max_subdoc_num - token_ids.shape[0]
            tmp = np.full(shape=(buffer_len, token_length), fill_value=0, dtype=int)
            token_ids = np.concatenate((token_ids, tmp), axis=0)

            input_ids.append(token_ids)

        if cnt < 3:
            print(input_ids[-1])

        targets.append([df.at[ind, trait] for trait in ('EXT', 'NEU', 'AGR', 'CON', 'OPN')])

    author_ids = np.array(df.index)
    print('loaded all input_ids and targets from the data file!')
    return author_ids, input_ids, targets


def load_Kaggle_df(datafile):
    """Load the Kaggle MBTI CSV into a DataFrame of text and binary labels.

    The first CSV row is treated as a header and skipped. Column 0 holds the
    four-letter type, column 1 the text and column 3 the user id. Each letter
    of the type becomes one binary column (E, N, F, J).

    Rows are collected in a list and the frame is built once: growing a frame
    with `DataFrame.append` inside the loop is quadratic, and that method no
    longer exists in pandas 2.0.

    Args:
        datafile (str): path to the UTF-8 encoded CSV file.

    Returns:
        pd.DataFrame: columns user, text, E, N, F, J.
    """
    columns = ["user", "text", "E", "N", "F", "J"]
    records = []

    with open(datafile, "rt", encoding='utf-8') as csvf:
        csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
        next(csvreader, None)  # skip the header row (no-op on an empty file)
        for line in csvreader:
            text = line[1]
            # User 3559 gets placeholder text; per the literal below its own
            # text has size zero.
            if int(line[3]) == 3559:
                text = "input text with size zero so this is literally fake text"
            mbti = line[0]
            records.append({"user": line[3],
                            "text": preprocess_text(text),
                            "E": 1 if mbti[0] == 'E' else 0,
                            "N": 1 if mbti[1] == 'N' else 0,
                            "F": 1 if mbti[2] == 'F' else 0,
                            "J": 1 if mbti[3] == 'J' else 0})

    # Passing `columns` keeps the schema even when the file has no data rows.
    df = pd.DataFrame(records, columns=columns)

    for trait in columns[2:]:
        print(trait + ' : ', df[trait].value_counts())

    return df


def kaggle_embeddings(datafile, tokenizer, token_length):
    """Encode every Kaggle document as a padded sequence of token ids.

    Args:
        datafile (str): CSV path handed to `load_Kaggle_df`.
        tokenizer: object exposing `tokenize(text)` and `encode(tokens, ...)`.
        token_length (int): `max_length` passed to `tokenizer.encode`.

    Returns:
        tuple: (author_ids, input_ids, targets) — a NumPy array of integer user
        ids, a list of encodings and a list of [E, N, F, J] label lists, all
        in the frame's row order.
    """
    frame = load_Kaggle_df(datafile)
    input_ids, targets, author_ids, token_len = [], [], [], []

    for position, row_label in enumerate(frame.index):
        pieces = tokenizer.tokenize(preprocess_text(frame['text'][row_label]))
        token_len.append(len(pieces))
        encoded = tokenizer.encode(pieces, add_special_tokens=True, max_length=token_length,
                                   pad_to_max_length=True)
        # Show the leading tokens of the first ten documents as a sanity check.
        if position < 10:
            print(pieces[:10])

        input_ids.append(encoded)
        targets.append([frame[trait][row_label] for trait in ('E', 'N', 'F', 'J')])
        author_ids.append(int(frame['user'][row_label]))

    print('average length : ', int(np.mean(token_len)))

    return np.array(author_ids), input_ids, targets


def load_persian_df(datafile):
    """Load the Persian MBTI CSV into a DataFrame of raw text and binary labels.

    The first CSV row is treated as a header and skipped. Column 0 holds the
    four-letter type and column 1 the text, which is stored unmodified. Users
    are numbered 1, 2, ... in file order.

    Rows are collected in a list and the frame is built once: growing a frame
    with `DataFrame.append` inside the loop is quadratic, and that method no
    longer exists in pandas 2.0.

    Args:
        datafile (str): path to the UTF-8 encoded CSV file.

    Returns:
        pd.DataFrame: columns user, text, E, N, F, J.
    """
    columns = ["user", "text", "E", "N", "F", "J"]
    records = []

    with open(datafile, "rt", encoding='utf-8') as csvf:
        csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
        next(csvreader, None)  # skip the header row (no-op on an empty file)
        for user_id, line in enumerate(csvreader, start=1):
            mbti = line[0]
            records.append({"user": user_id,
                            "text": line[1],
                            "E": 1 if mbti[0] == 'E' else 0,
                            "N": 1 if mbti[1] == 'N' else 0,
                            "F": 1 if mbti[2] == 'F' else 0,
                            "J": 1 if mbti[3] == 'J' else 0})

    # Passing `columns` keeps the schema even when the file has no data rows.
    df = pd.DataFrame(records, columns=columns)

    for trait in columns[2:]:
        print(trait + ' : ', df[trait].value_counts())

    return df


def persian_embeddings(datafile, tokenizer, token_length):
    """Encode every Persian document as a padded sequence of token ids.

    Args:
        datafile (str): CSV path handed to `load_persian_df`.
        tokenizer: object exposing `tokenize(text)` and `encode(tokens, ...)`.
        token_length (int): `max_length` passed to `tokenizer.encode`.

    Returns:
        tuple: (author_ids, input_ids, targets) — a NumPy array of integer user
        ids, a list of encodings and a list of [E, N, F, J] label lists, all
        in the frame's row order.
    """
    frame = load_persian_df(datafile)
    input_ids, targets, author_ids, token_len = [], [], [], []

    for position, row_label in enumerate(frame.index):
        # NOTE(review): preprocess_text uses English stop words and the WordNet
        # lemmatizer — confirm that is intended for Persian text.
        pieces = tokenizer.tokenize(preprocess_text(frame['text'][row_label]))
        token_len.append(len(pieces))
        encoded = tokenizer.encode(pieces, add_special_tokens=True, max_length=token_length,
                                   pad_to_max_length=True)
        # Show the leading tokens of the first ten documents as a sanity check.
        if position < 10:
            print(pieces[:10])

        input_ids.append(encoded)
        targets.append([frame[trait][row_label] for trait in ('E', 'N', 'F', 'J')])
        author_ids.append(int(frame['user'][row_label]))

    print('average length : ', int(np.mean(token_len)))

    return np.array(author_ids), input_ids, targets

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import numpy as np
import pandas as pd
import re
from scipy.io import arff

def read_and_process(path):
    """Parse an ARFF-style feature file into a numeric DataFrame indexed by id.

    Layout expected by the parser: two leading lines that are skipped, then
    consecutive `@attribute <name...>` lines (ended by a blank or any other
    line), then data rows. A data row is any later line with at least 10
    comma-separated fields. Its first field is a quoted name ending in
    `id_<number>`; that number becomes the integer index `idx`.

    The file is opened with a context manager so the handle is always closed,
    and the locals no longer shadow the imported `arff` module or the builtin
    `type`.

    Args:
        path (str): path to the ARFF file.

    Returns:
        pd.DataFrame: one column per attribute after the first, converted with
        `pd.to_numeric(errors='coerce')` and sorted by `idx`.
    """
    attributes = []
    values = []

    with open(path, 'r') as handle:
        # The first two lines are skipped without being inspected.
        handle.readline()
        handle.readline()

        # Header section: collect attribute declarations until one is missing.
        while True:
            fields = handle.readline().split()
            if not fields or fields[0] != "@attribute":
                break
            attributes.append(' '.join(fields[1:]))

        # Data section: lines with fewer than 10 fields are ignored.
        for line in handle:
            components = line.split(",")
            if len(components) < 10:
                continue
            # Strip quotes and keep the part after the last double backslash.
            components[0] = components[0].replace("\'", "").split("\\\\")[-1]
            values.append(components)

    df = pd.DataFrame(columns=attributes, data=values)
    df['idx'] = [int(re.sub('id_', '', i)) for i in df[df.columns[0]]]
    df = df.drop(df.columns[0], axis=1)
    df = df.set_index(['idx'])
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.sort_index()
    return df


def load_features(dir, dataset):
    """Load the psycholinguistic feature tables for one dataset.

    Files are located by concatenating `dir`, `dataset` and a fixed suffix.
    For 'kaggle' the Mairesse features come from an ARFF file (with label and
    frequency columns dropped) and tables are indexed by 'id'. For 'essays'
    they come from a CSV indexed by its first column, and the other tables are
    indexed by '#AUTHID'.

    Args:
        dir (str): directory prefix, including the trailing separator.
        dataset (str): 'kaggle' or 'essays'.

    Returns:
        list: [nrc, nrc_vad, readability, mairesse] DataFrames.
    """
    prefix = dir + dataset
    idx = '#AUTHID' if dataset == 'essays' else 'id'

    if dataset == 'kaggle':
        drop_cols = ['BROWN-FREQ numeric', 'K-F-FREQ numeric', 'K-F-NCATS numeric', 'K-F-NSAMP numeric',
                     'T-L-FREQ numeric', 'Extraversion numeric',
                     '\'Emotional stability\' numeric', 'Agreeableness numeric', 'Conscientiousness numeric',
                     '\'Openness to experience\' numeric']
        mairesse = read_and_process(prefix + '_mairesse_labeled.arff').drop(drop_cols, axis=1)
    elif dataset == 'essays':
        mairesse_raw = pd.read_csv(prefix + '_mairesse_labeled.csv')
        mairesse = mairesse_raw.set_index(mairesse_raw.columns[0])

    # The affectivespace and hourglass feature sets are disabled in the original.
    nrc, nrc_vad, readability = (
        pd.read_csv(prefix + suffix).set_index([idx])
        for suffix in ('_nrc.csv', '_nrc_vad.csv', '_readability.csv')
    )

    return [nrc, nrc_vad, readability, mairesse]


def get_psycholinguist_data(dump_data, dataset, feature_flags):
    """Join the enabled psycholinguistic feature tables with the labels.

    Args:
        dump_data (pd.DataFrame): frame holding 'user', 'text' and the trait
            columns of `dataset` (as built by load_essays_df / load_Kaggle_df).
        dataset (str): 'essays' or 'kaggle'.
        feature_flags (sequence of bool): one flag per table returned by
            `load_features`, in the order [nrc, nrc_vad, readability, mairesse].

    Returns:
        pd.DataFrame: inner join (on index) of the enabled tables and the
        labels, with missing values filled with 0.

    Raises:
        ValueError: if `dataset` is unsupported or no feature flag is enabled.
            Both cases previously ended in an UnboundLocalError.
    """
    label_columns = {
        'essays': ['user', 'text', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        'kaggle': ['user', 'text', 'E', 'N', 'F', 'J'],
    }
    if dataset not in label_columns:
        raise ValueError(f"Unsupported dataset {dataset!r}; expected 'essays' or 'kaggle'")

    features = load_features('/content/personality-prediction/data/' + dataset + '/psycholinguist_features/', dataset)

    enabled = [feature for feature, feature_flag in zip(features, feature_flags) if feature_flag]
    if not enabled:
        raise ValueError("At least one entry of feature_flags must be true")

    df = enabled[0]
    for feature in enabled[1:]:
        df = pd.merge(df, feature, left_index=True, right_index=True)

    labels = dump_data[label_columns[dataset]].set_index('user')
    if dataset == 'kaggle':
        # Convert both indexes to numeric so the merge below matches on equal ids.
        labels.index = pd.to_numeric(labels.index, errors='coerce')
        df.index = pd.to_numeric(df.index, errors='coerce')
    merged = pd.merge(df, labels, left_index=True, right_index=True).fillna(0)

    return merged

In [None]:
# Fetch the personality-prediction repository. The loaders in this notebook read
# and write data files under /content/personality-prediction/data/.
!git clone https://github.com/yashsmehta/personality-prediction.git

Cloning into 'personality-prediction'...
remote: Enumerating objects: 839, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 839 (delta 6), reused 2 (delta 0), pack-reused 826[K
Receiving objects: 100% (839/839), 53.41 MiB | 16.09 MiB/s, done.
Resolving deltas: 100% (499/499), done.
Updating files: 100% (56/56), done.


In [None]:
import tensorflow as tf
import numpy as np
import csv
import re
import pickle
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset


def get_inputs(op_dir, dataset):
    """Read cached embeddings from a pkl file and join them with other features.

    The pickle at `op_dir + dataset + '.pkl'` is expected to hold an iterable
    of (orders, data_x, data_y) mini-batches. Each `data_x` entry is reduced
    over its first axis with the weights in `alphaW`, which here select a
    single layer (`layer`, 1-based) out of `n_hl`.

    Reads the notebook-global `feature_flags` when `dataset` is 'essays' or
    'kaggle'.

    Args:
        op_dir (str): directory prefix, including the trailing separator.
        dataset (str): 'essays', 'kaggle' or 'test'.

    Returns:
        tuple: (trait_labels, frame). For 'test' the frame holds only the
        embeddings indexed by 'order'; otherwise it is the output of
        `merge_features`.

    Raises:
        ValueError: if `dataset` is not one of the supported names (this
            previously ended in an UnboundLocalError).
    """
    # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
    # The context manager closes the file even when unpickling fails.
    with open(op_dir + dataset + '.pkl', 'rb') as file:
        data = pickle.load(file)
    orders, data_x, data_y = list(zip(*data))

    layer = 11
    n_hl = 12
    # alphaW decides which BERT layer embedding is used: uniform weights for
    # 'all', otherwise a one-hot vector selecting `layer`.
    if (layer == 'all'):
        alphaW = np.full([n_hl], 1 / n_hl)

    else:
        alphaW = np.zeros([n_hl])
        alphaW[int(layer) - 1] = 1

    # Flatten the tuples of mini-batches into per-sample lists, keeping only the
    # weighted layer output. The pickled targets are not needed: labels come
    # from the dataset CSV below.
    inputs = []
    author_ids = []

    n_batches = len(data_y)
    print(len(orders))

    for ii in range(n_batches):
        inputs.extend(np.einsum('k,kij->ij', alphaW, data_x[ii]))
        author_ids.extend(orders[ii])

    inputs_array = np.array(inputs)
    print('inputs shape: ', inputs_array.shape)
    print('author_ids shape: ', np.array(author_ids).shape)

    inputs = pd.DataFrame(inputs_array)
    inputs['order'] = author_ids
    inputs = inputs.set_index(['order'])

    if dataset == 'test':
        trait_labels = ['E', 'N', 'F', 'J']
        return trait_labels, inputs

    elif dataset == 'essays':
        dump_data = load_essays_df('/content/personality-prediction/data/essays/essays.csv')
        trait_labels = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']

    elif dataset == 'kaggle':
        dump_data = load_Kaggle_df('/content/personality-prediction/data/kaggle/kaggle.csv')
        trait_labels = ['E', 'N', 'F', 'J']

    else:
        raise ValueError(f"Unsupported dataset {dataset!r}; expected 'essays', 'kaggle' or 'test'")

    data_other_features_df = get_psycholinguist_data(dump_data, dataset, feature_flags)
    data_features_df = merge_features(inputs, data_other_features_df, trait_labels)

    return trait_labels, data_features_df


def merge_features(embedding, other_features, trait_labels, dataset=None):
    """Merge BERT embeddings with the psycholinguistic features.

    Args:
        embedding (pd.DataFrame): embeddings indexed by 'order'.
        other_features (pd.DataFrame): features and labels indexed by user id.
        trait_labels (list): accepted for interface compatibility; not used.
        dataset (str, optional): dataset name. When omitted, the module-level
            variable `dataset` is used, which is what this function always did
            before the parameter existed.

    Returns:
        pd.DataFrame: inner join of both inputs on their index. For 'essays'
        the embedding index is first translated from 'order' to 'user' via
        author_id_order.csv.

    Raises:
        NameError: if `dataset` is omitted and no module-level `dataset` exists.
    """
    if dataset is None:
        # Backward compatibility: fall back to the notebook-global `dataset`.
        try:
            dataset = globals()['dataset']
        except KeyError:
            raise NameError("name 'dataset' is not defined") from None

    if dataset == 'essays':
        orders = pd.read_csv('/content/personality-prediction/data/essays/author_id_order.csv').set_index(['order'])
        df = pd.merge(embedding, orders, left_index=True, right_index=True).set_index(['user'])
    else:
        df = embedding
    df = pd.merge(df, other_features, left_index=True, right_index=True)

    return df

In [None]:
# Notebook-level configuration. `dataset` and `feature_flags` are read as globals
# by merge_features and get_inputs respectively, so they must be set before
# get_inputs is called.
dataset = "kaggle"
# get_inputs reads the cached embeddings from op_dir + dataset + '.pkl'.
op_dir = '/content/drive/MyDrive/personality_detection_XLM-RoBERTa/document_xlm-roberta_features/'
n_classes = 2
features_dim = 123
# Seed NumPy and TensorFlow for reproducibility.
seed = 789
np.random.seed(seed)
tf.random.set_seed(seed)

# One switch per feature table, in the order load_features returns them:
# [nrc, nrc_vad, readability, mairesse].
nrc, nrc_vad, readability, mairesse = [True, True, True, True]
feature_flags = [nrc, nrc_vad, readability, mairesse]
kaggle_trait_labels, kaggle_data_features_df = get_inputs(op_dir, dataset)

FileNotFoundError: ignored

### concat with essays dataset

In [None]:
dataset = "essays"
essays_trait_labels, essays_data_features_df = get_inputs(op_dir, dataset)
# Copy four of the essays trait columns into the E/N/F/J column names used by
# the Kaggle frame: EXT -> E, OPN -> N, AGR -> F, CON -> J. NEU has no
# counterpart and is dropped.
essays_data_features_df['E'] = essays_data_features_df['EXT']
essays_data_features_df['N'] = essays_data_features_df['OPN']
essays_data_features_df['F'] = essays_data_features_df['AGR']
essays_data_features_df['J'] = essays_data_features_df['CON']
# `columns=` replaces the positional axis argument (`drop('EXT', 1)`), which
# pandas 2.0 no longer accepts.
essays_data_features_df = essays_data_features_df.drop(columns=['EXT', 'AGR', 'CON', 'OPN', 'NEU'])
essays_data_features_df

In [None]:
kaggle_translate_df = pd.read_csv('/content/drive/MyDrive/personality_detection_XLM-RoBERTa/test/dataset/kaggle_translate.csv')
essays_translate_df = pd.read_csv('/content/drive/MyDrive/personality_detection_XLM-RoBERTa/test/dataset/essays_translate.csv')
# Overwrite the text with its translation, row by position. Each assignment is a
# single .iloc/.loc call on the frame itself: chained indexing such as
# df['text'][a:b] = ... may write to a temporary copy and leave the frame
# unchanged. `.to_numpy()` keeps the assignment positional (no index alignment).
kaggle_text_col = kaggle_data_features_df.columns.get_loc('text')
essays_text_col = essays_data_features_df.columns.get_loc('text')
kaggle_data_features_df.iloc[0:8675, kaggle_text_col] = kaggle_translate_df['translate'].to_numpy()
# Placeholder text for the row labelled 3559.
# NOTE(review): .loc adds a new row if that label is absent — confirm it exists.
kaggle_data_features_df.loc[3559, 'text'] = 'الکی'
essays_data_features_df.iloc[0:2467, essays_text_col] = essays_translate_df['translate'].to_numpy()

In [None]:
# Load the held-out essays (indexed by 'order') and remove the rows with the same
# index labels from the training frame.
# NOTE: re-running this cell raises KeyError because those labels are already gone.
test_data_features_df = pd.read_csv('/content/drive/MyDrive/personality_detection_XLM-RoBERTa/test/dataset/essays_translate_test.csv', index_col='order')
essays_data_features_df = essays_data_features_df.drop(test_data_features_df.index)

In [None]:
# first = True
# for e in range(0,2):
#     for n in range(0,2):
#         for f in range(0,2):
#             for j in range(0,2):
#                 res = essays_data_features_df.loc[(essays_data_features_df.E==e) & (essays_data_features_df.N==n) & (essays_data_features_df.F==f) & (essays_data_features_df.J==j)][10:20]
#                 essays_data_features_df = essays_data_features_df.drop(res.index)
#                 if first:
#                     test_data_features_df = res
#                     first = False
#                 else:
#                     test_data_features_df = test_data_features_df.append(res)
# test_data_features_df

In [None]:
# Stack the Kaggle rows on top of the essays rows, keeping each frame's own
# index. pd.concat replaces DataFrame.append, which pandas 2.0 no longer has.
data_features_df = pd.concat([kaggle_data_features_df, essays_data_features_df])
data_features_df

### 1.3. Set up GPU for Training

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

## 2. Data Preparation

### 2.1. Tokenize

In [None]:
from nltk.tokenize import word_tokenize
from collections import defaultdict

def tokenize(texts):
    """Tokenize texts, build the vocabulary and find the longest sentence.

    Args:
        texts (List[str]): List of text data

    Returns:
        tokenized_texts (List[List[str]]): List of list of tokens
        word2idx (Dict): Vocabulary built from the corpus; '<pad>' is 0,
            '<unk>' is 1 and corpus tokens are numbered from 2 in order of
            first appearance
        max_len (int): Maximum sentence length (0 for an empty corpus)
    """
    # Reserved entries come first so corpus tokens start at index 2.
    word2idx = {'<pad>': 0, '<unk>': 1}

    tokenized_texts = [word_tokenize(sent) for sent in texts]

    for tokens in tokenized_texts:
        for token in tokens:
            # The next free index always equals the current vocabulary size.
            word2idx.setdefault(token, len(word2idx))

    max_len = max((len(tokens) for tokens in tokenized_texts), default=0)

    return tokenized_texts, word2idx, max_len

def encode(tokenized_texts, word2idx, max_len):
    """Pad or truncate each sentence to `max_len` and map tokens to vocabulary ids.

    Unlike the previous version this does not modify the token lists passed in,
    and tokens missing from `word2idx` map to the '<unk>' id instead of `None`.
    Sentences longer than `max_len` are truncated so every row has the same
    length.

    Args:
        tokenized_texts (List[List[str]]): List of list of tokens
        word2idx (Dict): Vocabulary; should contain '<pad>' and '<unk>'
        max_len (int): Length of every encoded row

    Returns:
        input_ids (np.array): Array of token indexes in the vocabulary with
            shape (N, max_len). It will be the input of our model.
    """
    pad_id = word2idx.get('<pad>')
    unk_id = word2idx.get('<unk>')

    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Slicing creates a new list, so the caller's tokens stay untouched.
        clipped = tokenized_sent[:max_len]
        input_id = [word2idx.get(token, unk_id) for token in clipped]
        input_id += [pad_id] * (max_len - len(clipped))
        input_ids.append(input_id)

    return np.array(input_ids)

Load Pretrained Vectors

We will load the pretrained vector for each token in our vocabulary. For tokens with no pretrained vector, we will initialize random word vectors with the same length and variance.

In [None]:
# !pip install unrar
# !unrar x "/content/drive/MyDrive/personality_detection_XLM-RoBERTa/fastText/vecmap-glove-fasttext-720000.rar"

In [None]:
def load_pretrained_vectors(word2idx, fname):
    """Load pretrained vectors and build the embedding matrix.

    The file starts with a header line "<count> <dimension>", followed by one
    line per word: the word and then its vector components, separated by
    single spaces. Words of `word2idx` missing from the file keep a random
    vector drawn uniformly from [-0.25, 0.25); '<pad>' starts as all zeros.

    Args:
        word2idx (Dict): Vocabulary built from the corpus; must contain '<pad>'
        fname (str): Path to pretrained vector file

    Returns:
        embeddings (np.array): Embedding matrix with shape (N, d) where N is
            the size of word2idx and d is embedding dimension
    """
    print("Loading pretrained vectors...")
    # The context manager closes the file, which the previous version never did.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())

        # Initialize random embeddings
        embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
        embeddings[word2idx['<pad>']] = np.zeros((d,))

        # Overwrite the rows of every vocabulary word found in the file
        count = 0
        for line in fin:
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            if word in word2idx:
                count += 1
                embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")

    return embeddings

### 2.2. Tokenize (Persian fastText)

In [None]:
!gunzip /content/drive/MyDrive/personality_detection_XLM-RoBERTa/fastText/farsi_fasttext/cc.fa.300.vec.gz
!pip install fasttext

gzip: /content/drive/MyDrive/personality_detection_XLM-RoBERTa/fastText/farsi_fasttext/cc.fa.300.vec.gz: No such file or directory
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 5.2 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.8.1-py2.py3-none-any.whl (208 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3127081 sha256=8a574884801a1f5b6eb62bae89e48d152bbe474a2a249c5e280fc5fa1ab9860a
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.8.1


In [None]:
import io

def load_vectors(word2idx, fname):
    """Load fastText-format vectors for the words of `word2idx`.

    The file starts with a header line "<count> <dimension>", followed by one
    line per word: the word and then its vector components, separated by
    single spaces. Words of `word2idx` missing from the file keep a random
    vector drawn uniformly from [-0.25, 0.25); '<pad>' starts as all zeros.

    Args:
        word2idx (Dict): Vocabulary built from the corpus; must contain '<pad>'
        fname (str): Path to the vector file

    Returns:
        embeddings (np.array): Embedding matrix with shape (N, d) where N is
            the size of word2idx and d is embedding dimension
    """
    # The context manager closes the file, which the previous version never did.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
        embeddings[word2idx['<pad>']] = np.zeros((d,))
        count = 0
        for line in fin:
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            if word in word2idx:
                count += 1
                embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)
    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")
    return embeddings

## 3. Model

### 3.1. Create CapsNet Model

In [None]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F

# Hyper-parameters read as module globals by the layer classes defined below.
USE_CUDA = True
embedding_dim = 300
# When true, Embed_Layer copies the supplied embedding matrix into its weights.
use_pretrained_embedding = True
# Only used to size Caps_Layer's weight when share_weights is False.
BATCH_SIZE = 50
# Hidden size of the GRU; it is bidirectional, so Caps_Layer's default input
# dimension is gru_len * 2.
gru_len = 128
# Defaults of Caps_Layer: routing iterations, number of output capsules and
# the dimension of each capsule.
Routings = 5
Num_capsule = 16
Dim_capsule = 2
# Dropout probability used by Dense_Layer.
dropout_p = 0.25
# Added under the square root in Caps_Layer.squash to avoid division by zero.
T_epsilon = 1e-5
num_classes = 2


class Embed_Layer(nn.Module):
    """Embedding lookup followed by dropout.

    Args:
        embedding_matrix: pretrained weights of shape
            (vocab_size + 1, embedding_dim), as a tensor or NumPy array.
            Required when the module-level `use_pretrained_embedding` is true.
        vocab_size (int): the embedding table holds vocab_size + 1 rows.
        embedding_dim (int): size of each embedding vector.
    """

    def __init__(self, embedding_matrix=None, vocab_size=None, embedding_dim=300):
        super(Embed_Layer, self).__init__()
        self.encoder = nn.Embedding(vocab_size + 1, embedding_dim)
        if use_pretrained_embedding:
            # as_tensor accepts tensors and NumPy arrays alike; copy_ casts the
            # values to the dtype of the embedding weight.
            self.encoder.weight.data.copy_(t.as_tensor(embedding_matrix))

    def forward(self, x, dropout_p=0.25):
        """Return the embeddings of the ids in `x`, with dropout while training.

        The functional form honours `self.training`. Creating a fresh
        `nn.Dropout` here, as before, left dropout active after `model.eval()`
        because a new module always starts in training mode.
        """
        return F.dropout(self.encoder(x), p=dropout_p, training=self.training)


class GRU_Layer(nn.Module):
    """Bidirectional GRU over batch-first sequences of 300-dimensional vectors.

    The hidden size per direction is the module-level `gru_len`.
    """

    def __init__(self):
        super(GRU_Layer, self).__init__()
        # batch_first=True: the layers after this one treat dimension 0 as the
        # batch (Caps_Layer reads batch_size from x.size(0); Capsule_Main
        # concatenates per-sample features along the last dimension). With the
        # default time-major layout the GRU would run its recurrence across
        # the samples of a batch.
        self.gru = nn.GRU(input_size=300,
                          hidden_size=gru_len,
                          bidirectional=True,
                          batch_first=True)

    def init_weights(self):
        """Xavier-init input weights, orthogonal-init recurrent weights, zero biases."""
        for name, param in self.named_parameters():
            if 'weight_ih' in name:
                nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                nn.init.constant_(param.data, 0)

    def forward(self, x):
        """Run the GRU on `x` of shape (batch, seq_len, 300).

        Returns:
            tuple: (output, h_n) as returned by `nn.GRU`; output has shape
            (batch, seq_len, 2 * gru_len).
        """
        return self.gru(x)


class Caps_Layer(nn.Module):
    """Capsule layer with dynamic routing and a shared transformation matrix.

    Args:
        input_dim_capsule (int): size of each input vector (last dim of x).
        num_capsule (int): number of output capsules.
        dim_capsule (int): dimension of each output capsule.
        routings (int): number of routing iterations.
        kernel_size (tuple): stored on the instance; not used by forward.
        share_weights (bool): only True is implemented.
        activation (str): 'default' selects `squash`, anything else ReLU.
    """

    def __init__(self, input_dim_capsule=gru_len * 2, num_capsule=Num_capsule, dim_capsule=Dim_capsule, \
                 routings=Routings, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Caps_Layer, self).__init__(**kwargs)

        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = self.squash
        else:
            self.activation = nn.ReLU(inplace=True)

        if self.share_weights:
            self.W = nn.Parameter(
                nn.init.xavier_normal_(t.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule)))
        else:
            self.W = nn.Parameter(
                t.randn(BATCH_SIZE, input_dim_capsule, self.num_capsule * self.dim_capsule))

    def forward(self, x):
        """Route `x` of shape (batch, input_num_capsule, input_dim_capsule).

        Returns:
            Tensor of shape (batch_size, num_capsule, dim_capsule).

        Raises:
            NotImplementedError: if the layer was built with
                share_weights=False. That path previously printed 'add later'
                and then failed with an UnboundLocalError.
        """
        if not self.share_weights:
            raise NotImplementedError("Caps_Layer.forward only supports share_weights=True")

        u_hat_vecs = t.matmul(x, self.W)

        batch_size = x.size(0)
        input_num_capsule = x.size(1)
        u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule,
                                      self.num_capsule, self.dim_capsule))
        # Reorder to (batch_size, num_capsule, input_num_capsule, dim_capsule)
        u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 3)
        # Routing logits: (batch_size, num_capsule, input_num_capsule)
        b = t.zeros_like(u_hat_vecs[:, :, :, 0])

        for i in range(self.routings):
            # Softmax across the output capsules (dim 1). This equals the
            # original permute -> softmax(dim=2) -> permute sequence.
            c = F.softmax(b, dim=1)
            # Weighted sum over input capsules: (batch_size, num_capsule, dim_capsule)
            outputs = self.activation(t.einsum('bij,bijk->bik', c, u_hat_vecs))
            if i < self.routings - 1:
                # Agreement between each output capsule and each prediction vector
                b = t.einsum('bik,bijk->bij', outputs, u_hat_vecs)
        return outputs  # (batch_size, num_capsule, dim_capsule)

    # text version of squash, slightly different from the original one
    def squash(self, x, axis=-1):
        """Scale `x` to (almost) unit length along `axis`.

        `T_epsilon` is added under the square root so zero vectors do not
        divide by zero.
        """
        s_squared_norm = (x ** 2).sum(axis, keepdim=True)
        scale = t.sqrt(s_squared_norm + T_epsilon)
        return x / scale


class Dense_Layer(nn.Module):
    """Linear projection followed by in-place dropout.

    Args:
        input_dim (int): number of features per sample after flattening.
        output_dim (int): number of output features.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        projection = nn.Linear(input_dim, output_dim)
        regulariser = nn.Dropout(p=dropout_p, inplace=True)
        self.fc = nn.Sequential(projection, regulariser)

    def forward(self, x):
        """Flatten `x` to (batch, -1) and apply the linear layer and dropout."""
        flat = x.view(x.size(0), -1)
        return self.fc(flat)


class Capsule_Main(nn.Module):
    """Two-language capsule classifier.

    English ('eng') and Persian ('fa') inputs each have their own embedding
    layer and their own first dense layer; the GRU, the capsule layer and the
    final 2-way dense layer are shared between both languages.
    """

    def __init__(self, eng_embedding_matrix=None, eng_vocab_size=None, fa_embedding_matrix=None, fa_vocab_size=None,
                 num_features=891):
        """Build all sub-layers.

        Args:
            eng_embedding_matrix: passed to the English ``Embed_Layer``.
            eng_vocab_size: passed to the English ``Embed_Layer``.
            fa_embedding_matrix: passed to the Persian ``Embed_Layer``.
            fa_vocab_size: passed to the Persian ``Embed_Layer``.
            num_features: width of the extra feature vector concatenated to
                the capsule output in ``forward``. Defaults to 891, the value
                that used to be hard-coded.
        """
        super(Capsule_Main, self).__init__()
        self.eng_embed_layer = Embed_Layer(eng_embedding_matrix, eng_vocab_size)
        self.fa_embed_layer = Embed_Layer(fa_embedding_matrix, fa_vocab_size)
        self.gru_layer = GRU_Layer()
        self.gru_layer.init_weights()
        self.caps_layer = Caps_Layer()
        # Flattened capsule output plus the extra per-sample features.
        dense_in = Num_capsule * Dim_capsule + num_features
        self.eng_dense_layer1 = Dense_Layer(dense_in, 16)
        self.fa_dense_layer1 = Dense_Layer(dense_in, 16)
        self.dense_layer2 = Dense_Layer(16, 2)

    def forward(self, content, features, lang='eng'):
        """Compute class scores for a batch.

        Args:
            content: input handed to the language-specific embedding layer.
            features: extra per-sample features, concatenated to the flattened
                capsule output along the last dimension.
            lang: ``'eng'`` or ``'fa'``; selects the embedding layer and the
                first dense layer. Defaults to ``'eng'`` so that the
                two-argument calls ``model(input_ids, features)`` made by
                ``train`` and ``evaluate`` work.
                NOTE(review): confirm 'eng' is the intended default.

        Returns:
            Output of the shared final dense layer (2 scores per sample).

        Raises:
            ValueError: if ``lang`` is neither ``'eng'`` nor ``'fa'``.
        """
        if lang == 'eng':
            embed_layer, dense_layer1 = self.eng_embed_layer, self.eng_dense_layer1
        elif lang == 'fa':
            embed_layer, dense_layer1 = self.fa_embed_layer, self.fa_dense_layer1
        else:
            # Previously an unknown language fell through both branches and
            # failed later on an unassigned local variable.
            raise ValueError(f"Unsupported lang {lang!r}; expected 'eng' or 'fa'")

        content1 = embed_layer(content)
        # The GRU layer returns a tuple: the per-step output
        # (batch_size, seq_len, num_directions * hidden_size) and the final
        # hidden state hn, which is not used here.
        content2, _ = self.gru_layer(content1)
        content3 = self.caps_layer(content2)
        content4 = torch.flatten(content3, start_dim=1)
        content4 = torch.cat([content4, features], dim=-1)
        content5 = F.relu(dense_layer1(content4))
        return self.dense_layer2(content5)

### 3.2. Optimizer

In [None]:
import torch.optim as optim

def initilize_model(learning_rate=0.2e-4):
    """Create a fresh ``Capsule_Main`` network and its Adam optimizer.

    The embeddings and the target device are read from the notebook globals
    ``eng_embeddings``, ``fa_embeddings`` and ``device``, so those must be
    defined before this function is called.

    Args:
        learning_rate: Adam learning rate. Defaults to 0.2e-4, the value that
            used to be hard-coded.

    Returns:
        Tuple ``(capnet, optimizer)``.
    """
    # Instantiate the capsule network.
    # NOTE(review): the vocab size is passed as len(embeddings) - 1; confirm
    # this matches what Embed_Layer expects.
    capnet = Capsule_Main(eng_embeddings, len(eng_embeddings)-1, fa_embeddings, len(fa_embeddings)-1)

    # Send model to `device` (GPU/CPU)
    capnet.to(device)

    # Adam is the optimizer in use. Alternatives tried earlier:
    # optim.Adadelta(capnet.parameters(), lr=0.1, rho=0.95)
    # optim.SGD(capnet.parameters(), lr=0.001, momentum=0.9)
    # optim.ASGD(capnet.parameters(), lr=0.0001, lambd=0.001, alpha=0.5, t0=1000000.0, weight_decay=0)
    # optim.AdamW(capnet.parameters(), lr=0.00001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False)
    optimizer = optim.Adam(capnet.parameters(), lr=learning_rate)

    return capnet, optimizer

### 3.3. Training Loop

In [None]:
import random
import time
# Specify loss function
loss_fn = torch.nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    with the same value so that runs are repeatable.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed_value)
def train(model, optimizer, train_dataloader, val_dataloader=None, epochs=10):
    """Train ``model`` and print a progress table.

    Args:
        model: network called as ``model(input_ids, features)``. It must
            return unnormalised class scores (logits). A model whose
            ``forward`` needs further arguments (e.g. a language code) has to
            provide defaults for them.
        optimizer: optimizer bound to ``model``'s parameters.
        train_dataloader: yields ``(input_ids, features, labels)`` batches.
        val_dataloader: optional loader handed to ``evaluate`` after every
            epoch; when ``None`` no validation row is printed.
        epochs: number of passes over ``train_dataloader``.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Load batch to GPU
            b_input_ids, b_features, b_labels = tuple(tensor.to(device) for tensor in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This returns raw logits.
            logits = model(b_input_ids, b_features.float())

            # CrossEntropyLoss applies log-softmax internally, so it is fed the
            # raw logits. Applying softmax first would normalise twice and
            # flatten the gradients.
            loss = loss_fn(logits, b_labels)
            step_loss = loss.item()
            batch_loss += step_loss
            total_loss += step_loss

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters
            optimizer.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if val_dataloader is not None:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """Measure loss and accuracy of ``model`` on ``val_dataloader``.

    Args:
        model: network called as ``model(input_ids, features)``; must return
            unnormalised class scores (logits).
        val_dataloader: yields ``(input_ids, features, labels)`` batches.

    Returns:
        Tuple ``(val_loss, accuracy)``: the mean of the per-batch losses and
        the accuracy in percent.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_loss = []
    y_pred = []
    y_true = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_features, b_labels = tuple(tensor.to(device) for tensor in batch)

        with torch.no_grad():
            # Compute logits
            logits = model(b_input_ids, b_features.float())
            # CrossEntropyLoss expects raw logits (it applies log-softmax itself).
            loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # The predicted class is the arg-max score per sample; softmax is
        # monotonic, so it is not needed to pick the class.
        y_pred.extend(torch.argmax(logits, dim=1).cpu().tolist())
        y_true.extend(b_labels.cpu().tolist())

    # Compute the average accuracy and loss over the validation set.
    accuracy = accuracy_score(y_true, y_pred) * 100
    val_loss = np.mean(val_loss)

    return val_loss, accuracy

## 4. Evaluation and Testing Model (K-Fold)

In [None]:
# torch.cuda.empty_cache()

In [None]:
# Stratified K-fold evaluation: for every trait, balance the two classes,
# rebuild the vocabulary and embeddings, then train and score a fresh model
# on each fold.
trait_labels = kaggle_trait_labels
n_splits = 10
batch_size = 50
set_seed(seed)
# One entry is appended to every list once a fold has finished, so the lists
# always have equal length, even if the run is interrupted.
expdata = {'train_acc': [], 'val_acc': [], 'val_loss': [], 'trait': [], 'fold': []}

for trait_idx, trait in enumerate(trait_labels):
    print(trait)
    print("before balancing")
    # The column of the current trait becomes the binary target 'toxic'.
    df = data_features_df.rename(columns={trait: 'toxic'})
    label_0 = df.loc[df.toxic == 0]
    label_1 = df.loc[df.toxic == 1]
    print(f"{len(label_1)} , {len(label_0)}")
    min_length = min(len(label_1), len(label_0))
    print(min_length)

    # Undersample the larger class so both classes have `min_length` rows.
    df = pd.concat([
        label_0.sample(n=min_length, random_state=seed),
        label_1.sample(n=min_length, random_state=seed)])

    label_0 = df.loc[df.toxic == 0]
    label_1 = df.loc[df.toxic == 1]
    print("after balancing")
    print(f"{len(label_1)} , {len(label_0)}")

    # Feature columns are everything except the trailing
    # len(trait_labels) + 1 columns.
    feature_columns = df.columns[:-(len(trait_labels) + 1)]
    inputs = df[feature_columns].values
    targets = df['toxic'].values

    # Tokenize, build vocabulary, encode tokens
    print("Tokenizing...\n")
    df['text'], word2idx, max_len = tokenize(df['text'])

    # Load fastText pretrained vectors. `initilize_model` reads
    # `eng_embeddings` and `fa_embeddings` as globals, so the names matter.
    eng_embeddings = load_pretrained_vectors(word2idx, '/content/drive/MyDrive/personality_detection_XLM-RoBERTa/fastText/crawl-300d-2M.vec')
    eng_embeddings = torch.tensor(eng_embeddings)

    fa_embeddings = load_vectors(word2idx, '/content/drive/MyDrive/personality_detection_XLM-RoBERTa/fastText/farsi_fasttext/cc.fa.300.vec')
    fa_embeddings = torch.tensor(fa_embeddings)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=False)
    for k, (train_index, test_index) in enumerate(skf.split(inputs, targets), start=1):

        train_df = df.iloc[train_index]
        val_df = df.iloc[test_index]

        # Convert all data to torch.Tensor
        train_inputs = torch.tensor(np.array(encode(train_df['text'], word2idx, max_len)))
        train_features = torch.tensor(np.array(train_df[feature_columns].values), dtype=torch.float32)
        train_labels = torch.tensor(np.array(train_df['toxic']))

        val_inputs = torch.tensor(np.array(encode(val_df['text'], word2idx, max_len)))
        val_features = torch.tensor(np.array(val_df[feature_columns].values), dtype=torch.float32)
        val_labels = torch.tensor(np.array(val_df['toxic']))

        # Create the DataLoader for our training set
        train_data = TensorDataset(train_inputs, train_features, train_labels)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

        # Create the DataLoader for our validation set
        val_data = TensorDataset(val_inputs, val_features, val_labels)
        val_sampler = SequentialSampler(val_data)
        val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

        # Train a fresh capsule network for this fold; re-seeding makes every
        # fold start from the same initial weights.
        set_seed(seed)
        capsNet, optimizer = initilize_model()
        train(capsNet, optimizer, train_dataloader, val_dataloader, epochs=15)

        _, train_accuracy = evaluate(capsNet, train_dataloader)
        print("train accuracy:")
        print(train_accuracy)

        val_loss, val_accuracy = evaluate(capsNet, val_dataloader)
        print("val accuracy:")
        print(val_accuracy)

        expdata['train_acc'].append(train_accuracy)
        expdata['val_acc'].append(val_accuracy)
        expdata['val_loss'].append(val_loss)
        expdata['trait'].append(trait)
        expdata['fold'].append(k)
        print(k)

df = pd.DataFrame.from_dict(expdata)
df

In [None]:
# `df` also holds the string column 'trait'; restrict the mean to numeric
# columns so this does not raise on pandas 2.x.
df.mean(numeric_only=True)

## 5. Evaluation on Whole Data and Saving Model

In [None]:
# Train one model per trait on the whole (class-balanced) data set and save it.
trait_labels = kaggle_trait_labels
n_splits = 1  # no cross-validation here: a single "fold" per trait
batch_size = 50
set_seed(seed)
# One entry is appended to every list once a trait has finished training.
expdata = {'train_acc': [], 'trait': [], 'fold': []}

for trait_idx, trait in enumerate(trait_labels):
    # NOTE(review): the first trait is skipped, so no model is trained or
    # saved for it. This looks like a leftover from resuming an interrupted
    # run; confirm and remove if a full run is wanted.
    if trait_idx == 0:
        continue
    print(trait)
    print("before balancing")
    # The column of the current trait becomes the binary target 'toxic'.
    df = data_features_df.rename(columns={trait: 'toxic'})
    label_0 = df.loc[df.toxic == 0]
    label_1 = df.loc[df.toxic == 1]
    print(f"{len(label_1)} , {len(label_0)}")
    min_length = min(len(label_1), len(label_0))
    print(min_length)

    # Undersample the larger class so both classes have `min_length` rows.
    df = pd.concat([
        label_0.sample(n=min_length, random_state=seed),
        label_1.sample(n=min_length, random_state=seed)])

    label_0 = df.loc[df.toxic == 0]
    label_1 = df.loc[df.toxic == 1]
    print("after balancing")
    print(f"{len(label_1)} , {len(label_0)}")

    # Tokenize, build vocabulary, encode tokens
    print("Tokenizing...\n")
    df['text'], word2idx, max_len = tokenize(df['text'])

    # Load fastText pretrained vectors. `initilize_model` reads
    # `eng_embeddings` and `fa_embeddings` as globals, so the names matter.
    eng_embeddings = load_pretrained_vectors(word2idx, '/content/drive/MyDrive/personality_detection_XLM-RoBERTa/fastText/crawl-300d-2M.vec')
    eng_embeddings = torch.tensor(eng_embeddings)

    fa_embeddings = load_vectors(word2idx, '/content/drive/MyDrive/personality_detection_XLM-RoBERTa/fastText/farsi_fasttext/cc.fa.300.vec')
    fa_embeddings = torch.tensor(fa_embeddings)

    train_df = df
    # Feature columns are everything except the trailing
    # len(trait_labels) + 1 columns.
    feature_columns = train_df.columns[:-(len(trait_labels) + 1)]

    # Convert all data to torch.Tensor
    train_inputs = torch.tensor(np.array(encode(train_df['text'], word2idx, max_len)))
    train_features = torch.tensor(np.array(train_df[feature_columns].values), dtype=torch.float32)
    train_labels = torch.tensor(np.array(train_df['toxic']))

    # Create the DataLoader for our training set
    train_data = TensorDataset(train_inputs, train_features, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Build the capsule network through the zero-argument `initilize_model()`
    # defined in this notebook; the previous call passed CNN-specific keyword
    # arguments that this function does not accept.
    set_seed(seed)
    capsNet, optimizer = initilize_model()
    # The training loader doubles as the "validation" loader, so the table's
    # validation columns show training-set metrics.
    train(capsNet, optimizer, train_dataloader, train_dataloader, epochs=20)

    _, train_accuracy = evaluate(capsNet, train_dataloader)
    expdata['train_acc'].append(train_accuracy)
    expdata['trait'].append(trait)
    expdata['fold'].append(1)
    print("train accuracy:")
    print(train_accuracy)

    # NOTE(review): the file name still starts with 'cnn_' although a capsule
    # network is saved; it is left unchanged so existing loaders keep working.
    torch.save(capsNet, '/content/drive/MyDrive/personality_detection_XLM-RoBERTa/fastText/models/cnn_(kaggle+essays-test)_fa_fast_xlm_' + trait + '.pth')

df = pd.DataFrame.from_dict(expdata)
df

In [None]:
# `df` also holds the string column 'trait'; restrict the mean to numeric
# columns so this does not raise on pandas 2.x.
df.mean(numeric_only=True)

train_acc    67.211658
fold          1.000000
dtype: float64