# Master Thesis on the Semantics of (made-up) Names

* Author: Aron Joosse
* Supervisor: Giovanni Cassani
* Institution: Tilburg University

Can take inspiration from: https://github.com/Masetto96/BA-Thesis-form-meaning-mapping/blob/master/form_meaning_mapping.ipynb

# Library Imports

In [1]:
!pip install fasttext --progress-bar off
!pip install -U spacy --progress-bar off
!python -m spacy download en_core_web_sm
!pip install keras

from google.colab import drive

# Preprocessing
import re
import os
import spacy
import pickle
import pandas as pd
from pandas import read_csv

# FastText
import fasttext
import fasttext.util

# MEN and SimLex Benchmarks
from os import listdir
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity

# ElasticNet and ANN
import sklearn
from sklearn.model_selection import cross_val_score, RepeatedKFold, train_test_split, StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, mean_squared_error
from sklearn.linear_model import ElasticNetCV

import numpy as np
from numpy import mean
from numpy import std
from numpy import absolute
from numpy.random import seed

from tensorflow import keras
from tensorflow.random import set_seed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import HeNormal
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.2-py2.py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3141909 sha256=6a73ed505a5a181d1b62cf388ae5586d480a9d15d9f7dba5c3d03575b26a062f
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.9.2
Collecting spacy
  Downloading spacy-3.2.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[?25l
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting spacy-legacy<3.1.0,>=3.0.8
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collec

# Data Import

In [2]:
## Getting the list of madeup names:
drive.mount("/content/drive", force_remount=True) 
ratings_csv = pd.read_csv("drive/MyDrive/Thesis/Data/giovanni_email_data/avgRatings_annotated.csv",
                          usecols = ["name", "name_type"])                      ## Importing the names and name types

ratings_csv.head(10)

madeup_names = []

for i in ratings_csv.index:                                                     ## Only choosing madeup names so I can filter them out of the FT vocab
  if ratings_csv["name_type"][i] == "madeup":
    madeup_names.append(str(ratings_csv["name"][i]))

madeup_names_lower = list(map(lambda x: x.lower(), madeup_names))               ## Lowercasting the names since my entire vocab will be lowercast

print(madeup_names[:5])
print(len(madeup_names))
print(madeup_names_lower[:5])
print(len(madeup_names_lower))

Mounted at /content/drive
['Alastor', 'Alecto', 'Amabala', 'Araminta', 'Arcturus']
60
['alastor', 'alecto', 'amabala', 'araminta', 'arcturus']
60


In [3]:
path = "drive/My Drive/Thesis/Data/CoCA/Text/"                                  ## These are the paths to easily export/import my dicts, txts, models, and pickles
dict_path = "drive/My Drive/Thesis/Data/CoCA/dict_pickles/"
unclean_path = path + "texts_combined/all_texts_combined.txt"
model_path = "drive/My Drive/Thesis/Data/CoCA/models/"
pickle_path = "drive/MyDrive/Thesis/Data/fastText and others/"
norms_path = "drive/My Drive/Thesis/Data/Norms/"

## COCA

In [None]:
unclean_corpus = open(unclean_path).read()                                      ## Importing the entire coca

In [None]:
print(len(unclean_corpus))                                                      ## Showing the length and first 100 characters of the coca
print(unclean_corpus[:100])

2977527143
@@4170367 Headnote # A puzzle has long pervaded the criminal law : why are two offenders who commit 


## Names

In [4]:
### Read CSV File and Delete Unimportant Columns (i.e., everything that isn't the name, name type, rating, or the author's choice)

### This is input for the FT model, which itself is the input for the ElasticNet and ANN regressions

names_ratings = read_csv("drive/MyDrive/Thesis/Data/giovanni_email_data/avgRatings_annotated.csv")

#print(names_ratings.head())

print(names_ratings['rating.mean_age'].notna().sum())                           ## Choosing only those rows where all columns are not NA
print(names_ratings['rating.mean_gender'].notna().sum())
print(names_ratings['rating.mean_valence'].notna().sum())

df_age = names_ratings.loc[names_ratings['rating.mean_age'].notna(), ['name', 'rating.mean_age', 'age', 'name_type']]   ## Choosing the relevant columns
print(df_age.head(), len(df_age))

df_gender = names_ratings.loc[names_ratings['rating.mean_gender'].notna(), ['name', 'rating.mean_gender', 'gender', 'name_type']]
print(df_gender.head(), len(df_gender))

df_polarity = names_ratings.loc[names_ratings['rating.mean_valence'].notna(), ['name', 'rating.mean_valence', 'polarity', 'name_type']]
print(df_polarity.head(), len(df_polarity))

119
179
63
       name  rating.mean_age    age name_type
0  Adelaide        -0.617647    old      real
2  Alasdair        18.709677  young      real
3   Alastor        13.812500    old    madeup
4    Alecto         3.593750    old    madeup
5     Alice       -13.969697  young      real 119
       name  rating.mean_gender  gender name_type
0  Adelaide           45.727273  female      real
1   Adelina           47.771429  female      real
2  Alasdair          -35.657143    male      real
3   Alastor          -38.833333    male    madeup
4    Alecto          -35.722222  female    madeup 179
        name  rating.mean_valence polarity name_type
1    Adelina            31.621622      bad      real
7    Amabala             5.935484     good    madeup
8      Apple            32.444444     good   talking
11  Arcturus           -11.166667     good    madeup
13   Arobynn             7.645161      bad    madeup 63


# Preprocessing


## Cleaning Corpus

In [None]:
## Loading the English spacy pipeline and removing stopwords (since we are interested in gender bias, it's best to leave these words in)

nlp = spacy.load("en_core_web_sm")
nlp.max_length = 10000000000

nlp.Defaults.stop_words.remove('him')
nlp.Defaults.stop_words.remove('her')
nlp.Defaults.stop_words.remove('hers')
nlp.Defaults.stop_words.remove('his')
nlp.Defaults.stop_words.remove('he')
nlp.Defaults.stop_words.remove('she')
nlp.Defaults.stop_words.remove('himself')
nlp.Defaults.stop_words.remove('herself')

In [None]:
def clean_corpus_sentenced(data, corpus_dict, index):
  ## Input: 
  # - Data = A (very) large string of corpus text
  # - Corpus_dict = a dictionary to store individual sentences in
  # - Index = the last index from the previous batch

  ## Process: 
  # Remove all unwanted tokens, and store individual sentences in the dictionary

  ## Output: 
  # - The dictionary of preprocessed sentences
  # - The last sentence index, for the next batch to continue with (so that the order of the sentences is kept)

  # Tokenization
  with nlp.select_pipes(disable=["lemmatizer", "tok2vec", "tagger", "parser"]):
    nlp.enable_pipe("senter")                                                   ## Helps with better segmenting into sentences
    doc = nlp(data)

  sentence = ""                                                                 ## Initialize an empty sentence

  for token in doc:
    if token.is_sent_start is True:                                             ## If token is the star of the sentence, add the previous sentence to the dictionary
      if sentence == "":                                                        ## and create a new, clean sentence
        continue
      else:
        corpus_dict[index] = sentence
        sentence = ""
        index += 1
    
    if token.is_upper is True:                                                  ## Remove all full-caps words
      continue
    elif token.is_stop is True:                                                 ## Remove all stopwords
      continue
    elif str(token).lower() in madeup_names_lower:                              ## Remove all words that are in my list of made-up names
      continue
    elif token.is_alpha:                                                        ## If the token has passed all previous tests, and it consists only of alphabetic
      sentence += str(token).lower() + " "                                      ## characters, lowercast it and add an extra space to the end; continue to the next
                                                                                ## token
  return corpus_dict, index

In [None]:
def corpus_dict_maker(data, start, end, index):
  ## Input: 
  # - Data = The entire uncleaned corpus
  # - Start = The character index to indicate the start of the current batch to preprocess
  # - End = The character index to indicate the end of the current batch to preprocess
  # - Index = The sentence index from the previous batch, to keep track of the number and order of the sentences

  ## Process: 
  # Preprocess the corpus in batches, since there was not enough RAM to preprocess the entire corpus at once
  # So, for every batch, all the characters between the start index and the end index are fed into the clean_corpus_sentenced() function
  # and then this dictionary of sentences is saved as a pickle to Google drive

  ## Output: 
  # Nothing; except that the sentence index is printed, which is used as input for the next batch (this was printed, so that it couldn't be lost if the 
  # runtime would disconnect (which it sadly did very often))

  drive.mount("/content/drive", force_remount=True)                             ## Connect to google drive
  
  corpus_dict = {}                                                              ## Create empty dictionary

  prev_i = (start-2)*1000000                                                    ## Start with preprocessing the two million characters before the previous index
                                                                                ## since the next range only preprocesses up to but not including the 'end' index

  for i in range(start, end, 2):                                                ## In batches of 2 million characters, feed the batch to clean_corpus_sentenced()
    print(i)                                                                   
    i *= 1000000
    corpus_dict, index = clean_corpus_sentenced(data[prev_i:i],
                                                corpus_dict,
                                                index)
    prev_i = i
  
  if prev_i == 2976000000:                                                      ## Hardcoded; if we get near the end of the corpus, don't preprocess in a batch of
    corpus_dict, index = clean_corpus_sentenced(data[prev_i:],                  ## 2 million characters (we would get an out of range error), but rather just 
                                                corpus_dict,                    ## preprocess the remaining characters, however many that may be
                                                index)

  print(index)

  pickle_out = open(dict_path + "corpus_dict_until_" + str(end) + ".pickle", "wb")  ## Save the dictionary as a pickle
  pickle.dump(corpus_dict, pickle_out)
  pickle_out.close()

  drive.flush_and_unmount()                                                     ## Flush the pickle to my drive
  print('All changes made in this colab session should now be visible in Drive.')



#### All of the individual batches:

In [None]:
## doing it in batches to 
## (1) make it possible in terms of time and the Google afk-checker captcha pop-up, and 
## (2) to not blow out the RAM and have it break down

#corpus_dict_maker(unclean_corpus, 2, 500, 0)                   
#corpus_dict_maker(unclean_corpus, 500, 640, 3217086)           
#corpus_dict_maker(unclean_corpus, 640, 760, 4232218)           
#corpus_dict_maker(unclean_corpus, 760, 800, 5439287)           
#corpus_dict_maker(unclean_corpus, 800, 900, 5888161)
#corpus_dict_maker(unclean_corpus, 900, 980, 7020129)
#corpus_dict_maker(unclean_corpus, 980, 1150, 7891661)
#corpus_dict_maker(unclean_corpus, 1150, 1200, 9903820) 
#corpus_dict_maker(unclean_corpus, 1200, 1204, 10502592)

In [None]:
######### This block is separate because it kept crashing if I did it otherwise:

#drive.mount("/content/drive", force_remount=True) 
#corpus_dict = {}

#prev_i = (1202)*1000000
#i = 1203*1000000
#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
#                                            corpus_dict,
#                                            10547544)
#prev_i = i

#print(index)

#pickle_out = open(path + "corpus_dict_until_" + str(1203) + "_post_1204" + ".pickle", "wb")
#pickle.dump(corpus_dict, pickle_out)
#pickle_out.close()

#drive.flush_and_unmount()
#print('All changes made in this colab session should now be visible in Drive.')

################################################################################

######### This block is separate because it kept crashing if I did it otherwise:

#drive.mount("/content/drive", force_remount=True) 
#corpus_dict = {}

#prev_i = (1203)*1000000
#i = 1204*1000000
#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
#                                            corpus_dict,
#                                            index + 1)
#prev_i = i

#print(index)

#pickle_out = open(path + "corpus_dict_until_" + str(1204) + "_post_1204" + ".pickle", "wb")
#pickle.dump(corpus_dict, pickle_out)
#pickle_out.close()

#drive.flush_and_unmount()
#print('All changes made in this colab session should now be visible in Drive.')

################################################################################

######### This block is separate because it kept crashing if I did it otherwise:

#drive.mount("/content/drive", force_remount=True) 
#corpus_dict = {}

#prev_i = (1204)*1000000
#i = 1205*1000000
#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
#                                            corpus_dict,
#                                            10580543)
#prev_i = i

#print(index)

#pickle_out = open(path + "corpus_dict_until_" + str(1205) + "_post_1204" + ".pickle", "wb")
#pickle.dump(corpus_dict, pickle_out)
#pickle_out.close()

#drive.flush_and_unmount()
#print('All changes made in this colab session should now be visible in Drive.')

################################################################################

######### This block is separate because it kept crashing if I did it otherwise:

#drive.mount("/content/drive", force_remount=True) 
#corpus_dict = {}

#prev_i = (1205)*1000000
#i = 1206*1000000
#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
#                                            corpus_dict,
#                                            index + 1)
#prev_i = i

#print(index)

#pickle_out = open(path + "corpus_dict_until_" + str(1206) + "_post_1204" + ".pickle", "wb")
#pickle.dump(corpus_dict, pickle_out)
#pickle_out.close()

#drive.flush_and_unmount()
#print('All changes made in this colab session should now be visible in Drive.')

################################################################################

In [None]:
#corpus_dict_maker(unclean_corpus, 1208, 1300, 10611881)
#corpus_dict_maker(unclean_corpus, 1300, 1500, 11486498)
#corpus_dict_maker(unclean_corpus, 1500, 1750, 13172710)
#corpus_dict_maker(unclean_corpus, 1750, 1846, 15332847)
#corpus_dict_maker(unclean_corpus, 1846, 1848, 16123425)
#corpus_dict_maker(unclean_corpus, 1848, 1850, 16147433)
#corpus_dict_maker(unclean_corpus, 1850, 1900, 16172965)
#corpus_dict_maker(unclean_corpus, 1900, 1968, 16855832)
#corpus_dict_maker(unclean_corpus, 1968, 1970, 17790964)
#corpus_dict_maker(unclean_corpus, 1970, 2000, 17819821)
#corpus_dict_maker(unclean_corpus, 2000, 2022, 18244076)
#corpus_dict_maker(unclean_corpus, 2022, 2024, 18536113)
#corpus_dict_maker(unclean_corpus, 2024, 2026, 18558956)
#corpus_dict_maker(unclean_corpus, 2026, 2068, 18583534)
#corpus_dict_maker(unclean_corpus, 2068, 2070, 19154020)
#corpus_dict_maker(unclean_corpus, 2070, 2100, 19179335)
#corpus_dict_maker(unclean_corpus, 2100, 2166, 19598984)
#corpus_dict_maker(unclean_corpus, 2166, 2168, 20488725)
#corpus_dict_maker(unclean_corpus, 2168, 2188, 20524278)

In [None]:
######### This block is separate because it kept crashing if I did it otherwise:

#drive.mount("/content/drive", force_remount=True) 
#corpus_dict = {}

#prev_i = (2186)*1000000
#i = 2187*1000000
#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
#                                            corpus_dict,
#                                            20779021)
#prev_i = i

#print(index)

#pickle_out = open(path + "corpus_dict_until_" + str(2187) + "_post_2188" + ".pickle", "wb")
#pickle.dump(corpus_dict, pickle_out)
#pickle_out.close()

#drive.flush_and_unmount()
#print('All changes made in this colab session should now be visible in Drive.')

################################################################################

######### This block is separate because it kept crashing if I did it otherwise:

#drive.mount("/content/drive", force_remount=True) 
#corpus_dict = {}

#prev_i = (2187)*1000000
#i = 2188*1000000
#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
#                                            corpus_dict,
#                                            index + 1)
#prev_i = i

#print(index)

#pickle_out = open(path + "corpus_dict_until_" + str(2188) + "_post_2188" + ".pickle", "wb")
#pickle.dump(corpus_dict, pickle_out)
#pickle_out.close()

#drive.flush_and_unmount()
#print('All changes made in this colab session should now be visible in Drive.')

################################################################################

In [None]:
#corpus_dict_maker(unclean_corpus, 2190, 2200, 20812123)
#corpus_dict_maker(unclean_corpus, 2200, 2310, 20945992)
#corpus_dict_maker(unclean_corpus, 2310, 2312, 22397914)
#corpus_dict_maker(unclean_corpus, 2312, 2400, 22424124)
#corpus_dict_maker(unclean_corpus, 2400, 2600, 23465826)
#corpus_dict_maker(unclean_corpus, 2600, 2800, 25199888)
#corpus_dict_maker(unclean_corpus, 2800, 2977, 26938737)

In [None]:
## This block of code opens all of the corpus dict pickles, and combines them together
## into one big dictionary: corpus_dict_complete

file_number = 1
file_list = []
for file_name in os.listdir(dict_path):                                         ## Locate all dicts in the folder
  with open(dict_path + str(file_name), 'rb') as f:
    exec("dict_" + str(file_number) + " = " + "pickle.load(f)")
    file_list.append("dict_" + str(file_number))    
    file_number += 1
  
corpus_dict_complete = {}
for file_name in file_list:                                                     
  corpus_dict_complete = {**corpus_dict_complete, **globals()[file_name]}       ## Add the contents of the dicts to dict_complete
  del globals()[file_name]

#print(len(corpus_dict_complete))
#del file_number
#del file_list

#pickle_out = open(dict_path + "corpus_dict_complete.pickle", "wb")             ## Create a new pickle
#pickle.dump(corpus_dict_complete, pickle_out)
#del corpus_dict_complete
#pickle_out.close()

#drive.flush_and_unmount()                                                      ## Flush the pickle to my drive
#print('All changes made in this colab session should now be visible in Drive.')

In [None]:
## This block of code opens the corpus_dict_complete pickle, and loops through the keys by index (from lowest to highest).
## So, we loop through every sentence, in order. These are stored in two different .txt files, one where the sentence structure is remained 
## (i.e., between every sentence, we add a newline character), and one that's unsentenced (i.e., no newline character between sentences).

with open(dict_path + "corpus_dict_complete.pickle", "rb") as d:                ## Open corpus_dict_complete
  corpus_dict_complete = pickle.load(d)

  with open(path + "cleaned_sentenced_corpus_complete.txt", "w") as f:          ## Open sentenced corpus .txt file
    for key in sorted(corpus_dict_complete):
      if len(str(corpus_dict_complete[key]).split()) < 2:                       ## Remove sentences with only 1 word (since there's no 'context' in that case)
        continue
      else:
        if str(corpus_dict_complete[key])[:2] in ["m ", "p ", "s "]:            ## I have to add this, because based on manual inspection, a significant amount of 
          f.write(str(corpus_dict_complete[key])[2:] + "\n")                    ## sentences start with just a "p", "m", or "s"
        else:
          f.write(str(corpus_dict_complete[key]) + "\n")

  with open(path + "cleaned_unsentenced_corpus_complete.txt", "w") as f2:       ## Open unsentenced corpus .txt file
    for key in sorted(corpus_dict_complete): 
      if len(str(corpus_dict_complete[key])) < 4:                               ## Remove sentences with less than 4 characers, since based on visual inspection, I
        continue                                                                ## saw that such sentences are mostly nonsense (i.e., not real words)
      else:
        if str(corpus_dict_complete[key])[:2] in ["m ", "p ", "s "]:
          f2.write(str(corpus_dict_complete[key])[2:])
        else:
          f2.write(str(corpus_dict_complete[key]))
  
drive.flush_and_unmount()                                                       ## Flush to drive
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.


## Training fastText and Validating on MEN and SimLex999

### fastText

In [None]:
# def fasttext_tuner(data_type):                                                ## I have chosen Skipgram, I won't play around with epochs or learning rate
#   for dimensionality in [100, 300]:                                           ## I'm first checking data type and dimensionality, and choosing the most promising combination.
#     for window_size in [2, 3, 4, 5, 6, 7]:                                    ## Then I'll check window size and choose the 3 most promising ones
#       for min_n in [1, 2, 3]:                                                 ## Then I want to check min_n 2 and 1 to see whether adding n-gram size of 1 makes any sense
#         for max_n in [5, 6, 7]:                                               ## Lastly, I will iterate through the 3 * 2 * 3 most promising models, finally choosing the best one and then double checking that with
#           model = fasttext.train_unsupervised(input = path + "cleaned_" +     ## the 100/300 sentenced/unsentenced options, just to be sure! In total, this means training 4 + 5 + ~16 + 3 = ~30 models instead of 216
#                                               data_type + 
#                                               "_corpus_complete.txt",
#                                               model = "skipgram",
#                                               dim = dimensionality, 
#                                               ws = window_size, 
#                                               minn = min_n,
#                                               maxn = max_n)
          
#           model.save_model(model_path + data_type + "_dim" + str(dimensionality) + 
#                             "_ws" + str(window_size) + "_minn" + str(min_n) + 
#                             "_maxn" + str(max_n) + ".bin")
          
#           print(data_type + "_dim" + str(dimensionality) + "_ws" + str(window_size) + 
#                 "_minn" + str(min_n) + "_maxn" + str(max_n))

#           del model

#   drive.flush_and_unmount()
#   print('All changes made in this colab session should now be visible in Drive.')

In [None]:
fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')
ft.save_model(model_path + "pretrained_model.bin")

In [None]:
def fasttext_tuner(data_type, dimensionality = 300, window_size = 5, min_n = 3, max_n = 6):
  ## Input: 
  # - Data Type: Sentenced or unsentenced, to load the corpus type to train model
  # - Dimensionality: List of dimensionalities
  # - Window size: List of window sizes 
  # - Min_n: List of minimum n-gram sizes 
  # - Max-n: List of maximum n-gram sizes 

  ## Process: 
  # For every combination of input variables, a fastText model is trained and saved

  ## Output:
  # Nothing; fastText model is trained and saved

  for d in dimensionality:                                           
    for ws in window_size:                                    
      for minn in min_n:                                                 
        for maxn in max_n:
          drive.mount("/content/drive", force_remount=True)                     ## Mount drive

          model = fasttext.train_unsupervised(input = path + "cleaned_" + data_type + "_corpus_complete_without_p_s_m.txt",   ## Train model
                                              model = "skipgram", dim = d, ws = ws, minn = minn, maxn = maxn)

          model.save_model(model_path + data_type + "_dim" + str(d) + "_ws" + str(ws) + "_minn" + str(minn) + "_maxn" + str(maxn) + ".bin")   ## Save model

          print(data_type + "_dim" + str(d) + "_ws" + str(ws) + "_minn" + str(minn) + "_maxn" + str(maxn))

          drive.flush_and_unmount()                                             ## Flush model to drive
          print('All changes made in this colab session should now be visible in Drive.')

          del model

In [None]:
## Trying out some variations of fastText with Data = sentenced & Dimensionality = 100

fasttext_tuner("sentenced", [100], [2], [2], [3])
fasttext_tuner("sentenced", [100], [2, 3, 4, 5, 6, 7], [2], [5, 6, 7])
fasttext_tuner("sentenced", [100], [5], [3], [6])
fasttext_tuner("sentenced", [100], [5], [1], [6])

## Trying out some variations of fastText with Data = sentenced & Dimensionality = 300

fasttext_tuner("sentenced", [300], [3, 4], [2], [5, 6, 7])
fasttext_tuner("sentenced", [300], [5], [2], [5, 6])
fasttext_tuner("sentenced", [300], [5], [3], [6])
fasttext_tuner("sentenced", [300], [3], [3], [5])
fasttext_tuner("sentenced", [300], [3], [2], [4])
fasttext_tuner("sentenced", [300], [2, 6, 7], [2], [5])

## Trying out some variations of fastText with Data = unsentenced 

fasttext_tuner("unsentenced", [300], [5], [3], [6])
fasttext_tuner("unsentenced", [100], [5], [3], [6])

### MEN and SimLex Benchmarks

In [None]:
def compute_similarity(model, word1, word2):
  # Input:
  # - Model = the model that is used to get embeddings for word1 and word2
  # - Word1 = the first word
  # - Word2 = the second word

  # Process: 
  # Given the embeddings for word1 and word2 derived from the model, calculate
  # the cosine similarity between these words

  # Output:
  # The cosine similarity between word1 and word2 given the current model

    e1 = model[word1]
    e2 = model[word2]
    s = cosine_similarity(e1.reshape(1, -1), e2.reshape(1, -1))
    return s[0][0]

In [None]:
def compute_correlation_men(norms, model):
  # Input:
  # - Norms = The MEN similarity scores for word1 and word2
  # - Model = the model that will be scored

  # Process: 
  # For every model, note down the true similarity score (as dictated by the MEN
  # benchmark) and then estimate the similarity for the current model using the 
  # compute_similarity() function

  # Output:
  # The Spearman R between the true and estimated similarities for the current
  # model.

        true_similarities = []
        estimated_similarities = []
        for _, row in norms.iterrows():
            s = compute_similarity(model, row['w1'], row['w2'])

            estimated_similarities.append(s)
            true_similarities.append(row['sim'])
        
        return spearmanr(true_similarities, estimated_similarities)[0]

In [None]:
def compute_correlation_simlex(norms, model):
  # Input:
  # - Norms = The SimLex similarity scores for word1 and word2
  # - Model = the model that will be scored

  # Process: 
  # For every model, note down the true similarity score (as dictated by the SimLex
  # benchmark) and then estimate the similarity for the current model using the 
  # compute_similarity() function

  # Output:
  # The Spearman R between the true and estimated similarities for the current
  # model.

        true_similarities = []
        estimated_similarities = []
        for _, row in norms.iterrows():
            s = compute_similarity(model, row['word1'], row['word2'])

            estimated_similarities.append(s)
            true_similarities.append(row['SimLex999'])
        
        return spearmanr(true_similarities, estimated_similarities)[0]

In [None]:
men_norms = pd.read_csv(norms_path + 'norms.dev.csv', sep = ' ')                # Read the MEN benchmark CSV file

combi_score = {}
men_dict = {}

for model in os.listdir(model_path):                                            # For every saved model, calculate the scores on the MEN benchmarks
  men_dict[str(model)] = compute_correlation_men(men_norms, model)
  combi_score[str(model)] = compute_correlation_men(men_norms, model)

men_dict['pretrained_model'] = compute_correlation_men(men_norms, ft)           # Calculating the scores for the baseline (pretrained) model
combi_score['pretrained_model'] = compute_correlation_men(men_norms, ft)

In [None]:
simlex_norms = pd.read_csv(norms_path + 'SimLex-999.txt', sep = '\t')           # Read the SimLex benchmark text file

simlex_dict = {}

for model in os.listdir(model_path):                                            # For every saved model, calculate the scores on the SimLex benchmarks
  simlex_dict[str(model)] = compute_correlation_simlex(simlex_norms, model)
  combi_score[str(model)] = compute_correlation_simlex(simlex_norms, model)

simlex_dict['pretrained_model')] = compute_correlation_simlex(simlex_norms, ft) # Calculating the scores for the baseline (pretrained) model
combi_score['pretrained_model'] = compute_correlation_simlex(simlex_norms, ft)

In [None]:
pickle_men = open(pickle_path + "men_scores.pickle", "wb")                       # Save the MEN scores as a pickle
pickle.dump(men_dict, pickle_men)
pickle_men.close()

pickle_simlex = open(pickle_path + "simlex_scores.pickle", "wb")                 # Save the SimLex scores as a pickle
pickle.dump(simlex_dict, pickle_simlex)
pickle_simlex.close()

pickle_combi = open(pickle_path + "combi_scores.pickle", "wb")                   # Save the combined scores as a pickle
pickle.dump(combi_score, pickle_combi)
pickle_combi.close()

In [None]:
for i in sorted(men_dict, key = men_dict.get, reverse = True):
    print(i, men_dict[i])

In [None]:
for i in sorted(simlex_dict, key = simlex_dict.get, reverse = True):
    print(i, simlex_dict[i])

In [None]:
for i in sorted(combi_score, key = combi_score.get, reverse = True):
    print(i, combi_score[i])

### Final Model

In [5]:
drive.mount("/content/drive", force_remount=True) 

model = fasttext.load_model("drive/MyDrive/Thesis/Data/fastText and others/sentenced_dim300_ws2_minn2_maxn5.bin")

Mounted at /content/drive




## ElasticNet and Neural Network Regressions

In [6]:
def fasttext_xifyer(input_data):
  # Input:
  # - input_data = dataframe containing the names of the characters

  # Process:
  # Convert the character names to embeddings using FT

  # Output:
  # - df_output = a dataframe of word embeddings; one embedding per character name

  df_output = np.zeros((len(input_data), 300))                                  # Create an array of length = number of input rows & width = 300 (because of ft)

  i = 0

  for row in input_data.iterrows():                                             # For every row in the input data
    index = row[0]
    name = row[1][0].lower()                                                    # Lowercast the name
    df_output[i] = model[name]                                                  # Compute the fastText score of the name, and add it to the output array
    i += 1

  return df_output

In [7]:
def fasttext_xifyer_formless(input_data):
  # Input & Output are exactly the same as fasttext_xifyer

  # Process:
  # Similar to fasttext_xifyer, however, if the name has a 'surface' form that 
  # is already present in the model vocabulary, then we only use the embeddings
  # for the subwords. I.e., as compared to fasttext_xifyer, this function does 
  # NOT consider the 'surface' form embedding for the character names

  df_output = np.zeros((len(input_data), 300))                                  # Create an array of length = number of input rows & width = 300 (because of ft)

  c = 0

  for row in input_data.iterrows():                                             # For every row in the input data
    index = row[0]
    name = row[1][0].lower()
    if name == model.get_subwords(name)[0][0]:                                  # If the surface form of the name is present in the model vocabulary
      wordarray = np.zeros((len(model.get_subwords(name)[0][1:]), 300))         # Create an array of length = amount of subwords, and width = 300
      for i, j in enumerate(model.get_subwords(name)[0][1:]):         
          wordarray[i] = model[j]                                               # Then calculate the embeddings for every subword, and add them to the subword_array
      df_output[c] = np.mean(wordarray, axis = 0)                               # Take the mean of all of the embeddings, so that we get an array of length = 1 (i.e., 1 name), and width = 300
    else:
      df_output[c] = model[name]                                                # Else (if no surface name is present in the model vocabulary), just take the embedding of the name as is
    c += 1

  return df_output

In [8]:
mean_vector = np.zeros((len(model.words), 300))                                 # Create a mean vector array of length = amount of words in the model vocab, and width = 300

i = 0

for word in model.words:                          
  mean_vector[i] = model.get_word_vector(word)                                  # For every word in the vocab, get it's word embedding vector and add it to the mean_vector array
  i += 1

mean_vector = np.mean(mean_vector, axis = 0)                                    # Take the mean of the array so that we get a mean_vector of length = 1, and width = 300

### ElasticNet

In [9]:
def elasticnetifyer(x_train, y_train, x_test, y_test, test_full):
  # Input:
  # - x_train = array of embeddings used to train the model
  # - y_train = array of ratings used to train the model
  # - x_test = array of embeddings used to test the model
  # - y_test = array of ratings used to test the model
  # - test_full = full dataframe, which contains, among other things, the full 
  # names (i.e., not the embeddings) and the name type (real, madeup, talking)

  # Process:
  # Given x_train and y_train, train the ElasticNet CV model. When it converges,
  # calculate the MSE, MAE, and R2 for a bunch of different settings (test set,
  # name types, mean_vector-only model, mean_vector_or_surface_form model)

  # Output:
  # --> Too big to all individually explain here. The ouput contains the alpha, 
  # l1_ratio, n_iters, intercept, and MSE of the trained model.
  # Furthermore, it outputs the MAE, MSE, and R2 of the test set, as well as the
  # MAE for the three different name types, the MAE for a mean_vector_or_surface
  # _form-model, and the MAE and R2 for a mean_vector-only model.  

  regr = ElasticNetCV(l1_ratio = [0.01, 0.05, .1, 0.2, .5, .7, .9, .95, .99, 1], 
                      n_alphas = 250,
                      max_iter = 10000,
                      cv = len(x_train),
                      selection = 'random', 
                      random_state=17042020,)                                   # Set the hyperparameters of the ElasticNet CV

  regr.fit(x_train, y_train)                                                    # Fit the model on the train set

  alpha = regr.alpha_                                                           # Retrieve the alpha
  l1_ratio = regr.l1_ratio_                                                     # Retrieve the L1-ratio
  n_iters = regr.n_iter_                                                        # Retrieve the number of iterations needed to train the model
  intercept = regr.intercept_                                                   # Retrieve the intercept of the model

  mse_train = mean(regr.mse_path_)                                              # Retrieve the MSE for the train set

  mae_test = sklearn.metrics.mean_absolute_error(y_test, regr.predict(x_test))  # Retrieve the MAE for the test set
  mse_test = sklearn.metrics.mean_squared_error(y_test, regr.predict(x_test))   # Retrieve the MSE for the test set

  r2 = regr.score(x_test, y_test)                                               # Retrieve the R2 for the test set

  type_dict = {}
  type_counter = {}
  for n, i, j in zip(test_full['name_type'], y_test, x_test):                   # For every name type (i.e., real, talking, and madeup)
      if n in type_dict.keys():
          type_dict[n] = type_dict[n] + abs(i - regr.predict(j.reshape(1, -1))) # Append the MAE for every name given that type (so that you get a sum of MAEs; one for each name)
          type_counter[n] = type_counter[n] + 1                                 # And count the number of names given that type
      else:
          type_dict[n] = abs(i - regr.predict(j.reshape(1, -1)))
          type_counter[n] = 1
      
  for i in type_dict.keys():
    globals()[f"mae_{i}"] = float(type_dict[i])/float(type_counter[i])          # Calculate the average MAE per name type: (sum of MAEs for name type / name counter for name type)


  mean_or_form_r2_array = np.zeros((len(x_test), 300))

  mean_or_form_abs = 0
  mean_or_form_counter = 0
  for n, i, j in zip(test_full['name'], y_test, x_test):                        # For every name in the test set
      n = n.lower()                                                             # Convert name to lowercase
      if n == model.get_subwords(n)[0][0]:                                      # If the name is present in the model vocabulary
          mean_or_form_abs += abs(i - regr.predict(model[n].reshape(1, -1)))    # Retrieve the MAE of the name using the model embedding, add it to the MAE summation variable
          mean_or_form_r2_array[mean_or_form_counter] = model[n]                # and add the embedding to the R2 array
          mean_or_form_counter += 1
      else:                                                                     # If it isn't present in the model vocabulary
          mean_or_form_abs += abs(i - regr.predict(mean_vector.reshape(1, -1))) # Take the mean vector and retrieve the MAE, add it to the MAE summation variable
          mean_or_form_r2_array[mean_or_form_counter] = mean_vector             # and add the embedding to the R2 array
          mean_or_form_counter += 1

  mean_or_form_mae = float(mean_or_form_abs / mean_or_form_counter)             # Calculate the average MAE by taking the summation variable and dividing it by the counter variable

  mean_or_form_r2 = regr.score(mean_or_form_r2_array, y_test)

  mean_vec_array = np.full((len(x_test), 300), mean_vector)                     # Create a mean vector array with length = test_set_length, and width = 300

  mean_vec_mae_test = sklearn.metrics.mean_absolute_error(y_test, regr.predict(mean_vec_array))     # Retrieve the MAE for the mean vector array
  mean_vec_mse_test = sklearn.metrics.mean_squared_error(y_test, regr.predict(mean_vec_array))      # Retrieve the MSE for the mean vector array

  mean_vec_r2 = regr.score(mean_vec_array, y_test)                              # Retrieve the R2 for the mean_vector array

  return alpha, l1_ratio, n_iters, intercept, mse_train, mae_test, mse_test, r2, mae_madeup, mae_real, mae_talking, mean_or_form_mae, mean_or_form_r2, mean_vec_mae_test, mean_vec_mse_test, mean_vec_r2

In [10]:
def nested_cross_validator(df, rating, dictionary):
  # Input:
  # - df = a dataframe with the name, name_type, and rating for the dimension at  
  # hand (i.e., age, gender, or polarity)
  # - rating = a string indicating what rating to extract from the df
  # - dictionary = an empty dictionary to store all of the variables from 
  # elasticnetifyer() in

  # Process:
  # Given the df, get 5 train/test splits, and per fold, train a LOOCV ElasticNet
  # model using elasticnetifyer() for both the regular and formless data. 
  # Then, average the scores retrieved on the different folds, and print it.

  # Output: 
  # Nothing, but the results for all of the variables output by elasticnetifyer()
  # are averaged (so that we get a single score, combining scores of all of the 
  # different folds). These scores are subsequently printed.

  skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=17042020)      # Set up a stratified 5-fold splitter

  for train_index, test_index in skf.split(df, df[['name_type']]):              # For every fold (stratified on the name type, i.e., real, madeup, or talking)
    
    x_train_unfasttexted = df.iloc[train_index]                                 # Split the data into x_train and x_test
    x_test_unfasttexted = df.iloc[test_index]


    y_train = df.iloc[train_index][rating]                                      # Split the data into y_train and y_test
    y_test = df.iloc[test_index][rating]

    x_train = fasttext_xifyer(x_train_unfasttexted)                             # Get the regular embeddings (train)
    x_train_formless = fasttext_xifyer_formless(x_train_unfasttexted)           # And get the surface_form-less embeddings (train)

    x_test = fasttext_xifyer(x_test_unfasttexted)                               # Get the regular embeddings (test)
    x_test_formless = fasttext_xifyer_formless(x_test_unfasttexted)             # And get the surface_form-less embeddings (test)


    alpha, l1_ratio, n_iters, intercept, mse_train, mae_test, mse_test, r2, \
    mae_madeup, mae_real, mae_talking, mean_or_form_mae, mean_or_form_r2, \
    mean_vec_mae_test, mean_vec_mse_test, \
    mean_vec_r2 = elasticnetifyer(x_train, y_train, x_test, y_test, df)         # Train the ElasticNetCV given the fold, and return all of the variables of interest (regular)
    
    dictionary['regular'].append([alpha, l1_ratio, n_iters, intercept, mse_train, 
                                  mae_test, mse_test, r2, mae_madeup, mae_real, 
                                  mae_talking, mean_or_form_mae, mean_or_form_r2, 
                                  mean_vec_mae_test, mean_vec_mse_test, 
                                  mean_vec_r2])                                 # Append the variables of interest to the dictionary
    
    alpha_formless, l1_ratio_formless, n_iters_formless, intercept_formless, \
    mse_train_formless, mae_test_formless, mse_test_formless, r2_formless, \
    mae_madeup_formless, mae_real_formless, mae_talking_formless, \
    mean_or_form_mae_formless, mean_or_form_r2_formless, \
    mean_vec_mae_test_formless, mean_vec_mse_test_formless, \
    mean_vec_r2_formless \
    = elasticnetifyer(x_train_formless, y_train, x_test_formless, y_test, df)   # Train the ElasticNetCV given the fold, and return all of the variables of interest (formless)

    dictionary['formless'].append([alpha_formless, l1_ratio_formless, n_iters_formless,
                                   intercept_formless, mse_train_formless, 
                                   mae_test_formless, mse_test_formless, r2_formless,
                                   mae_madeup_formless, mae_real_formless, mae_talking_formless, 
                                   mean_or_form_mae_formless, mean_or_form_r2_formless,
                                   mean_vec_mae_test_formless, mean_vec_mse_test_formless,
                                   mean_vec_r2_formless])                       # Append the variables of interest to the dictionary

In [12]:
def nested_cv_addotron(dictionary, name):
  # Input:
  # - dictionary = the dictionary of values for the dimension of interest
  # - name = string indicating the dimension of interest (i.e., age, gender, or 
  # polarity). This is used to make printing easier and more organised.

  # Process:
  # Given the dimension of interest, for all 5 folds, average the scores and 
  # print all of the variables for both the 'regular' and 'formless' models

  variable_list = ('alpha', 'l1_ratio', 'n_iters', 'intercept', 'mse_train',    # List indicating all of the variables of interest
                   'mae_test', 'mse_test', 'r2', 'mae_madeup', 'mae_real', 'mae_talking',
                   'mean_or_form_mae', 'mean_or_form_r2', 'mean_vec_mae_test', 
                   'mean_vec_mse_test', 'mean_vec_r2')
  
  type_list = ['regular', 'formless']                                           # List indicating the model type

  regular_list = dictionary['regular']
  formless_list = dictionary['formless']
  
  regular_list = [sum(x) for x in zip(*regular_list)]                           # For all of the variables in the list, sum the scores (regular)
  regular_list = [x / 5 for x in regular_list]                                  # For all of the variables in the list, divide the sum by 5 to get the average score (regular)

  formless_list = [sum(x) for x in zip(*formless_list)]                         # For all of the variables in the list, sum the scores (formless)
  formless_list = [x / 5 for x in formless_list]                                # For all of the variables in the list, divide the sum by 5 to get the average score (formless)


  for value_list, analysis_type in zip([regular_list, formless_list], type_list):
    for value, variable in zip(value_list, variable_list):
      print(f"Average {analysis_type} {variable} for {name} = {value}")         # Per variable, and per analysis type, print the corresponding average value
    print("\n")

In [None]:
age_dict = {'regular' : [], 'formless' : []}                                    # initialize the score dictionary for age

nested_cross_validator(df_age, 'rating.mean_age', age_dict)                     # Perform the 5-fold cross validation

pickle_age_ncv = open(pickle_path + "age_ncv.pickle", "wb")                     # Save the dictionary to a pickle
pickle.dump(age_dict, pickle_age_ncv)
pickle_age_ncv.close()

nested_cv_addotron(age_dict, "age")                                             # Print all of the values in an orderly fashion

Average regular alpha for age = 0.05072756831996765
Average regular l1_ratio for age = 0.43599999999999994
Average regular n_iters for age = 136.0
Average regular intercept for age = 14.950048695609155
Average regular mse_train for age = 370.10286262790083
Average regular mae_test for age = 15.895302331440499
Average regular mse_test for age = 375.90199255971396
Average regular r2 for age = 0.08337560465475283
Average regular mae_madeup for age = 17.490164988987708
Average regular mae_real for age = 14.171504170036538
Average regular mae_talking for age = 15.617077040925457
Average regular mean_or_form_mae for age = 20.442560801615947
Average regular mean_or_form_r2 for age = -0.43498162737068746
Average regular mean_vec_mae_test for age = 17.792782648068588
Average regular mean_vec_mse_test for age = 428.63615987581454
Average regular mean_vec_r2 for age = -0.043229677802586595


Average formless alpha for age = 0.06749729879751121
Average formless l1_ratio for age = 0.479999999999999

In [14]:
gender_dict = {'regular' : [], 'formless' : []}                                 # initialize the score dictionary for gender

nested_cross_validator(df_gender, 'rating.mean_gender', gender_dict)            # Perform the 5-fold cross validation

pickle_gender_ncv = open(pickle_path + "gender_ncv.pickle", "wb")               # Save the dictionary to a pickle
pickle.dump(gender_dict, pickle_gender_ncv)
pickle_gender_ncv.close()

nested_cv_addotron(gender_dict, "gender")                                       # Print all of the values in an orderly fashion

Average regular alpha for gender = 0.15315895481366898
Average regular l1_ratio for gender = 0.9299999999999999
Average regular n_iters for gender = 39.8
Average regular intercept for gender = -8.20463045373086
Average regular mse_train for gender = 905.1063392504739
Average regular mae_test for gender = 19.233139844182546
Average regular mse_test for gender = 551.8804822794039
Average regular r2 for gender = 0.5898452840428152
Average regular mae_madeup for gender = 20.80736339780847
Average regular mae_real for gender = 19.69674619213221
Average regular mae_talking for gender = 16.908570100806692
Average regular mean_or_form_mae for gender = 34.818933757111175
Average regular mean_or_form_r2 for gender = -0.29163438662340035
Average regular mean_vec_mae_test for gender = 33.65809119694448
Average regular mean_vec_mse_test for gender = 1423.0961711118614
Average regular mean_vec_r2 for gender = -0.06793078160867369


Average formless alpha for gender = 0.17331042386763676
Average form

In [13]:
polarity_dict = {'regular' : [], 'formless' : []}                               # initialize the score dictionary for polarity

nested_cross_validator(df_polarity, 'rating.mean_valence', polarity_dict)       # Perform the 5-fold cross validation

pickle_polarity_ncv = open(pickle_path + "polarity_ncv.pickle", "wb")           # Save the dictionary to a pickle
pickle.dump(polarity_dict, pickle_polarity_ncv)
pickle_polarity_ncv.close()

nested_cv_addotron(polarity_dict, "polarity")                                   # Print all of the values in an orderly fashion

Average regular alpha for polarity = 0.24835051374456635
Average regular l1_ratio for polarity = 0.782
Average regular n_iters for polarity = 272.0
Average regular intercept for polarity = 4.408481760823845
Average regular mse_train for polarity = 379.57109960567766
Average regular mae_test for polarity = 15.71990488339597
Average regular mse_test for polarity = 376.60904549612854
Average regular r2 for polarity = -0.03191570552598533
Average regular mae_madeup for polarity = 15.492608786809763
Average regular mae_real for polarity = 17.452835468858968
Average regular mae_talking for polarity = 14.396252329425153
Average regular mean_or_form_mae for polarity = 17.800922879831738
Average regular mean_or_form_r2 for polarity = -0.293236186228163
Average regular mean_vec_mae_test for polarity = 17.61507358173261
Average regular mean_vec_mse_test for polarity = 412.61032850202776
Average regular mean_vec_r2 for polarity = -0.11769143744667451


Average formless alpha for polarity = 0.23515

### Neural Networks

In [None]:
def fnn_maker(x_train, y_train, x_test, y_test, nodes, dropout):
  seed(17042020)
  set_seed(17042020)

  fnn_model = Sequential()

  fnn_model.add(Dense(nodes, input_dim=300, kernel_initializer=HeNormal(), activation=keras.layers.LeakyReLU()))

  fnn_model.add(Dense(1, activation='linear'))

  callback = EarlyStopping(monitor = 'loss', patience=3)

  fnn_model.compile(optimizer=Adam(), loss='mean_squared_error')

  fnn_model.fit(x_train, y_train, epochs=100, batch_size=len(x_train), callbacks=[callback], verbose=0)

  y_pred = fnn_model.predict(x_test)
  
  mse = mean_squared_error(y_test, y_pred)

  return mse

In [None]:
def nested_neural_network_finder(df, rating, dimension, dictionary):
  skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=17042020)

  for train_index, test_index in skf.split(df, df[['name_type']]):
    k_fold = KFold(n_splits = len(train_index), shuffle=True, random_state=17042020)

    x_train_unfasttexted = df.iloc[train_index]
    x_test_unfasttexted = df.iloc[test_index]

    x_train = fasttext_xifyer(x_train_unfasttexted)
    x_train_formless = fasttext_xifyer_formless(x_train_unfasttexted)

    x_test = fasttext_xifyer(x_test_unfasttexted)
    x_test_formless = fasttext_xifyer_formless(x_test_unfasttexted)


    y_train = df.iloc[train_index][rating]
    y_test = df.iloc[test_index][rating]

    for nested_train_index, nested_test_index in k_fold.split(x_train):
      nested_x_train = x_train[[nested_train_index], :].reshape(len(nested_train_index), 300)
      nested_x_train_formless = x_train_formless[[nested_train_index], :].reshape(len(nested_train_index), 300)

      nested_x_test = x_train[[nested_test_index], :].reshape(len(nested_test_index), 300)
      nested_x_test_formless = x_train_formless[[nested_test_index], :].reshape(len(nested_test_index), 300)

      nested_y_train = y_train.iloc[nested_train_index]
      nested_y_test = y_train.iloc[nested_test_index]

      for nodes in [8, 16, 32, 50, 64, 100, 128, 200, 256, 300, 512]:
        for dropout in [0.5, 0.6, 0.7, 0.8]:
          age_dict_nn['regular'][str(nodes)][str(dropout)].append(
              fnn_maker(nested_x_train, 
                        nested_y_train, 
                        nested_x_test, 
                        nested_y_test, 
                        nodes, 
                        dropout))

          age_dict_nn['formless'][str(nodes)][str(dropout)].append(
              fnn_maker(nested_x_train_formless, 
                        nested_y_train, 
                        nested_x_test_formless, 
                        nested_y_test, 
                        nodes, 
                        dropout))
  
  for nodes in [8, 16, 32, 50, 64, 100, 128, 200, 256, 300, 512]:
      for dropout in [0.5, 0.6, 0.7, 0.8]:
        regular_mse = sum(age_dict_nn['regular'][str(nodes)][str(dropout)]) / len(age_dict_nn['regular'][str(nodes)][str(dropout)])
        formless_mse = sum(age_dict_nn['formless'][str(nodes)][str(dropout)]) / len(age_dict_nn['formless'][str(nodes)][str(dropout)])

        print("{}, regular, nodes = {}, dropout = {}, Average MSE = {}".format(dimension, nodes, dropout, regular_mse))
        print("{}, formless, nodes = {}, dropout = {}, Average MSE = {}".format(dimension, nodes, dropout, formless_mse))
        print("\n")

In [None]:
age_dict_nn = {'regular' : {'8': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '16' : {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '32': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '50': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '64': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '100': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '128': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '200': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '256': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '300': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '512': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}}, 
               'formless' : {'8': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '16' : {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '32': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '50': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '64': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '100': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '128': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '200': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '256': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '300': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '512': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}}}

nested_neural_network_finder(df_age, 'rating.mean_age', 'Age', age_dict_nn)

In [None]:
gender_dict_nn = {'regular' : {'8': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '16' : {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '32': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '50': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '64': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '100': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '128': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '200': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '256': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '300': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '512': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}}, 
               'formless' : {'8': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '16' : {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '32': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '50': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '64': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '100': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '128': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '200': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '256': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '300': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '512': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}}}

nested_neural_network_finder(df_gender, 'rating.mean_gender', 'Gender', gender_dict_nn)

In [None]:
polarity_dict_nn = {'regular' : {'8': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '16' : {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '32': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '50': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '64': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '100': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '128': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '200': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '256': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '300': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '512': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}}, 
               'formless' : {'8': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '16' : {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '32': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '50': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '64': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '100': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '128': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '200': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '256': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '300': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}, 
                            '512': {'0.5': [], '0.6': [], '0.7': [], '0.8': []}}}

nested_neural_network_finder(df_polarity, 'rating.mean_valence', 'Polarity', polarity_dict_nn)

In [None]:
def fnn_maker_and_evaluator(x_train, y_train, x_test, y_test, test_full, nodes, dropout):
  seed(17042020)
  set_seed(17042020)

  fnn_model = Sequential()

  fnn_model.add(Dense(nodes, input_dim=300, kernel_initializer=HeNormal(), activation=keras.layers.LeakyReLU()))
  fnn_model.add(Dropout(dropout))
  
  fnn_model.add(Dense(1, activation='linear'))
  

  callback = EarlyStopping(monitor = 'loss', patience=3)

  fnn_model.compile(optimizer=Adam(), loss='mean_squared_error')


  fnn_model.fit(x_train, y_train, epochs=100, batch_size=len(x_train), callbacks=[callback], verbose=0)
  
  y_pred_train = fnn_model.predict(x_train)
  mse_train = mean_squared_error(y_train, y_pred_train)
  

  y_pred = fnn_model.predict(x_test)
  
  mae_test = mean_absolute_error(y_test, y_pred)
  mse_test = mean_squared_error(y_test, y_pred)
  r2 = sklearn.metrics.r2_score(y_test, y_pred)
  

  type_dict = {}
  type_counter = {}
  for n, i, j in zip(test_full['name_type'], y_test, x_test):                   # For every name type (i.e., real, talking, and madeup)
    if n in type_dict.keys():
      type_dict[n] = type_dict[n] + abs(i - fnn_model.predict(j.reshape(1, -1))) # Append the MAE for every name given that type (so that you get a sum of MAEs; one for each name)
      type_counter[n] = type_counter[n] + 1                                 # And count the number of names given that type
    else:
      type_dict[n] = abs(i - fnn_model.predict(j.reshape(1, -1)))
      type_counter[n] = 1

  for i in type_dict.keys():
    globals()[f"mae_{i}"] = float(type_dict[i])/float(type_counter[i])          # Calculate the average MAE per name type: (sum of MAEs for name type / name counter for name type)


  mean_or_form_r2_array = np.zeros((len(x_test), 300))

  mean_or_form_abs = 0
  mean_or_form_counter = 0
  for n, i, j in zip(test_full['name'], y_test, x_test):                        # For every name in the test set
    n = n.lower()                                                             # Convert name to lowercase
    if n == model.get_subwords(n)[0][0]:                                      # If the name is present in the model vocabulary
      mean_or_form_abs += abs(i - fnn_model.predict(model[n].reshape(1, -1)))    # Retrieve the MAE of the name using the model embedding, add it to the MAE summation variable
      mean_or_form_r2_array[mean_or_form_counter] = model[n]                # and add the embedding to the R2 array
      mean_or_form_counter += 1
    else:                                                                     # If it isn't present in the model vocabulary
      mean_or_form_abs += abs(i - fnn_model.predict(mean_vector.reshape(1, -1))) # Take the mean vector and retrieve the MAE, add it to the MAE summation variable
      mean_or_form_r2_array[mean_or_form_counter] = mean_vector             # and add the embedding to the R2 array
      mean_or_form_counter += 1

  mean_or_form_mae = float(mean_or_form_abs / mean_or_form_counter)             # Calculate the average MAE by taking the summation variable and dividing it by the counter variable  

  mean_or_form_r2 = sklearn.metrics.r2_score(y_test, fnn_model.predict(mean_or_form_r2_array))
 

  mean_vec_array = np.full((len(x_test), 300), mean_vector)                     # Create a mean vector array with length = test_set_length, and width = 300

  mean_vec_mae_test = sklearn.metrics.mean_absolute_error(y_test, fnn_model.predict(mean_vec_array))     # Retrieve the MAE for the mean vector array
  mean_vec_mse_test = sklearn.metrics.mean_squared_error(y_test, fnn_model.predict(mean_vec_array))      # Retrieve the MSE for the mean vector array

  mean_vec_r2 = sklearn.metrics.r2_score(y_test, fnn_model.predict(mean_vec_r2_array))                              # Retrieve the R2 for the mean_vector array

  return mse_train, mae_test, mse_test, r2, mae_madeup, mae_real, mae_talking, mean_or_form_mae, mean_or_form_r2, mean_vec_mae_test, mean_vec_mse_test, mean_vec_r2

In [None]:
def neural_network_5_folder(df, rating, dimension, dictionary, nodes, dropout):
  skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=17042020)

  for train_index, test_index in skf.split(df, df[['name_type']]):
    x_train_unfasttexted = df.iloc[train_index]
    x_test_unfasttexted = df.iloc[test_index]

    x_train = fasttext_xifyer(x_train_unfasttexted)
    x_train_formless = fasttext_xifyer_formless(x_train_unfasttexted)

    x_test = fasttext_xifyer(x_test_unfasttexted)
    x_test_formless = fasttext_xifyer_formless(x_test_unfasttexted)


    y_train = df.iloc[train_index][rating]
    y_test = df.iloc[test_index][rating]

    mse_train, mae_test, mse_test, r2, mae_madeup, mae_real, mae_talking, \
    mean_or_form_mae, mean_or_form_r2, mean_vec_mae_test, mean_vec_mse_test, \
    mean_vec_r2 = fnn_maker_and_evaluator(x_train, 
                                          y_train, 
                                          x_test, 
                                          y_test, 
                                          df, 
                                          nodes,
                                          dropout):

    mse_train_formless, mae_test_formless, mse_test_formless, r2_formless, \ 
    mae_madeup_formless, mae_real_formless, mae_talking_formless, \
    mean_or_form_mae_formless, mean_or_form_r2_formless, mean_vec_mae_test_formless, \
    mean_vec_mse_test, mean_vec_r2 = fnn_maker_and_evaluator(x_train_formless, 
                                          y_train, 
                                          x_test_formless, 
                                          y_test, 
                                          df, 
                                          nodes,
                                          dropout):

    dictionary['regular'].append([mse_train, mae_test, mse_test, r2, mae_madeup, 
                                  mae_real, mae_talking, mean_or_form_mae, 
                                  mean_or_form_r2, mean_vec_mae_test, 
                                  mean_vec_mse_test, mean_vec_r2]) 

    dictionary['formless'].append([mse_train_formless, mae_test_formless, 
                                   mse_test_formless, r2_formless, 
                                   mae_madeup_formless, mae_real_formless, 
                                   mae_talking_formless, mean_or_form_mae_formless, 
                                   mean_or_form_r2_formless, mean_vec_mae_test_formless, 
                                   mean_vec_mse_test_formless, mean_vec_r2_formless])    
  
  variable_list = ('mse_train', 'mae_test', 'mse_test', 'r2', 'mae_madeup',     # List indicating all of the variables of interest 
                   'mae_real', 'mae_talking', 'mean_or_form_mae', 
                   'mean_or_form_r2', 'mean_vec_mae_test', 'mean_vec_mse_test', 
                   'mean_vec_r2')
  
  type_list = ['regular', 'formless']                                           # List indicating the model type

  regular_list = dictionary['regular']
  formless_list = dictionary['formless']
  
  regular_list = [sum(x) for x in zip(*regular_list)]                           # For all of the variables in the list, sum the scores (regular)
  regular_list = [x / 5 for x in regular_list]                                  # For all of the variables in the list, divide the sum by 5 to get the average score (regular)

  formless_list = [sum(x) for x in zip(*formless_list)]                         # For all of the variables in the list, sum the scores (formless)
  formless_list = [x / 5 for x in formless_list]                                # For all of the variables in the list, divide the sum by 5 to get the average score (formless)


  for value_list, analysis_type in zip([regular_list, formless_list], type_list):
    for value, variable in zip(value_list, variable_list):
      print(f"Average {analysis_type} {variable} for {dimension} = {value}")    # Per variable, and per analysis type, print the corresponding average value
    print("\n")

In [None]:
age_dict_nn_final = {'regular' : [], 'formless' : []} 

neural_network_5_folder(df_age, 'rating.mean_age', 'age', 
                        age_dict_nn_final, nodes, dropout)

pickle_age_nn_final = open(pickle_path + "age_nn_final.pickle", "wb")           # Save the dictionary to a pickle
pickle.dump(age_dict_nn_final, pickle_age_nn_final)
pickle_age_nn_final.close()

In [None]:
gender_dict_nn_final = {'regular' : [], 'formless' : []} 

neural_network_5_folder(df_gender, 'rating.mean_gender', 'gender', 
                        gender_dict_nn_final, nodes, dropout)

pickle_gender_nn_final = open(pickle_path + "gender_nn_final.pickle", "wb")     # Save the dictionary to a pickle
pickle.dump(gender_dict_nn_final, pickle_gender_nn_final)
pickle_gender_nn_final.close()

In [None]:
polarity_dict_nn_final = {'regular' : [], 'formless' : []} 

neural_network_5_folder(df_polarity, 'rating.mean_valence', 'polarity', 
                        polarity_dict_nn_final, nodes, dropout)

pickle_polarity_nn_final = open(pickle_path + "polarity_nn_final.pickle", "wb") # Save the dictionary to a pickle
pickle.dump(polarity_dict_nn_final, pickle_polarity_nn_final)
pickle_polarity_nn_final.close()

## Plots Playing Ground

Ideas (I'm not listing possible tables for the results, cause those seem like no-brainers to me):

- Dimension Y-axis vs Dimension X-axis, real vs predicted, ANN vs ElasticNet
- Histogram of predicted 'dimensionness' per dimension & regression type
- Histogram of MAE per name type, per dimension & regression type
- Real vs predicted regression per dimension & name type (one of these can be done with different colours/lines, while the other can be done with left/right)
- Distribution of dimensionness for the predicted (per regression type) and the real --> see if they follow similar distributions!
- Schematic depiction of the experimental design (as done by Giovanni in one of his papers)

- **ONLY WHEN ENOUGH TIME**: Permutation testing to find the most influential n-grams per dimension & regression type

Problems:
- 170 names, how do I choose 'exemplary ones'? 
- How to combine dimenions / regression types / real vs predicted, while keeping things interpretable 
  - Scola: old/male = left, young/female = right, dimensions = colour
- 
