# Master Thesis on the Semantics of (made-up) Names

* Author: Aron Joosse
* Supervisor: Giovanni Cassani
* Institution: Tilburg University

Can take inspiration from: https://github.com/Masetto96/BA-Thesis-form-meaning-mapping/blob/master/form_meaning_mapping.ipynb

# Library Imports

In [2]:
!pip install fasttext --progress-bar off
import fasttext
import spacy
import numpy as np
import pandas as pd

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.1-py2.py3-none-any.whl (211 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3130761 sha256=6d5161767b45fdf2a2878656227e8940ff4568c6275bf72861385dc786ac5f7e
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.9.1


# Data Import

In [6]:
## Being able to access Google Drive
# Mounts Drive at /content/drive. force_remount=True is passed on every run
# (presumably so re-running this cell refreshes an existing mount -- confirm).
from google.colab import drive
drive.mount("/content/drive", force_remount=True) 

Mounted at /content/drive


In [None]:
# Flush and unmount the Drive mount; per the message printed below, changes
# made in this session should be visible in Drive afterwards.
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.


In [21]:
## Getting the list of madeup names:

ratings_csv = pd.read_csv("drive/MyDrive/Thesis/Data/giovanni_email_data/avgRatings_annotated.csv",
                          usecols = ["name", "name_type"])

# Select the rows annotated as "madeup" with a boolean mask instead of looping
# over the index; row order is preserved, and str() keeps the original
# conversion of every name to a plain string.
is_madeup = ratings_csv["name_type"] == "madeup"
madeup_names = [str(name) for name in ratings_csv.loc[is_madeup, "name"]]

# Lowercased copies of the names, in the same order as madeup_names.
madeup_names_lower = [name.lower() for name in madeup_names]

print(madeup_names)
print(len(madeup_names))
print(madeup_names_lower)
print(len(madeup_names_lower))

['Alastor', 'Alecto', 'Amabala', 'Araminta', 'Arcturus', 'Argus', 'Arobynn', 'Asim', 'Brum', 'Cardan', 'Chaol', 'Chi', 'Cooki', 'Dalip', 'Dumpa', 'Elide', 'Farder', 'Fenrir', 'Fortuna', 'Gmork', 'Goha', 'Griphook', 'Grunter', 'Hannerl', 'Hepzibah', 'Inej', 'Ione', 'Iorek', 'Jungli', 'Kaisa', 'Kaz', 'Kreacher', 'Lak', 'Levana', 'Lorcan', 'Luft', 'Minna', 'Miskouri', 'Mogget', 'Morgra', 'Muffet', 'Mundungus', 'Neoma', 'Nergui', 'Nitasha', 'Penthe', 'Robbo', 'Saiorse', 'Schaffa', 'Serafina', 'Skellig', 'Spink', 'Steg', 'Talentino', 'Tasha', 'Tenar', 'Titania', 'Yozadah', 'Zahara', 'Zubaida']
60
['alastor', 'alecto', 'amabala', 'araminta', 'arcturus', 'argus', 'arobynn', 'asim', 'brum', 'cardan', 'chaol', 'chi', 'cooki', 'dalip', 'dumpa', 'elide', 'farder', 'fenrir', 'fortuna', 'gmork', 'goha', 'griphook', 'grunter', 'hannerl', 'hepzibah', 'inej', 'ione', 'iorek', 'jungli', 'kaisa', 'kaz', 'kreacher', 'lak', 'levana', 'lorcan', 'luft', 'minna', 'miskouri', 'mogget', 'morgra', 'muffet', 'mu

## COCA

In [13]:
path = "drive/My Drive/Thesis/Data/CoCA/Text/"
unclean_path = path + "texts_combined/all_texts_combined.txt"
# Read the whole combined corpus into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open(unclean_path) as corpus_file:
    unclean_corpus = corpus_file.read()


In [14]:
# Sanity check: size of the loaded corpus in characters, then its first 100 characters.
print(len(unclean_corpus))
print(unclean_corpus[:100])

2977527143
@@4170367 Headnote # A puzzle has long pervaded the criminal law : why are two offenders who commit 


## Names

# Preprocessing


## Cleaning Corpus

In [16]:
## Loading the English spacy pipeline and removing stopwords

nlp = spacy.load("en_core_web_sm")
# Raise the pipeline's maximum accepted text length (in characters) so that
# large corpus slices can be passed to nlp() in one call.
nlp.max_length = 10000000000

# Gendered pronouns must survive stop-word filtering, so take them out of the
# stop list. difference_update mutates the existing set in place and, unlike
# set.remove, does not raise KeyError when this cell is run a second time and
# the words are already gone.
nlp.Defaults.stop_words.difference_update(
    {"him", "her", "hers", "his", "he", "she", "himself", "herself"}
)

In [20]:
def clean_corpus_unsentenced(data):
    """Tokenize *data* and return it as one lowercased, filtered string.

    A token is kept only when it is purely alphabetic, is not written in
    full caps, is not a stop word of the module-level ``nlp`` pipeline, and
    its lowercase form is not listed in the module-level
    ``madeup_names_lower``. Sentence boundaries are not preserved.

    As a sanity check, the first 50 tokens of the parsed text and the first
    200 characters of the cleaned text are printed.

    Args:
        data: Raw corpus text.

    Returns:
        The kept tokens, lowercased and joined by single spaces.
    """
    # Tokenization
    doc = nlp(data)
    print(doc[:50])

    # Build the lookup once: set membership is O(1) per token, whereas the
    # list would be scanned in full for every token of the corpus.
    excluded_names = set(madeup_names_lower)

    kept_tokens = []
    for token in doc:
        lowered = token.text.lower()
        # Drop full-caps words, stop words and the made-up names themselves.
        if token.is_upper or token.is_stop or lowered in excluded_names:
            continue
        # Keep letters-only tokens; this discards punctuation and numbers.
        if token.is_alpha:
            kept_tokens.append(lowered)

    doc_filtered = " ".join(kept_tokens)

    print(doc_filtered[:200])

    # TODO: Remove words with freq < XX
    return doc_filtered


# Assigned rather than left as a bare expression so the notebook does not echo
# the full cleaned text as cell output.
cleaned_sample = clean_corpus_unsentenced(unclean_corpus[:1000000])

@@4170367 Headnote # A puzzle has long pervaded the criminal law : why are two offenders who commit the same criminal act punished differently when one of them , due to circumstances beyond her control , causes more harm than the other ? This tradition of result-based differential
headnote puzzle long pervaded criminal law offenders commit criminal act punished differently circumstances her control causes harm tradition result based differential punishment practice varying offe


In [None]:
def clean_corpus_sentenced(data):
    """Parse *data* with the module-level ``nlp`` pipeline and split it into sentences.

    As a sanity check, the first 50 tokens and the first three sentences are
    printed. No token-level cleaning is applied yet (see the TODO below).

    Args:
        data: Raw corpus text.

    Returns:
        A list with the sentence spans of the parsed document.
    """
    # Tokenization
    doc = nlp(data)
    print(doc[:50])

    # Split into sentences
    sents = list(doc.sents)
    print(sents[:3])

    # TODO: per-sentence cleaning is not implemented yet:
    #   - remove all words that are full-caps
    #   - lowercase
    #   - remove non-letters (punctuation and numbers)
    #   - remove stopwords, but DO NOT remove pronouns (him, her, etc.)
    #   - remove made-up names from dataset
    #   - remove words with freq < XX
    return sents


# Assigned rather than left as a bare expression so the notebook does not echo
# the whole sentence list as cell output.
sentence_sample = clean_corpus_sentenced(unclean_corpus[:10000])

3000000000


In [None]:
# Flush and unmount the Drive mount; per the message printed below, changes
# made in this session should be visible in Drive afterwards.
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.


## Training fastText and Validating on Word Embeddings Benchmark

In [None]:
# Skipgram model :
#model = fasttext.train_unsupervised('data.txt', model='skipgram')

#model.save_model("model_filename.bin")

#model = fasttext.load_model("model_filename.bin")

#model.get_nearest_neighbors('asparagus')

#In a similar spirit, one can play around with word analogies. For example, we can see if our model can guess what is to France what Berlin is to Germany.
#This can be done with the analogies functionality. It takes a word triplet (like Germany Berlin France) and outputs the analogy:
#model.get_analogies("berlin", "germany", "france")