# Master Thesis on the Semantics of (made-up) Names

* Author: Aron Joosse
* Supervisor: Giovanni Cassani
* Institution: Tilburg University

Can take inspiration from: https://github.com/Masetto96/BA-Thesis-form-meaning-mapping/blob/master/form_meaning_mapping.ipynb

# Library Imports

In [32]:
!pip install fasttext --progress-bar off
!pip install -U spacy --progress-bar off
import fasttext
import spacy
import numpy as np
import pandas as pd

Collecting spacy
  Downloading spacy-3.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 25.2 MB/s 
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.4 MB/s 
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
[K     |████████████████████████████████| 451 kB 47.7 MB/s 
[?25hCollecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.0-py3-none-any.whl (27 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.1-py3-none-any.whl (7.0 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 47.8 MB/s 
[?25hCollecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.13-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (628 kB)
[K   

# Data Import

In [6]:
## Mount Google Drive so the cells below can read the thesis data from it
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
# Flush and unmount Google Drive, then confirm on stdout.
# NOTE(review): cells that read from "drive/..." presumably need Drive to be
# mounted again after this cell has run.
unmount_message = 'All changes made in this colab session should now be visible in Drive.'
drive.flush_and_unmount()
print(unmount_message)

All changes made in this colab session should now be visible in Drive.


In [23]:
## Getting the list of madeup names:

# Only the two columns needed to pick out the made-up names are loaded.
ratings_csv = pd.read_csv(
    "drive/MyDrive/Thesis/Data/giovanni_email_data/avgRatings_annotated.csv",
    usecols=["name", "name_type"],
)

# Select the rows labelled "madeup" with a single boolean mask rather than a
# per-row Python loop; astype(str) guarantees plain strings even if a name
# was parsed as a non-string value.
is_madeup = ratings_csv["name_type"] == "madeup"
madeup_names = ratings_csv.loc[is_madeup, "name"].astype(str).tolist()

# Lower-cased copy, used to match names against lower-cased corpus tokens.
madeup_names_lower = [name.lower() for name in madeup_names]

print(madeup_names[:5])
print(len(madeup_names))
print(madeup_names_lower[:5])
print(len(madeup_names_lower))

['Alastor', 'Alecto', 'Amabala', 'Araminta', 'Arcturus']
60
['alastor', 'alecto', 'amabala', 'araminta', 'arcturus']
60


## COCA

In [13]:
path = "drive/My Drive/Thesis/Data/CoCA/Text/"
unclean_path = path + "texts_combined/all_texts_combined.txt"

# Read the whole combined corpus into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for the
# rest of the session.
with open(unclean_path) as corpus_file:
    unclean_corpus = corpus_file.read()


In [14]:
# Sanity check: total corpus length in characters, then its first 100 characters.
corpus_size = len(unclean_corpus)
corpus_preview = unclean_corpus[:100]
print(corpus_size)
print(corpus_preview)

2977527143
@@4170367 Headnote # A puzzle has long pervaded the criminal law : why are two offenders who commit 


## Names

# Preprocessing


## Cleaning Corpus

In [16]:
## Loading the English spacy pipeline and removing stopwords

nlp = spacy.load("en_core_web_sm")
# Raise the maximum text length (in characters) the pipeline will accept.
nlp.max_length = 10000000000

# Third-person singular pronouns are taken off the stop-word list so that the
# cleaning functions below keep them in the corpus.
PRONOUNS_TO_KEEP = (
    'him',
    'her',
    'hers',
    'his',
    'he',
    'she',
    'himself',
    'herself',
)

for pronoun in PRONOUNS_TO_KEEP:
    # discard() instead of remove(): re-running this cell must not raise
    # KeyError once the words are already gone from the set.
    nlp.Defaults.stop_words.discard(pronoun)
    # Also clear the flag on the vocabulary entry, in case the lexeme was
    # already created with the original stop list (NOTE(review): confirm
    # against the installed spaCy version).
    nlp.vocab[pronoun].is_stop = False

In [22]:
def clean_corpus_unsentenced(data):
    """Tokenise *data* with the spaCy pipeline and return the cleaned text.

    A token is kept (lower-cased) only when it is purely alphabetic, is not
    written entirely in upper case, is not a stop word and is not one of the
    made-up names in ``madeup_names_lower``. The first 150 tokens of the
    parsed document and the first 500 characters of the result are printed
    as a preview.

    Args:
        data: Raw text to clean.

    Returns:
        The kept tokens joined by single spaces, as one string.
    """
    # Tokenization
    doc = nlp(data)
    print(doc[:150])

    # Build the lookup once: set membership instead of scanning the list of
    # names for every token.
    excluded_names = set(madeup_names_lower)

    kept_tokens = []
    for token in doc:
        if token.is_upper or token.is_stop or not token.is_alpha:
            continue
        lowered = token.text.lower()
        if lowered in excluded_names:
            continue
        kept_tokens.append(lowered)

    doc_filtered = " ".join(kept_tokens)

    print(doc_filtered[:500])

    # TODO: Remove words with freq < XX

    return doc_filtered


# Bind the result so the notebook does not echo the whole cleaned text.
cleaned_unsentenced_sample = clean_corpus_unsentenced(unclean_corpus[:1000000])

@@4170367 Headnote # A puzzle has long pervaded the criminal law : why are two offenders who commit the same criminal act punished differently when one of them , due to circumstances beyond her control , causes more harm than the other ? This tradition of result-based differential punishment-the practice of varying offenders ' punishment based on whether or not they cause specific " statutory harms " -has long stood as an intractable problem for scholars and jurists alike . # This Article proposes a solution to this long-standing conceptual problem . We begin by introducing a dichotomy between two broad and exhaustive categories of ideological justifications for punishing criminal offenders . The first category , offender-facing justifications , includes many of the most familiar theories of punishment : deterrence , retribution , incapacitation , and rehabilitation . These offender-facing
headnote puzzle long pervaded criminal law offenders commit criminal act punished differently cir

In [38]:
def clean_corpus_sentenced(data):
    """Tokenise *data* and return one cleaned string per sentence.

    Sentence boundaries come from the spaCy pipeline. Within each sentence a
    token is kept (lower-cased) only when it is purely alphabetic, is not
    written entirely in upper case, is not a stop word and is not one of the
    made-up names in ``madeup_names_lower``. Sentences with no kept tokens
    are omitted. The first 50 tokens, the first 15 raw sentences and the
    first 15 cleaned sentences are printed as a preview.

    Args:
        data: Raw text to clean.

    Returns:
        A list of strings, one per non-empty cleaned sentence, each made of
        the kept tokens joined by single spaces.
    """
    # Tokenization
    doc = nlp(data)
    print(doc[:50])

    sentences = list(doc.sents)
    print(sentences[:15])

    # Build the lookup once: set membership instead of scanning the list of
    # names for every token.
    excluded_names = set(madeup_names_lower)

    doc_filtered = []
    # Iterating the sentence spans directly (rather than flushing a buffer
    # whenever the next sentence starts) makes sure the final sentence of the
    # text is included as well.
    for sent in sentences:
        kept_tokens = []
        for token in sent:
            if token.is_upper or token.is_stop or not token.is_alpha:
                continue
            lowered = token.text.lower()
            if lowered in excluded_names:
                continue
            kept_tokens.append(lowered)

        # Skip sentences in which every token was filtered out.
        if kept_tokens:
            doc_filtered.append(" ".join(kept_tokens))

    print(doc_filtered[:15])

    # TODO: Remove words with freq < XX

    return doc_filtered


# Bind the result so the notebook does not echo the whole list of sentences.
cleaned_sentenced_sample = clean_corpus_sentenced(unclean_corpus[:1000000])

@@4170367 Headnote # A puzzle has long pervaded the criminal law : why are two offenders who commit the same criminal act punished differently when one of them , due to circumstances beyond her control , causes more harm than the other ? This tradition of result-based differential
[@@4170367, Headnote # A puzzle has long pervaded the criminal law : why are two offenders who commit the same criminal act punished differently when one of them , due to circumstances beyond her control , causes more harm than the other ?, This tradition of result-based differential punishment-the practice of varying offenders ' punishment based on whether or not they cause specific " statutory harms " -has long stood as an intractable problem for scholars and jurists alike ., # This Article proposes a solution to this long-standing conceptual problem ., We begin by introducing a dichotomy between two broad and exhaustive categories of ideological justifications for punishing criminal offenders ., The first 

In [None]:
# Flush and unmount Google Drive, then confirm on stdout.
unmount_message = 'All changes made in this colab session should now be visible in Drive.'
drive.flush_and_unmount()
print(unmount_message)

All changes made in this colab session should now be visible in Drive.


## Training fastText and Validating on Word Embeddings Benchmark

In [None]:
# Skipgram model :
#model = fasttext.train_unsupervised('data.txt', model='skipgram')

#model.save_model("model_filename.bin")

#model = fasttext.load_model("model_filename.bin")

#model.get_nearest_neighbors('asparagus')

#In a similar spirit, one can play around with word analogies. For example, we can check whether our model can guess what is to France what Berlin is to Germany (i.e. Paris).
#This can be done with the analogies functionality. It takes a word triplet (like Germany Berlin France) and outputs the analogy:
#model.get_analogies("berlin", "germany", "france")