# Master Thesis on the Semantics of (made-up) Names

* Author: Aron Joosse
* Supervisor: Giovanni Cassani
* Institution: Tilburg University

Can take inspiration from: https://github.com/Masetto96/BA-Thesis-form-meaning-mapping/blob/master/form_meaning_mapping.ipynb

# Library Imports

In [1]:
!pip install fasttext --progress-bar off
!pip install -U spacy --progress-bar off
!python -m spacy download en_core_web_sm
import fasttext
import spacy
import numpy as np
import pandas as pd
import re
import pickle
from google.colab import drive

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 549 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Data Import

In [2]:
## Getting the list of madeup names:
# Mount Google Drive so the annotated ratings file can be read from this runtime.
drive.mount("/content/drive", force_remount=True)

# Only the name and its category are needed from the ratings file.
ratings_csv = pd.read_csv(
    "drive/MyDrive/Thesis/Data/giovanni_email_data/avgRatings_annotated.csv",
    usecols=["name", "name_type"],
)

# Select the rows labelled "madeup"; the same mask with another label would
# select the talking or real names instead.
is_madeup = ratings_csv["name_type"] == "madeup"
madeup_names = [str(name) for name in ratings_csv.loc[is_madeup, "name"]]
madeup_names_lower = [name.lower() for name in madeup_names]

# Show the first five entries and the size of each list as a sanity check.
for names in (madeup_names, madeup_names_lower):
  print(names[:5])
  print(len(names))

Mounted at /content/drive
['Alastor', 'Alecto', 'Amabala', 'Araminta', 'Arcturus']
60
['alastor', 'alecto', 'amabala', 'araminta', 'arcturus']
60


## COCA

In [3]:
# Location of the COCA text data, relative to the working directory of the runtime.
path = "drive/My Drive/Thesis/Data/CoCA/Text/"
unclean_path = path + "texts_combined/all_texts_combined.txt"

# Read the whole combined corpus into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for the
# rest of the session.
with open(unclean_path) as corpus_file:
  unclean_corpus = corpus_file.read()


In [4]:
# Sanity check: total number of characters, then the first 100 characters.
print(len(unclean_corpus), unclean_corpus[:100], sep="\n")

2977527143
@@4170367 Headnote # A puzzle has long pervaded the criminal law : why are two offenders who commit 


## Names

# Preprocessing


## Cleaning Corpus

In [5]:
## Loading the English spacy pipeline and removing stopwords

nlp = spacy.load("en_core_web_sm")
# Raise the maximum number of characters spaCy accepts in a single text so
# that large corpus slices are not rejected.
nlp.max_length = 10000000000

# Pronouns that must NOT be treated as stop words, so that the cleaning
# functions (which skip every stop word) keep them in the corpus.
KEPT_PRONOUNS = ("him", "her", "hers", "his", "he", "she", "himself", "herself")

for pronoun in KEPT_PRONOUNS:
  # discard() rather than remove(): the stop-word set belongs to the language
  # defaults and survives a new spacy.load(), so re-running this cell with
  # remove() raises KeyError on the already-removed word.
  nlp.Defaults.stop_words.discard(pronoun)
  # Also clear the flag on the vocabulary entry itself, in case the lexeme
  # was created (and its is_stop value cached) before the set was edited.
  nlp.vocab[pronoun].is_stop = False

In [6]:
def clean_corpus_sentenced(data, corpus_dict, index):
  """Clean `data` sentence by sentence and add the sentences to `corpus_dict`.

  `data` is run through the global spaCy pipeline `nlp` with the "senter"
  component enabled and the lemmatizer, tok2vec, tagger and parser disabled.
  A token is dropped when it is all upper case, a stop word, not purely
  alphabetic, or (lower-cased) one of the names in the global
  `madeup_names_lower`. The remaining tokens are lower-cased and joined with
  single spaces, one string per sentence. Sentences that end up without any
  token are not stored.

  Args:
    data: Raw text to clean.
    corpus_dict: Dict mapping an integer sentence index to the cleaned
      sentence; it is updated in place and also returned.
    index: Key to use for the first sentence stored by this call.

  Returns:
    A tuple `(corpus_dict, index)` where `index` is the next unused key.
  """
  # Tokenization
  with nlp.select_pipes(disable=["lemmatizer", "tok2vec", "tagger", "parser"]):
    nlp.enable_pipe("senter")
    doc = nlp(data)

  # Set membership is checked once per token, so avoid scanning a list each time.
  excluded_names = set(madeup_names_lower)
  words = []  # cleaned tokens of the sentence currently being assembled

  for token in doc:
    # A new sentence begins: store the one collected so far (if it has any
    # words), then carry on and process this sentence-initial token as well.
    if token.is_sent_start is True and words:
      corpus_dict[index] = " ".join(words)
      index += 1
      words = []

    if token.is_upper or token.is_stop or not token.is_alpha:
      continue

    word = str(token).lower()
    if word in excluded_names:
      continue

    words.append(word)

  # The last sentence has no following sentence start to trigger its storage.
  if words:
    corpus_dict[index] = " ".join(words)
    index += 1

  return corpus_dict, index

In [9]:
def corpus_dict_maker(data, start, end, index):
  """Clean one batch of the corpus and pickle the resulting sentence dict.

  Google Drive is mounted, `data` is cut into chunks of two million
  characters, each chunk is passed to `clean_corpus_sentenced`, and the
  collected sentences are written to
  `path + "corpus_dict_until_<end>.pickle"`. Drive is flushed and unmounted
  afterwards.

  Chunk boundaries lie at `start`, `start + 2`, ... (all below `end`) million
  characters. The first chunk begins two million characters before `start`,
  so consecutive batches called as `(a, b)` and `(b, c)` join without a gap.

  Args:
    data: The full raw corpus text.
    start: First chunk boundary, in millions of characters.
    end: Exclusive upper limit for chunk boundaries, in millions of characters.
    index: Key for the first sentence stored by this batch.

  Returns:
    None. The result is written to disk; the final index is printed.
  """
  unit = 1000000  # `start` and `end` are expressed in millions of characters
  step = 2        # every chunk spans two units

  drive.mount("/content/drive", force_remount=True)
  corpus_dict = {}

  # Clamp at 0: for `start` below 2 the offset would be negative, which a
  # slice interprets as counting from the end of `data`.
  prev_i = max((start - step) * unit, 0)

  for i in range(start, end, step): #  secondly until 1000
    print(i)
    boundary = i * unit
    corpus_dict, index = clean_corpus_sentenced(data[prev_i:boundary],
                                                corpus_dict,
                                                index)
    prev_i = boundary

  # When less than one full chunk of text remains after the last boundary,
  # this is the final batch: process the remainder too. For the corpus loaded
  # in this notebook (2977527143 characters) this holds exactly when
  # prev_i == 2976000000.
  if prev_i < len(data) < prev_i + step * unit:
    corpus_dict, index = clean_corpus_sentenced(data[prev_i:],
                                                corpus_dict,
                                                index)

  print(index)

  # The context manager closes the file even if pickling fails.
  with open(path + "corpus_dict_until_" + str(end) + ".pickle", "wb") as pickle_out:
    pickle.dump(corpus_dict, pickle_out)

  drive.flush_and_unmount()
  print('All changes made in this colab session should now be visible in Drive.')

## The corpus is processed in batches (1) to keep every run short enough for
## the Google afk-checker captcha pop-up, and (2) to avoid blowing out the RAM
## and crashing the runtime.

## Batches that have already been run:
#corpus_dict_maker(unclean_corpus, 2, 500, 0)                   # first until 500
#corpus_dict_maker(unclean_corpus, 500, 640, 3217085)           # then until 640; note: the index should have been 1 higher (it is index + 1), which was compensated by adding 2 (+1 and +1) for the next batch
#corpus_dict_maker(unclean_corpus, 640, 760, 4232218)           # then until 760
#corpus_dict_maker(unclean_corpus, 760, 800, 5439286)           # then until 800; note: the index should again have been 1 higher, so the next batch gets +2 again

## Current batch:
corpus_dict_maker(data=unclean_corpus, start=800, end=900, index=5888161)

## Batches still to be run (index to be filled in):
#corpus_dict_maker(unclean_corpus, 900, 1000, index = )
#corpus_dict_maker(unclean_corpus, 1000, 1100, index = )
#corpus_dict_maker(unclean_corpus, 1100, 1200, index = )

Mounted at /content/drive
800


KeyboardInterrupt: ignored

In [None]:
# Final batch: clean everything from the 2500-million-character boundary up
# to the end of the corpus and collect the sentences in `corpus_dict`.
drive.mount("/content/drive", force_remount=True)
corpus_dict = {}

# Define the starting offset here instead of relying on a `prev_i` left over
# from an earlier run. It follows the convention of corpus_dict_maker: the
# first chunk begins two million characters before the first boundary.
FINAL_BATCH_START = 2500
prev_i = (FINAL_BATCH_START - 2) * 1000000

# `index` has to continue the numbering of the previous batch and cannot be
# derived in this cell, so fail with an explicit message when it is missing.
if "index" not in globals():
  raise NameError("`index` is not defined: set it to the first unused "
                  "sentence index of the previous batch before running "
                  "this cell")

for i in range(FINAL_BATCH_START, 2976, 2): # lastly until end
  print(i)
  i *= 1000000
  corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
                                              corpus_dict,
                                              index)
  prev_i = i

# Remainder of the corpus after the last boundary.
corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:],
                                            corpus_dict,
                                            index)

print("----------------------")
print(len(corpus_dict), index)

In [None]:
def clean_corpus_unsentenced(data):
  """Clean `data` as one continuous stream of tokens, ignoring sentences.

  `data` is run through the global spaCy pipeline `nlp` with the "senter"
  component enabled and the lemmatizer, tok2vec, tagger and parser disabled.
  A token is dropped when it is all upper case, a stop word, not purely
  alphabetic, or (lower-cased) one of the names in the global
  `madeup_names_lower`. The remaining tokens are lower-cased and joined with
  single spaces.

  The first 150 tokens of the parsed document and the first 500 characters
  of the cleaned text are printed as a visual check.

  Args:
    data: Raw text to clean.

  Returns:
    The cleaned text as a single space-separated string.
  """
  # Tokenization
  with nlp.select_pipes(disable=["lemmatizer", "tok2vec", "tagger", "parser"]):
    nlp.enable_pipe("senter")
    doc = nlp(data)
  print(doc[:150])

  # Set membership is checked once per token, so avoid scanning a list each time.
  excluded_names = set(madeup_names_lower)
  kept_words = []

  for token in doc:
    if token.is_upper or token.is_stop or not token.is_alpha:
      continue
    word = str(token).lower()
    if word in excluded_names:
      continue
    kept_words.append(word)

  doc_filtered = " ".join(kept_words)

  print(doc_filtered[:500])

  # TODO: Remove words with freq < XX

  # Hand the cleaned text back to the caller instead of discarding it.
  return doc_filtered

# Bind the result to a name so it can be used further on and is not echoed in
# full as the cell output. Pass unclean_corpus[:1000000] for a quick trial run.
corpus_unsentenced_clean = clean_corpus_unsentenced(unclean_corpus)

## Training fastText and Validating on Word Embeddings Benchmark

In [None]:
# Skipgram model :
#model = fasttext.train_unsupervised('data.txt', model='skipgram')

#model.save_model("model_filename.bin")

#model = fasttext.load_model("model_filename.bin")

#model.get_nearest_neighbors('asparagus')

#In a similar spirit, one can play around with word analogies. For example, we can see if our model can guess what is to France what Berlin is to Germany.
#This can be done with the analogies functionality. It takes a word triplet (like Germany Berlin France) and outputs the analogy:
#model.get_analogies("berlin", "germany", "france")

In [None]:
# Flush and unmount the Drive mount at the end of the session, then confirm
# to the reader that the changes should be visible in Drive.
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.
