# Master Thesis on the Semantics of (made-up) Names

* Author: Aron Joosse
* Supervisor: Giovanni Cassani
* Institution: Tilburg University

Can take inspiration from: https://github.com/Masetto96/BA-Thesis-form-meaning-mapping/blob/master/form_meaning_mapping.ipynb

# Library Imports

In [1]:
!pip install fasttext --progress-bar off
!pip install -U spacy --progress-bar off
!python -m spacy download en_core_web_sm
import fasttext
import spacy
import numpy as np
import pandas as pd
import re
import pickle

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.1-py2.py3-none-any.whl (211 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3137895 sha256=0521e103abee33e15dd2e777f2834d5212584ad5099acd4aefe6b19c9d5db88c
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.9.1
Collecting spacy
  Downloading spacy-3.2.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[?25l
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.0-py3-none-any.whl (27 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
[?25l
[?25hCollecting catalogue<2.1.0,>=2.0.6


# Data Import

In [2]:
## Being able to access Google Drive
from google.colab import drive
drive.mount("/content/drive", force_remount=True) 

Mounted at /content/drive


In [3]:
## Getting the list of madeup names:

ratings_csv = pd.read_csv("drive/MyDrive/Thesis/Data/giovanni_email_data/avgRatings_annotated.csv",
                          usecols = ["name", "name_type"])

ratings_csv.head(10)

madeup_names = []

for i in ratings_csv.index:                                           ## I can do exactly the same thing for talking & real
  if ratings_csv["name_type"][i] == "madeup":
    madeup_names.append(str(ratings_csv["name"][i]))

madeup_names_lower = list(map(lambda x: x.lower(), madeup_names))

print(madeup_names[:5])
print(len(madeup_names))
print(madeup_names_lower[:5])
print(len(madeup_names_lower))

['Alastor', 'Alecto', 'Amabala', 'Araminta', 'Arcturus']
60
['alastor', 'alecto', 'amabala', 'araminta', 'arcturus']
60


## COCA

In [4]:
path = "drive/My Drive/Thesis/Data/CoCA/Text/"
unclean_path = path + "texts_combined/all_texts_combined.txt"
unclean_corpus = open(unclean_path).read()


In [5]:
print(len(unclean_corpus))
print(unclean_corpus[:100])

2977527143
@@4170367 Headnote # A puzzle has long pervaded the criminal law : why are two offenders who commit 


## Names

# Preprocessing


## Cleaning Corpus

In [6]:
## Loading the English spacy pipeline and removing stopwords

nlp = spacy.load("en_core_web_sm")
nlp.max_length = 10000000000

nlp.Defaults.stop_words.remove('him')
nlp.Defaults.stop_words.remove('her')
nlp.Defaults.stop_words.remove('hers')
nlp.Defaults.stop_words.remove('his')
nlp.Defaults.stop_words.remove('he')
nlp.Defaults.stop_words.remove('she')
nlp.Defaults.stop_words.remove('himself')
nlp.Defaults.stop_words.remove('herself')

In [7]:
def clean_corpus_sentenced(data, corpus_dict, index):
  # Tokenization
  with nlp.select_pipes(disable=["lemmatizer", "tok2vec", "tagger", "parser"]):
    nlp.enable_pipe("senter")
    doc = nlp(data)

  sentence = ""

  for token in doc:
    if token.is_sent_start is True:
      if sentence == "":
        continue
      else:
        corpus_dict[index] = sentence
        sentence = ""
        index += 1
    
    if token.is_upper is True:
      continue
    elif token.is_stop is True:
      continue
    elif str(token).lower() in madeup_names_lower:
      continue
    elif token.is_alpha:
      sentence += str(token).lower()

  return corpus_dict, index

In [8]:
corpus_dict = {}
index = 0

prev_i = 0

for i in range(2, 500, 2): # first until 500
  print(i)
  i *= 1000000
  corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
                                              corpus_dict,
                                              index)
  prev_i = i

#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:],
#                                            corpus_dict,
#                                            index)

print("----------------------")
print(len(corpus_dict), index)

2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
66
68
70
72
74
76
78
80
82
84
86
88
90
92
94
96
98
100
102
104
106
108
110
112
114
116
118
120
122
124
126
128
130
132
134
136
138
140
142
144
146
148
150
152
154
156
158
160
162
164
166
168
170
172
174
176
178
180
182
184
186
188
190
192
194
196
198
200
202
204
206
208
210
212
214
216
218
220
222
224
226
228
230
232
234
236
238
240
242
244
246
248
250
252
254
256
258
260
262
264
266
268
270
272
274
276
278
280
282
284
286
288
290
292
294
296
298
300
302
304
306
308
310
312
314
316
318
320
322
324
326
328
330
332
334
336
338
340
342
344
346
348
350
352
354
356
358
360
362
364
366
368
370
372
374
376
378
380
382
384
386
388
390
392
394
396
398
400
402
404
406
408
410
412
414
416
418
420
422
424
426
428
430
432
434
436
438
440
442
444
446
448
450
452
454
456
458
460
462
464
466
468
470
472
474
476
478
480
482
484
486
488
490
492
494
496
498
----------------------
3217085 3217085


In [9]:
pickle_out = open(path + "corpus_dict_until_500.pickle", "wb")
pickle.dump(corpus_dict, pickle_out)
pickle_out.close()

drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.


In [10]:
print(prev_i, i)

498000000 498000000


In [None]:
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

In [None]:
drive.mount("/content/drive", force_remount=True) 
corpus_dict = {}

for i in range(500, 1000, 2): #  secondly until 1000
  print(i)
  i *= 1000000
  corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
                                              corpus_dict,
                                              index)
  prev_i = i

#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:],
#                                            corpus_dict,
#                                            index)

print("----------------------")
print(len(corpus_dict), index)

Mounted at /content/drive
500
502
504
506
508
510
512
514
516
518
520
522
524
526
528
530
532
534
536
538
540
542
544
546


In [None]:
pickle_out = open(path + "corpus_dict_until_1000.pickle", "wb")
pickle.dump(corpus_dict, pickle_out)
pickle_out.close()

drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

In [None]:
drive.mount("/content/drive", force_remount=True) 
corpus_dict = {}

for i in range(1000, 1500, 2): # thirdly until 1500
  print(i)
  i *= 1000000
  corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
                                              corpus_dict,
                                              index)
  prev_i = i

#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:],
#                                            corpus_dict,
#                                            index)

print("----------------------")
print(len(corpus_dict), index)

In [None]:
pickle_out = open(path + "corpus_dict_until_1500.pickle", "wb")
pickle.dump(corpus_dict, pickle_out)
pickle_out.close()

drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

In [None]:
drive.mount("/content/drive", force_remount=True) 
corpus_dict = {}

for i in range(1500, 2000, 2): # fourthly until 2000
  print(i)
  i *= 1000000
  corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
                                              corpus_dict,
                                              index)
  prev_i = i

#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:],
#                                            corpus_dict,
#                                            index)

print("----------------------")
print(len(corpus_dict), index)

In [None]:
pickle_out = open(path + "corpus_dict_until_2000.pickle", "wb")
pickle.dump(corpus_dict, pickle_out)
pickle_out.close()

drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

In [None]:
drive.mount("/content/drive", force_remount=True) 
corpus_dict = {}

for i in range(2000, 2500, 2): # fifthly until 2500
  print(i)
  i *= 1000000
  corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
                                              corpus_dict,
                                              index)
  prev_i = i

#corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:],
#                                            corpus_dict,
#                                            index)

print("----------------------")
print(len(corpus_dict), index)

In [None]:
pickle_out = open(path + "corpus_dict_until_2500.pickle", "wb")
pickle.dump(corpus_dict, pickle_out)
pickle_out.close()

drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

In [None]:
drive.mount("/content/drive", force_remount=True) 
corpus_dict = {}

for i in range(2500, 2976, 2): # lastly until end
  print(i)
  i *= 1000000
  corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:i],
                                              corpus_dict,
                                              index)
  prev_i = i

corpus_dict, index = clean_corpus_sentenced(unclean_corpus[prev_i:],
                                            corpus_dict,
                                            index)

print("----------------------")
print(len(corpus_dict), index)

In [None]:
pickle_out = open(path + "corpus_dict_until_end.pickle", "wb")
pickle.dump(corpus_dict, pickle_out)
pickle_out.close()

drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

In [None]:
def clean_corpus_unsentenced(data):
    # Tokenization
    with nlp.select_pipes(disable=["lemmatizer", "tok2vec", "tagger", "parser"]):
      nlp.enable_pipe("senter")
      doc = nlp(data)
    print(doc[:150])

    doc_filtered = []

    for token in doc:
      if token.is_upper is True:
        continue
      elif token.is_stop is True:
        continue
      elif str(token).lower() in madeup_names_lower:
        continue
      elif token.is_alpha:
        doc_filtered.append(str(token).lower())
      else: 
        continue

    doc_filtered = " ".join(doc_filtered)

    print(doc_filtered[:500])

    # Remove words with freq < XX

clean_corpus_unsentenced(unclean_corpus)#[:1000000])

## Training fastText and Validating on Word Embeddings Benchmark

In [None]:
# Skipgram model :
#model = fasttext.train_unsupervised('data.txt', model='skipgram')

#model.save_model("model_filename.bin")

#model = fasttext.load_model("model_filename.bin")

#model.get_nearest_neighbors('asparagus')

#In a similar spirit, one can play around with word analogies. For example, we can see if our model can guess what is to France, and what Berlin is to Germany.
#This can be done with the analogies functionality. It takes a word triplet (like Germany Berlin France) and outputs the analogy:
#model.get_analogies("berlin", "germany", "france")

In [None]:
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.
