# manual steps for Preprocessing

In [1]:
import numpy as np
import pandas as pd

In [2]:
from pathlib import Path

root = Path("data")
root.mkdir(exist_ok=True)

path = root / "rainbowfluffysheep.txt"

### load the txt file

In [3]:
with open(path, "r", encoding="latin1") as f:
    story = f.read()

story



### lower casing

In [4]:
story= story.lower()
story



### Sentence Tockenization

In [5]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [6]:
snentence=nlp(story)

In [7]:
sentence_list=list(snentence.sents)
sentence_list

[the longest text ever!
 i am going to do it.,
 i have made up my mind.,
 these are the first few words of the newâ¦ the best â¦ the longest text in the entire history of the known universe!,
 this has to have over 35,000 words the beat the current world record set by that person who made that flaming chicken handbooky thingy.,
 i might just be saying random things the whole time i type in this so you might get confused a lot.,
 i just discovered something terrible.,
 autocorrect is on!!,
 no!!!,
 this has to be crazy, so i will have to break all the english language rules and the basic knowledge of the average human being.,
 i am not an average human being, however i am special.,
 no no,
 no, not that kind of special ;).,
 why do people send that wink face!,
 it always gives me nightmares!,
 it can make a completely normal sentence creepy.,
 imagine you are going to a friendâs house, so you text this: [ see you soon ð ] seems normal, right?,
 but what is you add the word semi t

In [8]:
len(sentence_list)

2339

### Removing punctuation

In [9]:
# remove punctuation
import string
exclude=string.punctuation + '\n'
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'

In [10]:
def remove_punc(text):
    return text.translate(str.maketrans('', '', exclude))

In [11]:
sentence_list[1]

i have made up my mind.

In [12]:
sentence_list2=[]
for sen in range(len(sentence_list)):
    sentence_list2.append(remove_punc(str(sentence_list[sen])))
sentence_list2

['the longest text everi am going to do it',
 'i have made up my mind',
 'these are the first few words of the newâ\x80¦ the best â\x80¦ the longest text in the entire history of the known universe',
 'this has to have over 35000 words the beat the current world record set by that person who made that flaming chicken handbooky thingy',
 'i might just be saying random things the whole time i type in this so you might get confused a lot',
 'i just discovered something terrible',
 'autocorrect is on',
 'no',
 'this has to be crazy so i will have to break all the english language rules and the basic knowledge of the average human being',
 'i am not an average human being however i am special',
 'no no',
 'no not that kind of special ',
 'why do people send that wink face',
 'it always gives me nightmares',
 'it can make a completely normal sentence creepy',
 'imagine you are going to a friendâ\x80\x99s house so you text this  see you soon ð\x9f\x99\x82  seems normal right',
 'but what is y

In [13]:
len(sentence_list2)

2339

### Removing Emoji and Non-English Characters 

In [14]:
import emoji
print((str(sentence_list[2])))
print(emoji.demojize(str(sentence_list[2])))
# not working for currupted emoji

these are the first few words of the newâ¦ the best â¦ the longest text in the entire history of the known universe!
these are the first few words of the newâ¦ the best â¦ the longest text in the entire history of the known universe!


In [15]:
def keep_english(text):
    return "".join(ch for ch in text if 32 <= ord(ch) <= 126)

In [16]:
sentence_list3=[]
for sen in range(len(sentence_list2)):
    sentence_list3.append(keep_english(str(sentence_list2[sen])))
sentence_list3

['the longest text everi am going to do it',
 'i have made up my mind',
 'these are the first few words of the new the best  the longest text in the entire history of the known universe',
 'this has to have over 35000 words the beat the current world record set by that person who made that flaming chicken handbooky thingy',
 'i might just be saying random things the whole time i type in this so you might get confused a lot',
 'i just discovered something terrible',
 'autocorrect is on',
 'no',
 'this has to be crazy so i will have to break all the english language rules and the basic knowledge of the average human being',
 'i am not an average human being however i am special',
 'no no',
 'no not that kind of special ',
 'why do people send that wink face',
 'it always gives me nightmares',
 'it can make a completely normal sentence creepy',
 'imagine you are going to a friends house so you text this  see you soon   seems normal right',
 'but what is you add the word semi to that colon

In [17]:
print((str(sentence_list[2])))
print((str(sentence_list2[2])))
print((str(sentence_list3[2])))

these are the first few words of the newâ¦ the best â¦ the longest text in the entire history of the known universe!
these are the first few words of the newâ¦ the best â¦ the longest text in the entire history of the known universe
these are the first few words of the new the best  the longest text in the entire history of the known universe


### Create Corpus

In [18]:
corpus = " ".join(sentence_list3)
print(corpus)



### create tokens from corpus

In [19]:
nlp=spacy.load('en_core_web_sm')
doc=nlp(corpus)
tokens = [token.text for token in doc]
len(tokens),tokens

(27759,
 ['the',
  'longest',
  'text',
  'everi',
  'am',
  'going',
  'to',
  'do',
  'it',
  'i',
  'have',
  'made',
  'up',
  'my',
  'mind',
  'these',
  'are',
  'the',
  'first',
  'few',
  'words',
  'of',
  'the',
  'new',
  'the',
  'best',
  ' ',
  'the',
  'longest',
  'text',
  'in',
  'the',
  'entire',
  'history',
  'of',
  'the',
  'known',
  'universe',
  'this',
  'has',
  'to',
  'have',
  'over',
  '35000',
  'words',
  'the',
  'beat',
  'the',
  'current',
  'world',
  'record',
  'set',
  'by',
  'that',
  'person',
  'who',
  'made',
  'that',
  'flaming',
  'chicken',
  'handbooky',
  'thingy',
  'i',
  'might',
  'just',
  'be',
  'saying',
  'random',
  'things',
  'the',
  'whole',
  'time',
  'i',
  'type',
  'in',
  'this',
  'so',
  'you',
  'might',
  'get',
  'confused',
  'a',
  'lot',
  'i',
  'just',
  'discovered',
  'something',
  'terrible',
  'autocorrect',
  'is',
  'on',
  'no',
  'this',
  'has',
  'to',
  'be',
  'crazy',
  'so',
  'i',
  '

### Create vocabulary 

In [20]:
# build vocab
vocab = {'<unk>':0}

for token in set(tokens):
  if token not in vocab:
    vocab[token] = len(vocab)

len(vocab),vocab

(4343,
 {'<unk>': 0,
  'lot': 1,
  'quote': 2,
  'street': 3,
  'phone': 4,
  'creeeeeeeepy': 5,
  'emphasising': 6,
  'residential': 7,
  'teachers': 8,
  'knives': 9,
  'wage': 10,
  'matters': 11,
  'skin': 12,
  'noooooooooooooooo': 13,
  'appreciate': 14,
  'mwahahahah': 15,
  'sorry': 16,
  'your': 17,
  'sell': 18,
  'clapclapclapclapclapapplauseapplauseonelonelymaninthebackscreamsbecausehegotlosthewastryingtogettonemotheclownfishmoviebutnowthereisagiantbeeslashwasprightthereandheisreallyscared': 19,
  'boxes': 20,
  'imaginary': 21,
  'schools': 22,
  '23': 23,
  'coincidental': 24,
  'still': 25,
  'fore': 26,
  'here': 27,
  'less': 28,
  'orc': 29,
  'flips': 30,
  'hangin': 31,
  'blegh': 32,
  'doom': 33,
  'forced': 34,
  'generate': 35,
  'askit': 36,
  'restaurant': 37,
  'comes': 38,
  'unjust': 39,
  'cameraman': 40,
  'deflecting': 41,
  'b': 42,
  'shtars': 43,
  'if': 44,
  'imagine': 45,
  'look': 46,
  'memebr': 47,
  'alike': 48,
  'grease': 49,
  'playing': 50,

### Create word vector

In [21]:
import nltk
data_dir = Path("D:/data")
data_dir.mkdir(exist_ok=True)

# Download punkt into this folder
nltk.download("punkt", download_dir=str(data_dir))
nltk.download("punkt_tab", download_dir=str(data_dir))

# Tell NLTK to look inside your custom folder
nltk.data.path.append(str(data_dir))

[nltk_data] Downloading package punkt to D:\data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to D:\data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [22]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [23]:
def text_vector(sentetnce):

  vector_sentence = []

  for token in sentetnce:
    if token in vocab:
      vector_sentence.append(token)
    else:
      vector_sentence.append('<unk>')

  return vector_sentence

In [24]:
vector_sentence_ = []

for sentence in sentence_list3:
  vector_sentence_.append(text_vector(word_tokenize(sentence.lower())))

In [25]:
len(vector_sentence_),vector_sentence_

(2339,
 [['the', 'longest', 'text', 'everi', 'am', 'going', 'to', 'do', 'it'],
  ['i', 'have', 'made', 'up', 'my', 'mind'],
  ['these',
   'are',
   'the',
   'first',
   'few',
   'words',
   'of',
   'the',
   'new',
   'the',
   'best',
   'the',
   'longest',
   'text',
   'in',
   'the',
   'entire',
   'history',
   'of',
   'the',
   'known',
   'universe'],
  ['this',
   'has',
   'to',
   'have',
   'over',
   '35000',
   'words',
   'the',
   'beat',
   'the',
   'current',
   'world',
   'record',
   'set',
   'by',
   'that',
   'person',
   'who',
   'made',
   'that',
   'flaming',
   'chicken',
   'handbooky',
   'thingy'],
  ['i',
   'might',
   'just',
   'be',
   'saying',
   'random',
   'things',
   'the',
   'whole',
   'time',
   'i',
   'type',
   'in',
   'this',
   'so',
   'you',
   'might',
   'get',
   'confused',
   'a',
   'lot'],
  ['i', 'just', 'discovered', 'something', 'terrible'],
  ['autocorrect', 'is', 'on'],
  ['no'],
  ['this',
   'has',
   'to',


### create Pre-Vector (before Embbading)

In [26]:
def to_vector(sentence):

  vector_= []

  for token in sentence:
    if token in vocab:
      vector_.append(vocab[token])
    else:
      vector_.append(vocab['<unk>'])

  return vector_

In [27]:
pre_vector = []

for sentence in sentence_list3:
  pre_vector.append(to_vector(word_tokenize(sentence.lower())))

In [28]:
len(pre_vector),pre_vector

(2339,
 [[1080, 2761, 688, 3706, 1259, 3145, 3574, 2914, 917],
  [4285, 4123, 3409, 2637, 3225, 1495],
  [3228,
   3259,
   1080,
   618,
   821,
   58,
   2647,
   1080,
   3203,
   1080,
   1279,
   1080,
   2761,
   688,
   2500,
   1080,
   331,
   2799,
   2647,
   1080,
   2573,
   3269],
  [4000,
   877,
   3574,
   4123,
   208,
   689,
   58,
   1080,
   3219,
   1080,
   3033,
   995,
   3500,
   921,
   636,
   1488,
   3254,
   333,
   3409,
   1488,
   2079,
   2923,
   708,
   4068],
  [4285,
   4100,
   294,
   2651,
   3268,
   3656,
   3745,
   1080,
   2099,
   3237,
   4285,
   3972,
   2500,
   4000,
   846,
   2290,
   4100,
   1138,
   3548,
   3975,
   1],
  [4285, 294, 480, 2915, 4340],
  [3241, 2386, 346],
  [2574],
  [4000,
   877,
   3574,
   2651,
   310,
   846,
   4285,
   1691,
   4123,
   3574,
   320,
   820,
   1080,
   3482,
   1072,
   85,
   903,
   1080,
   2060,
   2806,
   2647,
   1080,
   2418,
   3825,
   2632],
  [4285, 1259, 639, 790, 2418, 

In [29]:
# Imports
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [30]:
model=Word2Vec(vector_sentence_,vector_size=100,epochs=30,window=5,workers=6)

In [31]:
model.corpus_count

2339

In [32]:
# Access keyed vectors
kv = model.wv

In [36]:
kv.most_similar("he")

[('joe', 0.910641074180603),
 ('bob', 0.8754317760467529),
 ('does', 0.8651716113090515),
 ('sitting', 0.8614405393600464),
 ('another', 0.8591088056564331),
 ('man', 0.8585515022277832),
 ('named', 0.8500186204910278),
 ('cat', 0.832484245300293),
 ('mats', 0.8306012153625488),
 ('friend', 0.8299413323402405)]

In [38]:
print("Vocab size:", len(kv.index_to_key))
print("Sample words:", kv.index_to_key[:10])
print("Vector shape for 'nlp':", kv['bob'].shape)

Vocab size: 709
Sample words: ['the', 'i', 'to', 'a', '<unk>', 'you', 'and', 'that', 'of', 'is']
Vector shape for 'nlp': (100,)


In [40]:
kv['bob']

array([ 0.03801609,  0.09295639,  0.1218942 ,  0.11854414, -0.0188069 ,
       -0.34205484,  0.07969678,  0.41907847, -0.23167968, -0.04868694,
       -0.03741133, -0.2724285 ,  0.04627834,  0.03387075,  0.14218095,
       -0.12167866,  0.08042049, -0.32480437, -0.06669123, -0.45000732,
        0.13982734,  0.14567201,  0.24774809, -0.19694291, -0.16771254,
        0.14360608, -0.18799448, -0.07887892, -0.18155125,  0.05513686,
        0.15739553,  0.06737792,  0.00530116, -0.04980158, -0.21879964,
        0.31123933,  0.03677911, -0.11434006, -0.30596086, -0.28908607,
        0.00863297, -0.26615432, -0.08811133,  0.0268061 ,  0.12917818,
       -0.14060363, -0.2371399 , -0.07497679,  0.16394709,  0.05214429,
       -0.01410676, -0.18730675, -0.01624135, -0.00145455, -0.1950769 ,
        0.03409503,  0.04062984,  0.03645948, -0.0764294 , -0.01572647,
        0.10785192, -0.02424111,  0.05310964, -0.04012802, -0.33734912,
        0.1485942 , -0.00279631,  0.21400659, -0.30938792, -0.01