# Natural Language Processing with gensim

- **Created by Andrés Segura Tinoco**
- **Created on June 09, 2019**

**Gensim** is a Python library for topic modelling, document indexing and similarity retrieval with large corpora. Its target audience is the natural language processing (NLP) and information retrieval (IR) communities.

In [1]:
# Load Python libraries
import io
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load NLP libraries from gensim and spacy
from gensim.models import Word2Vec
import spacy.lang.en as en

### Step 1 - Read natural text from a book

In [3]:
# Util function to read a plain text file
def read_text_file(file_path, encoding='ISO-8859-1'):
    """Read a whole plain-text file and return its contents as a string.

    Parameters
    ----------
    file_path : str
        Path of the file to read.
    encoding : str, optional
        Codec used to decode the file. Defaults to 'ISO-8859-1', the codec
        this function has always used, so existing callers are unaffected.

    Returns
    -------
    str
        The complete decoded text of the file.

    Raises
    ------
    OSError
        If the file cannot be opened (for example, it does not exist).
    """
    # The context manager closes the file even if reading fails.
    with io.open(file_path, 'r', encoding=encoding) as f:
        return f.read()

In [4]:
# Get text sample: load the whole book into memory as one string
# and show its length in characters
file_path = "../data/en/The Adventures of Sherlock Holmes - Arthur Conan Doyle.txt"
plain_text = read_text_file(file_path)
len(plain_text)

576467

In [5]:
# Show the first 1000 characters of the raw document
plain_text[:1000]

"\nProject Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.net\n\n\nTitle: The Adventures of Sherlock Holmes\n\nAuthor: Arthur Conan Doyle\n\nRelease Date: November 29, 2002 [EBook #1661]\nLast Updated: May 20, 2019\n\nLanguage: English\n\nCharacter set encoding: UTF-8\n\n*** START OF THIS PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES ***\n\n\n\nProduced by an anonymous Project Gutenberg volunteer and Jose Menendez\n\n\n\ncover\n\n\n\nThe Adventures of Sherlock Holmes\n\n\n\nby Arthur Conan Doyle\n\n\n\nContents\n\n\n   I.     A Scandal in Bohemia\n   II.    The Red-Headed League\n   III.   A Case of Identity\n   IV.    The Boscombe Valley Mystery\n   V.     The Five Orange Pips\n   VI.    The Man with the 

### Step 2 - Tokenize and remove stopwords

In [6]:
# Clean the text: lowercase it and treat every line break as a sentence
# boundary, then keep only letters and periods, squeezing repeated
# whitespace and repeated periods down to a single character
clean_text = plain_text.lower().replace('\n', '.')
clean_text = re.sub(r'\s+', ' ', re.sub('[^a-zA-Z.]', ' ', clean_text))
clean_text = re.sub(r'\.+', ".", clean_text)
clean_text[:1000]

'.project gutenberg s the adventures of sherlock holmes by arthur conan doyle.this ebook is for the use of anyone anywhere at no cost and with.almost no restrictions whatsoever. you may copy it give it away or.re use it under the terms of the project gutenberg license included.with this ebook or online at www.gutenberg.net.title the adventures of sherlock holmes.author arthur conan doyle.release date november ebook .last updated may .language english.character set encoding utf . start of this project gutenberg ebook the adventures of sherlock holmes .produced by an anonymous project gutenberg volunteer and jose menendez.cover.the adventures of sherlock holmes.by arthur conan doyle.contents. i. a scandal in bohemia. ii. the red headed league. iii. a case of identity. iv. the boscombe valley mystery. v. the five orange pips. vi. the man with the twisted lip. vii. the adventure of the blue carbuncle. viii. the adventure of the speckled band. ix. the adventure of the engineer s thumb. x. t

In [7]:
# Tokenize text into sentences: split on periods.
# The resulting list can contain empty or whitespace-only entries
# (e.g. the cleaned text starts with a period).
sentence_list = clean_text.split('.')
len(sentence_list)

14592

In [8]:
# Tokenize sentences into words (split on whitespace), dropping sentences
# that contain no words. Each sentence is split only once; an empty token
# list is falsy and is therefore filtered out.
word_list = [words for words in map(str.split, sentence_list) if words]
word_list[:10]

[['project',
  'gutenberg',
  's',
  'the',
  'adventures',
  'of',
  'sherlock',
  'holmes',
  'by',
  'arthur',
  'conan',
  'doyle'],
 ['this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost', 'no', 'restrictions', 'whatsoever'],
 ['you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or'],
 ['re',
  'use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'project',
  'gutenberg',
  'license',
  'included'],
 ['with', 'this', 'ebook', 'or', 'online', 'at', 'www'],
 ['gutenberg'],
 ['net'],
 ['title', 'the', 'adventures', 'of', 'sherlock', 'holmes'],
 ['author', 'arthur', 'conan', 'doyle']]

In [9]:
# Load spaCy's set of English stopwords and display it
stopwords_en = en.stop_words.STOP_WORDS
print(stopwords_en)

{'not', 'empty', 'afterwards', 'that', 'each', 'anyhow', 'she', 'themselves', 'hereby', 'their', 'same', 'who', 'now', 'meanwhile', 'as', 'and', 'since', 'with', 'without', 'was', 'please', 'to', 'herein', 'otherwise', 'under', 'however', 'can', 'herself', 'make', 'never', 'such', 'then', 'eight', 'our', 'these', 'amount', 'sixty', 'nor', 'just', 'forty', 'often', 'could', 'less', 'few', 'every', 'no', 'most', 'here', 'me', 'were', 'fifty', 'whereafter', 'anywhere', "'re", 'but', 'has', 'move', 'should', 'wherein', 'some', 'on', 'yours', 'first', 'if', 'next', 'ours', 'serious', 'thereupon', 'used', 'wherever', 'amongst', 'in', 'latter', 'somehow', 'side', 'almost', 'do', 'upon', 'your', 'through', 'been', '‘s', 'except', 'full', 'what', 'into', 'show', 'ten', 'call', 'several', 'there', 'yourself', 'third', "'d", 'beforehand', 'least', 'seems', 'of', 'how', 'so', '‘m', 'alone', 'beyond', 'thus', 'nine', 'mine', 'across', 'while', 'are', 'due', 'must', 'will', 'together', 'both', 'with

In [10]:
# Remove stopwords and very short tokens (fewer than 3 characters) from
# every sentence. Sentences are kept in place even when filtering leaves
# them empty, so all_words has one entry per entry of word_list.
all_words = [
    [word for word in words if word not in stopwords_en and len(word) > 2]
    for words in word_list
]

all_words[:10]

[['project',
  'gutenberg',
  'adventures',
  'sherlock',
  'holmes',
  'arthur',
  'conan',
  'doyle'],
 ['ebook', 'use', 'cost'],
 ['restrictions', 'whatsoever'],
 ['copy', 'away'],
 ['use', 'terms', 'project', 'gutenberg', 'license', 'included'],
 ['ebook', 'online', 'www'],
 ['gutenberg'],
 ['net'],
 ['title', 'adventures', 'sherlock', 'holmes'],
 ['author', 'arthur', 'conan', 'doyle']]

### Step 3 - Create a Word2Vec model

In [11]:
# Create Word2Vec model from the filtered sentences (one token list each);
# min_count = 2 tells gensim to ignore words whose total frequency is below 2
word2vec = Word2Vec(all_words, min_count = 2)

In [12]:
# Show vocabulary size.
# gensim >= 4.0 exposes the vocabulary as `wv.key_to_index` and removed
# `wv.vocab` (accessing it raises AttributeError); gensim 3.x only has
# `wv.vocab`. Try the current attribute first and fall back to the old one.
try:
    vocabulary = word2vec.wv.key_to_index
except AttributeError:
    vocabulary = word2vec.wv.vocab
len(vocabulary)

4072

In [13]:
# Show the embedding vector the model learned for the word 'project'
word2vec.wv['project']

array([ 0.00571524,  0.0037815 , -0.0029851 , -0.00234666, -0.00384644,
        0.0026138 ,  0.00376731, -0.00350076, -0.00345531, -0.00123497,
       -0.00392786, -0.00487326,  0.002396  ,  0.00065813, -0.00089581,
        0.00288083,  0.003073  , -0.00023033,  0.00064188,  0.00469708,
        0.00057567, -0.00287395,  0.00170696, -0.00220402, -0.00369562,
        0.00048359, -0.00554605,  0.00117496,  0.00097162,  0.00056364,
       -0.00300056,  0.00420297,  0.00553688,  0.00070874, -0.0008803 ,
       -0.00219831,  0.00295734,  0.00202147, -0.00204766,  0.00069915,
        0.00155343, -0.00173383,  0.00101644,  0.0037438 , -0.00295153,
       -0.0027267 , -0.00293964, -0.00352298, -0.00384365,  0.00290935,
       -0.00594365, -0.00033689, -0.00500323, -0.0002361 , -0.00169291,
        0.00290482, -0.0013868 ,  0.00172921,  0.00102748, -0.00260645,
        0.00270165, -0.002158  ,  0.00060265,  0.00046697, -0.00544463,
       -0.00219709, -0.00153067, -0.00328287, -0.00191149, -0.00