In [1]:
# Import libraries to build Word2Vec model, and load Newsgroups data
import os
import sys
import re
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases
TEXT_DATA_DIR = './20news-bydate-train/'   # I only train on this my model

In [2]:
# Newsgroups data is split between many files and folders.
# Directory stucture ./20news-bydate-train/<newsgroup label>/<post ID>

texts = []         # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []        # list of label ids
label_text = []    # list of label texts

# Go through each directory
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            # News groups posts are named as numbers, with no extensions.
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  # skip header in file (starts with two newlines.)
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)
                label_text.append(name)

In [3]:
print (labels_index)

{'talk.politics.mideast': 17, 'talk.politics.guns': 16, 'comp.os.ms-windows.misc': 2, 'comp.windows.x': 5, 'soc.religion.christian': 15, 'alt.atheism': 0, 'comp.graphics': 1, 'rec.sport.baseball': 9, 'talk.politics.misc': 18, 'rec.sport.hockey': 10, 'sci.crypt': 11, 'sci.med': 13, 'talk.religion.misc': 19, 'sci.space': 14, 'rec.autos': 7, 'rec.motorcycles': 8, 'comp.sys.ibm.pc.hardware': 3, 'comp.sys.mac.hardware': 4, 'misc.forsale': 6, 'sci.electronics': 12}


In [5]:
len(labels) # total labelled text

11314

In [6]:
len(texts)  # total texts

11314

In [10]:
label_text[0]

'alt.atheism'

In [12]:
len(label_text) # name of the labels also the folder name 

11314

In [13]:
label_id # numerical tag for the label classes 

19

In [15]:
len(labels) # there are 11314 ground truth labels 

11314

In [19]:
labels[11312] # 11312th text belong to class label19

19

In [20]:
label_text[11312]# folder the text 11312 is located in , also it is the category this text belongs to in the newapAaper

'talk.religion.misc'

In [21]:
print('Found %s texts.' % len(texts))

Found 11314 texts.


# The data is loaded into memory (a single list ‘texts’) at this point; for preprocessing, remove all punctuation, and excess information.Gensim likes this form for input 

In [22]:
# Cleaning data - remove punctuation from every newsgroup text
sentences = []
# Go through each text in turn
for ii in range(len(texts)):
    sentences = [re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', 
                        repl='', 
                        string=x
                       ).strip().split(' ') for x in texts[ii].split('\n') 
                      if not x.endswith('writes:')]
    sentences = [x for x in sentences if x != ['']]
    texts[ii] = sentences

In [36]:
#Each original document is now represented in the list, ‘texts’, as a list of sentences, and 
#each sentence is a list of words.

print(texts[0])

[['Archivename', 'atheismresources'], ['Altatheismarchivename', 'resources'], ['Lastmodified', '11', 'December', '1992'], ['Version', '10'], ['Atheist', 'Resources'], ['Addresses', 'of', 'Atheist', 'Organizations'], ['USA'], ['FREEDOM', 'FROM', 'RELIGION', 'FOUNDATION'], ['Darwin', 'fish', 'bumper', 'stickers', 'and', 'assorted', 'other', 'atheist', 'paraphernalia', 'are'], ['available', 'from', 'the', 'Freedom', 'From', 'Religion', 'Foundation', 'in', 'the', 'US'], ['Write', 'to', '', 'FFRF', 'PO', 'Box', '750', 'Madison', 'WI', '53701'], ['Telephone', '608', '2568900'], ['EVOLUTION', 'DESIGNS'], ['Evolution', 'Designs', 'sell', 'the', 'Darwin', 'fish', '', "It's", 'a', 'fish', 'symbol', 'like', 'the', 'ones'], ['Christians', 'stick', 'on', 'their', 'cars', 'but', 'with', 'feet', 'and', 'the', 'word', 'Darwin', 'written'], ['inside', '', 'The', 'deluxe', 'moulded', '3D', 'plastic', 'fish', 'is', '495', 'postpaid', 'in', 'the', 'US'], ['Write', 'to', '', 'Evolution', 'Designs', '7119',

In [26]:
# concatenate all sentences from all texts into a single list of sentences
all_sentences = []
for text in texts:
    all_sentences += text

# Phrase Detection using Gensim Phraser


In [27]:
# The gensim.models.phrases module provides everything required in a simple form:
# Phrase Detection
# Give some common terms that can be ignored in phrase detection
# For example, 'state_of_affairs' will be detected because 'of' is provided here: 
# common_terms = ["of", "with", "without", "and", "or", "the", "a"]
# # Create the relevant phrases from the list of sentences:
# phrases = Phrases(all_sentences, common_terms=common_terms)
# # The Phraser object is used from now on to transform sentences
# bigram = Phraser(phrases)

# # Applying the Phraser to transform our sentences is simply
# all_sentences = list(bigram[all_sentences])


In [32]:
print(all_sentences[11678])

['Now', 'what', 'I', 'am', 'interested', 'in', 'is', 'the', 'original', 'notion', 'you', 'were', 'discussing']


In [33]:
len(all_sentences)

328961

In [34]:
len(all_sentences[11678])

13

In [37]:
common_terms = ["of", "with", "without", "and", "or", "the", "a"] # this you do after you get a feel for the kind of paired words in your testtual data
phrases = Phrases(all_sentences, common_terms=common_terms)
bigram = Phraser(phrases)
all_sentences = list(bigram[all_sentences])


In [38]:
len(all_sentences[11678])

11

In [42]:
print(all_sentences[11678])  # after bigramming the texts 

['Now', 'what', 'I_am', 'interested_in', 'is', 'the', 'original', 'notion', 'you', 'were', 'discussing']


In [43]:
#‘interested_in’ may indicate an overly greedy application of the phrase detection algorithm.

# Creating the Word Embeddings using Word2Vec

In [44]:
model = Word2Vec(all_sentences, 
                 min_count=3,   # Ignore words that appear less than this
                 size=200,      # Dimensionality of word embeddings
                 workers=2,     # Number of processors (parallelisation)
                 window=5,      # Context window for words during training
                 iter=30)       # Number of epochs training over corpus

In [45]:
model

<gensim.models.word2vec.Word2Vec at 0x1a300f7320>

In [46]:
# word embedding size
model.vector_size

200

In [47]:
# Total number of words in the model
len(model.wv.vocab)

53724

In [49]:
model.most_similar("New_York")

  """Entry point for launching an IPython kernel.


[('Los_Angeles', 0.4764913320541382),
 ('California', 0.4720371961593628),
 ('Florida', 0.45818209648132324),
 ('AHL', 0.44434309005737305),
 ('NY', 0.4333398640155792),
 ('N_Y', 0.4313083291053772),
 ('1970', 0.425307035446167),
 ('Wisconsin', 0.424573689699173),
 ('Albany', 0.42317038774490356),
 ('City', 0.42270994186401367)]

In [50]:
model.most_similar("engine")

  """Entry point for launching an IPython kernel.


[('car', 0.4875488877296448),
 ('bike', 0.4857217073440552),
 ('suspension', 0.46694448590278625),
 ('speed', 0.45672088861465454),
 ('motor', 0.45661652088165283),
 ('accelerator', 0.45547688007354736),
 ('torque', 0.4486038386821747),
 ('voltage', 0.44655799865722656),
 ('clutch', 0.4387759566307068),
 ('tires', 0.4374878406524658)]