In [1]:
import pandas as pd

# Read training data
train = pd.read_csv("data\\labeledTrainData.tsv", header=0, 
    delimiter="\t", quoting=3 )

test = pd.read_csv("data\\testData.tsv", header=0, delimiter="\t", quoting=3)
unlabelled_train = pd.read_csv("data\\unlabeledTrainData.tsv", header=0, 
    delimiter="\t", quoting=3)

# Verify the number of reviews that were read (100,000 in total)
print("Read %d labelled train reviews, %d labelled test reviews, " \
 "and %d unlabelled reviews\n" % (train["review"].size,  \
 test["review"].size, unlabelled_train["review"].size ))

Read 25000 labelled train reviews, 25000 labelled test reviews, and 50000 unlabelled reviews



In [2]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [7]:
# Download the punkt tokenizer for sentence splitting
import nltk.data
# nltk.download()   - uncomment if nltk datasets have not been downloaded yet.

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
# That's the format word2vec expects - it works on sentences 
# represented as a list of tokens. 
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [8]:
sentences = []
print('Parsing sentences from the training set')

for review in train['review']:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from the training set




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))


  'Beautiful Soup.' % markup)


  'Beautiful Soup.' % markup)


  ' that document to Beautiful Soup.' % decoded_markup


In [9]:
print('Parsing sentences from the unlabelled set')
for review in unlabelled_train['review']:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from the unlabelled set




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))


  'Beautiful Soup.' % markup)


  ' that document to Beautiful Soup.' % decoded_markup


  ' that document to Beautiful Soup.' % decoded_markup


  'Beautiful Soup.' % markup)


  'Beautiful Soup.' % markup)


  'Beautiful Soup.' % markup)


  ' that document to Beautiful Soup.' % decoded_markup


  'Beautiful Soup.' % markup)


  ' that document to Beautiful Soup.' % decoded_markup


  'Beautiful Soup.' % markup)


  ' that document to Beautiful Soup.' % decoded_markup


In [11]:
sentences[0]

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',
 'wiz',
 'and',
 'watched',
 'moonwalker',
 'again']

In [13]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print ("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "models\\300features_40minwords_10context"
model.save(model_name)

2016-10-02 14:24:43,542 : INFO : detected Windows; aliasing chunkize to chunkize_serial


2016-10-02 14:24:43,634 : INFO : Pattern library is not installed, lemmatization won't be available.


2016-10-02 14:24:43,656 : INFO : Could not import Theano, will use standard float for default ShardedCorpus dtype.


2016-10-02 14:24:43,819 : INFO : 'pattern' package not found; tag filters are not available for English


2016-10-02 14:24:43,845 : INFO : collecting all words and their counts


2016-10-02 14:24:43,845 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


2016-10-02 14:24:43,886 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types


2016-10-02 14:24:43,931 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types


2016-10-02 14:24:43,973 : INFO : PROGRESS: at sentence #30000, processed 671314 words, keeping 30034 word types


2016-10-02 14:24:44,015 : INFO : PROGRESS: at sentence #40000, processed 897814 words, keeping 34348 word types


Training model...


2016-10-02 14:24:44,060 : INFO : PROGRESS: at sentence #50000, processed 1116962 words, keeping 37761 word types


2016-10-02 14:24:44,103 : INFO : PROGRESS: at sentence #60000, processed 1338403 words, keeping 40723 word types


2016-10-02 14:24:44,146 : INFO : PROGRESS: at sentence #70000, processed 1561579 words, keeping 43333 word types


2016-10-02 14:24:44,194 : INFO : PROGRESS: at sentence #80000, processed 1780886 words, keeping 45714 word types


2016-10-02 14:24:44,240 : INFO : PROGRESS: at sentence #90000, processed 2004995 words, keeping 48135 word types


2016-10-02 14:24:44,284 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types


2016-10-02 14:24:44,329 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 52081 word types


2016-10-02 14:24:44,373 : INFO : PROGRESS: at sentence #120000, processed 2668775 words, keeping 54119 word types


2016-10-02 14:24:44,419 : INFO : PROGRESS: at sentence #130000, processed 2894303 words, keeping 55847 word types


2016-10-02 14:24:44,462 : INFO : PROGRESS: at sentence #140000, processed 3107005 words, keeping 57346 word types


2016-10-02 14:24:44,509 : INFO : PROGRESS: at sentence #150000, processed 3332627 words, keeping 59055 word types


2016-10-02 14:24:44,553 : INFO : PROGRESS: at sentence #160000, processed 3555315 words, keeping 60617 word types


2016-10-02 14:24:44,600 : INFO : PROGRESS: at sentence #170000, processed 3778655 words, keeping 62077 word types


2016-10-02 14:24:44,646 : INFO : PROGRESS: at sentence #180000, processed 3999236 words, keeping 63496 word types


2016-10-02 14:24:44,696 : INFO : PROGRESS: at sentence #190000, processed 4224449 words, keeping 64794 word types


2016-10-02 14:24:44,746 : INFO : PROGRESS: at sentence #200000, processed 4448603 words, keeping 66087 word types


2016-10-02 14:24:44,798 : INFO : PROGRESS: at sentence #210000, processed 4669967 words, keeping 67390 word types


2016-10-02 14:24:44,849 : INFO : PROGRESS: at sentence #220000, processed 4894968 words, keeping 68697 word types


2016-10-02 14:24:44,896 : INFO : PROGRESS: at sentence #230000, processed 5117545 words, keeping 69958 word types


2016-10-02 14:24:44,941 : INFO : PROGRESS: at sentence #240000, processed 5345050 words, keeping 71167 word types


2016-10-02 14:24:44,985 : INFO : PROGRESS: at sentence #250000, processed 5559165 words, keeping 72351 word types


2016-10-02 14:24:45,029 : INFO : PROGRESS: at sentence #260000, processed 5779146 words, keeping 73478 word types


2016-10-02 14:24:45,074 : INFO : PROGRESS: at sentence #270000, processed 6000435 words, keeping 74767 word types


2016-10-02 14:24:45,122 : INFO : PROGRESS: at sentence #280000, processed 6226314 words, keeping 76369 word types


2016-10-02 14:24:45,172 : INFO : PROGRESS: at sentence #290000, processed 6449474 words, keeping 77839 word types


2016-10-02 14:24:45,223 : INFO : PROGRESS: at sentence #300000, processed 6674077 words, keeping 79171 word types


2016-10-02 14:24:45,273 : INFO : PROGRESS: at sentence #310000, processed 6899391 words, keeping 80480 word types


2016-10-02 14:24:45,323 : INFO : PROGRESS: at sentence #320000, processed 7124278 words, keeping 81808 word types


2016-10-02 14:24:45,373 : INFO : PROGRESS: at sentence #330000, processed 7346021 words, keeping 83030 word types


2016-10-02 14:24:45,424 : INFO : PROGRESS: at sentence #340000, processed 7575533 words, keeping 84280 word types


2016-10-02 14:24:45,473 : INFO : PROGRESS: at sentence #350000, processed 7798803 words, keeping 85425 word types


2016-10-02 14:24:45,520 : INFO : PROGRESS: at sentence #360000, processed 8019466 words, keeping 86596 word types


2016-10-02 14:24:45,573 : INFO : PROGRESS: at sentence #370000, processed 8246654 words, keeping 87708 word types


2016-10-02 14:24:45,624 : INFO : PROGRESS: at sentence #380000, processed 8471801 words, keeping 88878 word types


2016-10-02 14:24:45,676 : INFO : PROGRESS: at sentence #390000, processed 8701551 words, keeping 89907 word types


2016-10-02 14:24:45,727 : INFO : PROGRESS: at sentence #400000, processed 8924500 words, keeping 90916 word types


2016-10-02 14:24:45,778 : INFO : PROGRESS: at sentence #410000, processed 9145850 words, keeping 91880 word types


2016-10-02 14:24:45,829 : INFO : PROGRESS: at sentence #420000, processed 9366930 words, keeping 92912 word types


2016-10-02 14:24:45,876 : INFO : PROGRESS: at sentence #430000, processed 9594467 words, keeping 93932 word types


2016-10-02 14:24:45,920 : INFO : PROGRESS: at sentence #440000, processed 9821218 words, keeping 94906 word types


2016-10-02 14:24:45,966 : INFO : PROGRESS: at sentence #450000, processed 10044980 words, keeping 96036 word types


2016-10-02 14:24:46,015 : INFO : PROGRESS: at sentence #460000, processed 10277740 words, keeping 97088 word types


2016-10-02 14:24:46,064 : INFO : PROGRESS: at sentence #470000, processed 10505665 words, keeping 97933 word types


2016-10-02 14:24:46,111 : INFO : PROGRESS: at sentence #480000, processed 10726049 words, keeping 98862 word types


2016-10-02 14:24:46,160 : INFO : PROGRESS: at sentence #490000, processed 10952793 words, keeping 99871 word types


2016-10-02 14:24:46,211 : INFO : PROGRESS: at sentence #500000, processed 11174449 words, keeping 100765 word types


2016-10-02 14:24:46,262 : INFO : PROGRESS: at sentence #510000, processed 11399724 words, keeping 101699 word types


2016-10-02 14:24:46,311 : INFO : PROGRESS: at sentence #520000, processed 11623075 words, keeping 102598 word types


2016-10-02 14:24:46,361 : INFO : PROGRESS: at sentence #530000, processed 11847473 words, keeping 103400 word types


2016-10-02 14:24:46,408 : INFO : PROGRESS: at sentence #540000, processed 12072088 words, keeping 104265 word types


2016-10-02 14:24:46,454 : INFO : PROGRESS: at sentence #550000, processed 12297639 words, keeping 105133 word types


2016-10-02 14:24:46,506 : INFO : PROGRESS: at sentence #560000, processed 12518929 words, keeping 105997 word types


2016-10-02 14:24:46,558 : INFO : PROGRESS: at sentence #570000, processed 12748076 words, keeping 106787 word types


2016-10-02 14:24:46,609 : INFO : PROGRESS: at sentence #580000, processed 12969572 words, keeping 107665 word types


2016-10-02 14:24:46,659 : INFO : PROGRESS: at sentence #590000, processed 13195097 words, keeping 108501 word types


2016-10-02 14:24:46,711 : INFO : PROGRESS: at sentence #600000, processed 13417295 words, keeping 109218 word types


2016-10-02 14:24:46,760 : INFO : PROGRESS: at sentence #610000, processed 13638318 words, keeping 110092 word types


2016-10-02 14:24:46,811 : INFO : PROGRESS: at sentence #620000, processed 13864643 words, keeping 110837 word types


2016-10-02 14:24:46,863 : INFO : PROGRESS: at sentence #630000, processed 14088929 words, keeping 111610 word types


2016-10-02 14:24:46,911 : INFO : PROGRESS: at sentence #640000, processed 14309712 words, keeping 112416 word types


2016-10-02 14:24:46,960 : INFO : PROGRESS: at sentence #650000, processed 14535468 words, keeping 113196 word types


2016-10-02 14:24:47,009 : INFO : PROGRESS: at sentence #660000, processed 14758258 words, keeping 113945 word types


2016-10-02 14:24:47,054 : INFO : PROGRESS: at sentence #670000, processed 14981651 words, keeping 114643 word types


2016-10-02 14:24:47,100 : INFO : PROGRESS: at sentence #680000, processed 15206483 words, keeping 115354 word types


2016-10-02 14:24:47,149 : INFO : PROGRESS: at sentence #690000, processed 15428676 words, keeping 116131 word types


2016-10-02 14:24:47,202 : INFO : PROGRESS: at sentence #700000, processed 15657382 words, keeping 116943 word types


2016-10-02 14:24:47,254 : INFO : PROGRESS: at sentence #710000, processed 15880371 words, keeping 117596 word types


2016-10-02 14:24:47,303 : INFO : PROGRESS: at sentence #720000, processed 16105658 words, keeping 118221 word types


2016-10-02 14:24:47,349 : INFO : PROGRESS: at sentence #730000, processed 16332039 words, keeping 118954 word types


2016-10-02 14:24:47,394 : INFO : PROGRESS: at sentence #740000, processed 16553072 words, keeping 119668 word types


2016-10-02 14:24:47,440 : INFO : PROGRESS: at sentence #750000, processed 16771399 words, keeping 120295 word types


2016-10-02 14:24:47,486 : INFO : PROGRESS: at sentence #760000, processed 16990803 words, keeping 120930 word types


2016-10-02 14:24:47,534 : INFO : PROGRESS: at sentence #770000, processed 17217940 words, keeping 121703 word types


2016-10-02 14:24:47,582 : INFO : PROGRESS: at sentence #780000, processed 17448086 words, keeping 122402 word types


2016-10-02 14:24:47,631 : INFO : PROGRESS: at sentence #790000, processed 17675162 words, keeping 123066 word types


2016-10-02 14:24:47,662 : INFO : collected 123504 word types from a corpus of 17798263 raw words and 795538 sentences


2016-10-02 14:24:47,763 : INFO : min_count=40 retains 16490 unique words (drops 107014)


2016-10-02 14:24:47,764 : INFO : min_count leaves 17239118 word corpus (96% of original 17798263)


2016-10-02 14:24:47,818 : INFO : deleting the raw counts dictionary of 123504 items


2016-10-02 14:24:47,824 : INFO : sample=0.001 downsamples 48 most-common words


2016-10-02 14:24:47,826 : INFO : downsampling leaves estimated 12749794 word corpus (74.0% of prior 17239118)


2016-10-02 14:24:47,828 : INFO : estimated required memory for 16490 words and 300 dimensions: 47821000 bytes


2016-10-02 14:24:47,884 : INFO : resetting layer weights


2016-10-02 14:24:48,128 : INFO : training model with 4 workers on 16490 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5


2016-10-02 14:24:48,129 : INFO : expecting 795538 sentences, matching count from corpus used for vocabulary survey


2016-10-02 14:24:49,143 : INFO : PROGRESS: at 1.60% examples, 1006626 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:24:50,151 : INFO : PROGRESS: at 3.20% examples, 1007931 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:24:51,160 : INFO : PROGRESS: at 4.69% examples, 981371 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:24:52,160 : INFO : PROGRESS: at 6.24% examples, 982277 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:24:53,163 : INFO : PROGRESS: at 7.87% examples, 992735 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:24:54,167 : INFO : PROGRESS: at 9.49% examples, 999352 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:24:55,171 : INFO : PROGRESS: at 11.10% examples, 1003120 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:24:56,179 : INFO : PROGRESS: at 12.67% examples, 1002540 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:24:57,184 : INFO : PROGRESS: at 14.24% examples, 1002874 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:24:58,187 : INFO : PROGRESS: at 15.82% examples, 1003125 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:24:59,192 : INFO : PROGRESS: at 17.40% examples, 1003096 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:00,198 : INFO : PROGRESS: at 19.00% examples, 1003622 words/s, in_qsize 8, out_qsize 0


2016-10-02 14:25:01,203 : INFO : PROGRESS: at 20.59% examples, 1004707 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:02,210 : INFO : PROGRESS: at 22.21% examples, 1005439 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:03,217 : INFO : PROGRESS: at 23.78% examples, 1004273 words/s, in_qsize 8, out_qsize 1


2016-10-02 14:25:04,219 : INFO : PROGRESS: at 25.28% examples, 1000791 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:05,221 : INFO : PROGRESS: at 26.80% examples, 998101 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:06,229 : INFO : PROGRESS: at 28.42% examples, 999824 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:07,230 : INFO : PROGRESS: at 29.99% examples, 1000218 words/s, in_qsize 8, out_qsize 0


2016-10-02 14:25:08,240 : INFO : PROGRESS: at 31.52% examples, 998683 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:09,249 : INFO : PROGRESS: at 33.10% examples, 999099 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:10,253 : INFO : PROGRESS: at 34.72% examples, 1000652 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:11,257 : INFO : PROGRESS: at 36.32% examples, 1001113 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:12,262 : INFO : PROGRESS: at 37.91% examples, 1001508 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:13,264 : INFO : PROGRESS: at 39.53% examples, 1002843 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:14,267 : INFO : PROGRESS: at 41.14% examples, 1003756 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:15,271 : INFO : PROGRESS: at 42.77% examples, 1004380 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:16,271 : INFO : PROGRESS: at 44.40% examples, 1005270 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:17,272 : INFO : PROGRESS: at 46.00% examples, 1005804 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:18,278 : INFO : PROGRESS: at 47.61% examples, 1006150 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:19,288 : INFO : PROGRESS: at 49.22% examples, 1006574 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:20,290 : INFO : PROGRESS: at 50.83% examples, 1007235 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:21,291 : INFO : PROGRESS: at 52.43% examples, 1007881 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:22,294 : INFO : PROGRESS: at 54.06% examples, 1008621 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:23,298 : INFO : PROGRESS: at 55.65% examples, 1008938 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:24,301 : INFO : PROGRESS: at 57.26% examples, 1009230 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:25,304 : INFO : PROGRESS: at 58.89% examples, 1009902 words/s, in_qsize 6, out_qsize 1


2016-10-02 14:25:26,306 : INFO : PROGRESS: at 60.51% examples, 1010728 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:27,309 : INFO : PROGRESS: at 62.15% examples, 1011325 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:28,311 : INFO : PROGRESS: at 63.78% examples, 1011753 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:29,312 : INFO : PROGRESS: at 65.41% examples, 1012168 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:30,319 : INFO : PROGRESS: at 67.00% examples, 1011888 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:31,321 : INFO : PROGRESS: at 68.62% examples, 1012450 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:32,321 : INFO : PROGRESS: at 70.24% examples, 1012991 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:33,323 : INFO : PROGRESS: at 71.82% examples, 1013160 words/s, in_qsize 6, out_qsize 1


2016-10-02 14:25:34,325 : INFO : PROGRESS: at 73.41% examples, 1013173 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:35,326 : INFO : PROGRESS: at 74.96% examples, 1012599 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:36,327 : INFO : PROGRESS: at 76.56% examples, 1012621 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:37,333 : INFO : PROGRESS: at 78.17% examples, 1013004 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:38,333 : INFO : PROGRESS: at 79.79% examples, 1013347 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:39,337 : INFO : PROGRESS: at 81.37% examples, 1013163 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:40,345 : INFO : PROGRESS: at 83.00% examples, 1013354 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:41,350 : INFO : PROGRESS: at 84.64% examples, 1013696 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:42,350 : INFO : PROGRESS: at 86.17% examples, 1012926 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:43,359 : INFO : PROGRESS: at 87.72% examples, 1012158 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:44,375 : INFO : PROGRESS: at 89.26% examples, 1011422 words/s, in_qsize 6, out_qsize 1


2016-10-02 14:25:45,382 : INFO : PROGRESS: at 90.87% examples, 1011740 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:46,384 : INFO : PROGRESS: at 92.46% examples, 1011882 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:47,389 : INFO : PROGRESS: at 94.10% examples, 1012359 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:48,393 : INFO : PROGRESS: at 95.72% examples, 1012704 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:49,395 : INFO : PROGRESS: at 97.35% examples, 1013052 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:50,404 : INFO : PROGRESS: at 98.97% examples, 1013165 words/s, in_qsize 7, out_qsize 0


2016-10-02 14:25:51,029 : INFO : worker thread finished; awaiting finish of 3 more threads


2016-10-02 14:25:51,034 : INFO : worker thread finished; awaiting finish of 2 more threads


2016-10-02 14:25:51,044 : INFO : worker thread finished; awaiting finish of 1 more threads


2016-10-02 14:25:51,046 : INFO : worker thread finished; awaiting finish of 0 more threads


2016-10-02 14:25:51,047 : INFO : training on 88991315 raw words (63752123 effective words) took 62.9s, 1013362 effective words/s


2016-10-02 14:25:51,049 : INFO : precomputing L2-norms of word weight vectors


2016-10-02 14:25:51,148 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None


2016-10-02 14:25:51,149 : INFO : not storing attribute cum_table


2016-10-02 14:25:51,150 : INFO : not storing attribute syn0norm


In [38]:
# The "doesnt_match" function will try to deduce which word in a set is most dissimilar from the others:
model.doesnt_match("man woman child kitchen".split())

'kitchen'

In [48]:
model.most_similar("china")

[('japan', 0.772025465965271),
 ('russia', 0.7666354179382324),
 ('spain', 0.7642085552215576),
 ('greece', 0.7607915997505188),
 ('germany', 0.7541348934173584),
 ('northern', 0.7521875500679016),
 ('india', 0.7501975893974304),
 ('europe', 0.7477932572364807),
 ('italy', 0.7419039607048035),
 ('poland', 0.738102376461029)]