In [1]:
import pandas as pd
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

# Load the three tab-separated review files. They share the same layout, so
# the parsing options are declared once: first row is the header, fields are
# tab-delimited, and quoting=3 (csv.QUOTE_NONE) keeps quote characters literal.
tsv_options = dict(header=0, delimiter="\t", quoting=3)
train = pd.read_csv("labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("testData.tsv", **tsv_options)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", **tsv_options)

# Sanity check: report how many reviews each file contributed
# (100,000 expected in total).
review_counts = (
    train["review"].size,
    test["review"].size,
    unlabeled_train["review"].size,
)
print("Read %d labeled train reviews, %d labeled test reviews, "
      "and %d unlabeled reviews\n" % review_counts)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [2]:
# Cache for the stop-word set so the NLTK corpus is read once, not once per
# call (this function is invoked for every sentence in the corpus).
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, built on first use."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def review_to_wordlist( review, remove_stopwords=False ):
    """Convert a document to a list of lowercase words.

    Args:
        review: Raw text of the document; may contain HTML markup.
        remove_stopwords: When True, drop NLTK English stop words from the
            result (False by default).

    Returns:
        A list of lowercase, purely alphabetic tokens in document order.
    """
    # 1. Remove HTML
    review_text = BeautifulSoup(review, "lxml").get_text()

    # 2. Replace every non-letter with a space. Digits and punctuation become
    #    separators, so a contraction such as "I've" yields "i" and "ve".
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert words to lower case and split them on whitespace
    words = review_text.lower().split()

    # 4. Optionally remove stop words
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    # 5. Return a list of words
    return words

In [3]:
import nltk.data

In [4]:
# Load the English Punkt sentence tokenizer from the local NLTK data store.
# It is passed to review_to_sentences() to split each review into sentences.
# NOTE(review): this needs the "punkt" NLTK data package to be installed
# beforehand; newer NLTK releases reportedly ship "punkt_tab" instead of this
# pickle -- confirm against the installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences, each returned as a list of words.

    Args:
        review: Raw review text.
        tokenizer: Sentence tokenizer exposing ``tokenize(text)`` that returns
            an iterable of sentence strings (the Punkt tokenizer here).
        remove_stopwords: Forwarded to review_to_wordlist().

    Returns:
        A list of sentences, where each sentence is the word list produced by
        review_to_wordlist(). A sentence that contains no letters still
        contributes an (empty) list.
    """
    # 0. Remove URLs from the review. The pattern is a raw string so that
    #    \w and \. reach the regex engine without triggering Python's
    #    invalid-escape-sequence warning.
    no_url = re.sub(
        r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?",
        " ",
        review,
    )

    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(no_url.strip())

    # 2. Turn every non-empty sentence into a word list
    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if raw_sentence
    ]

In [5]:
# Collect the tokenised sentences of both corpora into one flat list.
# The list is reset here, so re-running the cell does not duplicate entries.
sentences = []

corpora = (
    ("Parsing sentences from training set", train),
    ("Parsing sentences from unlabeled set", unlabeled_train),
)
for banner, frame in corpora:
    print(banner)
    for review in frame["review"]:
        sentences.extend(review_to_sentences(review, tokenizer))


Parsing sentences from training set


  'Beautiful Soup.' % markup)


Parsing sentences from unlabeled set


  'Beautiful Soup.' % markup)


In [6]:
# Total number of parsed sentences gathered from the labeled and unlabeled sets.
print(len(sentences))

795532


In [7]:
# Peek at the first parsed sentence: a flat list of lowercase word tokens.
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [8]:
# Second parsed sentence, to confirm the review was split at sentence boundaries.
print(sentences[1])

['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


In [9]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
# Timestamp, severity and message on every log line; INFO level so that the
# Word2Vec progress messages are shown.
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)
# Initialize and train the model (this will take some time)
from gensim.models import word2vec

2017-03-11 10:46:29,862 : INFO : 'pattern' package not found; tag filters are not available for English


In [10]:
# Hyper-parameters handed to word2vec.Word2Vec in the training cell
num_features = 300    # Word vector dimensionality (passed as `size`)
min_word_count = 40   # Minimum word count (passed as `min_count`)
num_workers = 4       # Number of threads to run in parallel (passed as `workers`)
context = 10          # Context window size (passed as `window`)
downsampling = 1e-3   # Downsample setting for frequent words (passed as `sample`)

In [11]:
print("Training model...")
# Train on the parsed sentences using the hyper-parameters defined in the
# settings cell (num_workers, num_features, min_word_count, context,
# downsampling).
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# Save the model under a name derived from the hyper-parameters, so the file
# name cannot drift out of sync with the settings that produced it. With the
# current values this yields "300features_40minwords_10context".
# You can load it later using Word2Vec.load().
model_name = "%dfeatures_%dminwords_%dcontext" % (
    num_features, min_word_count, context)
model.save(model_name)

2017-03-11 10:46:29,905 : INFO : collecting all words and their counts
2017-03-11 10:46:29,906 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-03-11 10:46:29,955 : INFO : PROGRESS: at sentence #10000, processed 225799 words, keeping 17773 word types
2017-03-11 10:46:30,007 : INFO : PROGRESS: at sentence #20000, processed 451864 words, keeping 24940 word types
2017-03-11 10:46:30,057 : INFO : PROGRESS: at sentence #30000, processed 671284 words, keeping 30022 word types


Training model...


2017-03-11 10:46:30,109 : INFO : PROGRESS: at sentence #40000, processed 897753 words, keeping 34337 word types
2017-03-11 10:46:30,159 : INFO : PROGRESS: at sentence #50000, processed 1116926 words, keeping 37748 word types
2017-03-11 10:46:30,210 : INFO : PROGRESS: at sentence #60000, processed 1338325 words, keeping 40707 word types
2017-03-11 10:46:30,260 : INFO : PROGRESS: at sentence #70000, processed 1561492 words, keeping 43317 word types
2017-03-11 10:46:30,314 : INFO : PROGRESS: at sentence #80000, processed 1780792 words, keeping 45695 word types
2017-03-11 10:46:30,365 : INFO : PROGRESS: at sentence #90000, processed 2004860 words, keeping 48112 word types
2017-03-11 10:46:30,417 : INFO : PROGRESS: at sentence #100000, processed 2226848 words, keeping 50183 word types
2017-03-11 10:46:30,468 : INFO : PROGRESS: at sentence #110000, processed 2446430 words, keeping 52056 word types
2017-03-11 10:46:30,519 : INFO : PROGRESS: at sentence #120000, processed 2668576 words, keepin

In [12]:
# Odd-one-out probe: which of these four words fits least with the others?
model.doesnt_match("man woman child kitchen".split())

'kitchen'

In [13]:
# Odd-one-out probe: three pieces of furniture and one unrelated noun.
model.doesnt_match("desk table chair god".split())

'god'

In [14]:
# Odd-one-out probe: telecom-flavoured words mixed with "movie".
model.doesnt_match("phone mobile movie network".split())

'movie'

In [15]:
# Odd-one-out probe: three countries and one city.
model.doesnt_match("france england germany berlin".split())

'berlin'

In [16]:
# Odd-one-out probe: three capital cities and one country.
model.doesnt_match("paris berlin london austria".split())

'austria'

In [17]:
# Nearest neighbours of "man": returns (word, similarity score) pairs, best first.
model.most_similar("man")

[('woman', 0.6254433393478394),
 ('lady', 0.5991166234016418),
 ('millionaire', 0.5377988219261169),
 ('lad', 0.5343524813652039),
 ('chap', 0.5198793411254883),
 ('guy', 0.5184804201126099),
 ('farmer', 0.5150920152664185),
 ('person', 0.5096235871315002),
 ('monk', 0.4978397488594055),
 ('men', 0.49767956137657166)]

In [18]:
# Nearest neighbours of "queen". Queries must be lowercase, because every
# token was lowercased before training.
model.most_similar("queen")

[('princess', 0.6834498047828674),
 ('belle', 0.6295305490493774),
 ('latifah', 0.6238353252410889),
 ('bride', 0.6002123355865479),
 ('nun', 0.5854156613349915),
 ('seductress', 0.5845000147819519),
 ('regina', 0.5790630578994751),
 ('stepmother', 0.5752915143966675),
 ('prince', 0.5752872228622437),
 ('goddess', 0.5739991664886475)]

In [19]:
# Nearest neighbours of "god"; presumably dominated by exclamations such as
# "oh my god" in the reviews, given "goodness" and "gosh" rank highly.
model.most_similar("god")

[('goodness', 0.6461307406425476),
 ('jesus', 0.5351325273513794),
 ('heavens', 0.5024672150611877),
 ('holy', 0.49229785799980164),
 ('gosh', 0.48883056640625),
 ('heaven', 0.47777146100997925),
 ('gods', 0.46265023946762085),
 ('dear', 0.4312209188938141),
 ('grail', 0.4188128113746643),
 ('wrath', 0.4008309543132782)]

In [20]:
# Every token is lowercased before training, so the capitalised "Queen" is
# not a vocabulary key. Catch the resulting KeyError and print it, so this
# demonstration no longer aborts a Restart & Run All.
try:
    print(model.most_similar("Queen"))
except KeyError as err:
    print(err)

KeyError: "word 'Queen' not in vocabulary"

In [21]:
# Nearest neighbours of a sentiment word; the results are all negative adjectives.
model.most_similar("awful")

[('terrible', 0.7763335704803467),
 ('horrible', 0.7417645454406738),
 ('abysmal', 0.7341878414154053),
 ('dreadful', 0.7324559092521667),
 ('atrocious', 0.7271116971969604),
 ('horrendous', 0.7015389204025269),
 ('appalling', 0.6938928961753845),
 ('horrid', 0.673977255821228),
 ('lousy', 0.6452018618583679),
 ('laughable', 0.6174856424331665)]

In [22]:
# Nearest neighbours of "house": other buildings and dwellings.
model.most_similar("house")

[('mansion', 0.741044282913208),
 ('farmhouse', 0.6267083287239075),
 ('apartment', 0.6173556447029114),
 ('cabin', 0.6168539524078369),
 ('basement', 0.5831036567687988),
 ('castle', 0.5814731121063232),
 ('hotel', 0.5772657990455627),
 ('houses', 0.5624116659164429),
 ('cemetery', 0.5523293614387512),
 ('room', 0.5395114421844482)]

In [23]:
# Nearest neighbours of "table": furniture and parts of a room or building.
model.most_similar("table")

[('desk', 0.6891450881958008),
 ('floor', 0.6831225156784058),
 ('couch', 0.6608123779296875),
 ('balcony', 0.6509571075439453),
 ('ceiling', 0.6409872770309448),
 ('coffee', 0.6386309862136841),
 ('roof', 0.6307677626609802),
 ('diner', 0.6298085451126099),
 ('wheel', 0.6163753271102905),
 ('lawn', 0.6120452880859375)]

In [24]:
# Nearest neighbours of "computer"; in this movie-review corpus they lean
# towards visual-effects vocabulary ("generated", "cgi", "graphics").
model.most_similar("computer")

[('computers', 0.6148785352706909),
 ('technology', 0.5999691486358643),
 ('generated', 0.5970050692558289),
 ('cgi', 0.586999773979187),
 ('digital', 0.5721021890640259),
 ('software', 0.5582946538925171),
 ('graphics', 0.526003360748291),
 ('equipment', 0.5192840695381165),
 ('cg', 0.5190702080726624),
 ('laser', 0.5137609839439392)]

In [25]:
# Nearest neighbours of "mobile". Apart from "cell" and "phones" the list is
# noisy -- presumably the word is rare in this corpus; verify its frequency.
model.most_similar("mobile")

[('cell', 0.6229346990585327),
 ('fires', 0.619251549243927),
 ('trucks', 0.6119228601455688),
 ('washing', 0.608405590057373),
 ('tanks', 0.598710834980011),
 ('laser', 0.5966479182243347),
 ('furniture', 0.5891460180282593),
 ('motor', 0.5792536735534668),
 ('phones', 0.5780150890350342),
 ('parachute', 0.5776175260543823)]

In [26]:
# Nearest neighbours of "car": other vehicles.
model.most_similar("car")

[('truck', 0.7519577741622925),
 ('jeep', 0.6660259962081909),
 ('bus', 0.665249228477478),
 ('bike', 0.6515318155288696),
 ('plane', 0.6242357492446899),
 ('garage', 0.6217759847640991),
 ('helicopter', 0.6137675046920776),
 ('boat', 0.6016305685043335),
 ('cab', 0.5965676307678223),
 ('train', 0.5895345211029053)]

In [27]:
# Nearest neighbours of "dice"; the low scores and unrelated words suggest a
# poorly learned vector -- presumably too few occurrences; verify.
model.most_similar("dice")

[('cigar', 0.549802839756012),
 ('tin', 0.5442754030227661),
 ('butter', 0.542699933052063),
 ('balls', 0.5180373191833496),
 ('pee', 0.5163849592208862),
 ('duck', 0.5100090503692627),
 ('juice', 0.49665337800979614),
 ('sweat', 0.4954224228858948),
 ('chips', 0.4887160360813141),
 ('gum', 0.4798174500465393)]

In [28]:
# "digit" is not in the trained vocabulary (presumably it occurs fewer than
# min_word_count times -- verify). Catch the KeyError and print it, so the
# out-of-vocabulary demonstration does not abort a Restart & Run All.
try:
    print(model.most_similar("digit"))
except KeyError as err:
    print(err)

KeyError: "word 'digit' not in vocabulary"

In [29]:
# Nearest neighbours of "exercise"; the results ("afterthought",
# "embarrassment", "insult") suggest the corpus mostly uses it figuratively,
# as in "an exercise in ..." -- an interpretation, not verified.
model.most_similar("exercise")

[('abundance', 0.6164333820343018),
 ('oddity', 0.6106451749801636),
 ('alarming', 0.610059380531311),
 ('afterthought', 0.6100172996520996),
 ('abomination', 0.5960849523544312),
 ('embarrassment', 0.592820405960083),
 ('ounce', 0.5902347564697266),
 ('excess', 0.5827727317810059),
 ('insult', 0.5797513723373413),
 ('observer', 0.5775488615036011)]

In [30]:
# Nearest neighbours of "good". Note that "bad" ranks second: the score
# reflects shared contexts, so antonyms can come out as close neighbours.
model.most_similar("good")

[('decent', 0.6829209327697754),
 ('bad', 0.6340751051902771),
 ('great', 0.6212310791015625),
 ('nice', 0.6000409126281738),
 ('fine', 0.5729934573173523),
 ('cool', 0.5637296438217163),
 ('mediocre', 0.5629239678382874),
 ('passable', 0.5612195730209351),
 ('lousy', 0.5608699321746826),
 ('solid', 0.527820885181427)]

In [31]:
# Nearest neighbours of "yellow": other colours mixed with clothing and props.
model.most_similar("yellow")

[('fur', 0.7448852062225342),
 ('tinted', 0.7302852869033813),
 ('helmet', 0.6969269514083862),
 ('blue', 0.6964552998542786),
 ('colored', 0.6921685934066772),
 ('red', 0.692031741142273),
 ('trench', 0.6850919723510742),
 ('horns', 0.6850051283836365),
 ('leather', 0.6819203495979309),
 ('masks', 0.6747287511825562)]

In [32]:
# Nearest neighbours of "sky": weather and landscape words.
model.most_similar("sky")

[('sun', 0.680997371673584),
 ('skies', 0.6526676416397095),
 ('clouds', 0.6435355544090271),
 ('river', 0.6240108013153076),
 ('heat', 0.6215382814407349),
 ('rain', 0.6108661890029907),
 ('sand', 0.6089564561843872),
 ('bridge', 0.6045998930931091),
 ('mist', 0.597465455532074),
 ('water', 0.597454309463501)]

In [33]:
# The learned word vectors are exposed as a plain NumPy array.
# NOTE(review): `syn0` looks like a legacy gensim attribute name -- confirm
# it still exists in the installed gensim version.
type(model.syn0)



numpy.ndarray

In [34]:
# Shape of the vector matrix: the second dimension equals num_features (300);
# the first is presumably one row per vocabulary word.
model.syn0.shape



(16485, 300)

In [None]:
# Look up the vector for a single word (cell not executed yet).
# NOTE(review): presumably raises KeyError for out-of-vocabulary words, like
# most_similar does -- confirm.
model["flower"]