In [1]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
from gensim.models import word2vec
import pandas as pd

# Load the Punkt English sentence tokenizer from NLTK's local data directory.
# If the resource is missing it can typically be fetched with
# nltk.download('punkt'), or manually:
#   wget http://www.nltk.org/nltk_data/packages/tokenizers/punkt.zip
# NOTE(review): the resource path used below is tokenizers/punkt/english.pickle,
# so the archive presumably has to be extracted to <nltk_data>/tokenizers/
# (e.g. ~/nltk_data/tokenizers/punkt/), not <nltk_data>/packages/tokenizers/
# -- confirm against your NLTK data path.
# NOTE(review): this resource is a pickle; only load it from a trusted source.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

_ENGLISH_STOPWORDS = None  # lazily-built cache, filled by _english_stopwords()


def _english_stopwords():
    # Build the NLTK English stop-word set once and reuse it on later calls.
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def description_to_wordlist( description, remove_stopwords=False ):
    """Convert a document to a list of lower-cased alphabetic words.

    Markup is stripped with BeautifulSoup, every character outside
    ``a-z``/``A-Z`` is replaced by a space, and the result is lower-cased
    and split on whitespace.

    Args:
        description: Text (possibly containing HTML markup) to tokenize.
        remove_stopwords: When true, drop words found in NLTK's English
            stop-word list.

    Returns:
        A list of word strings, in document order.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can vary between machines.
    # NOTE(review): get_text() joins text across tags without a separator, so
    # adjacent elements (e.g. consecutive <li>) can fuse into a single token;
    # consider get_text(" ") if that matters for the corpus.
    des_text = BeautifulSoup(description, "html.parser").get_text()
    des_text = re.sub("[^a-zA-Z]", " ", des_text)
    words = des_text.lower().split()

    if remove_stopwords:
        # Cached set: this function runs once per sentence, so rebuilding the
        # stop-word set on every call would dominate the runtime.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    return words

def description_to_sentences( description, tokenizer, remove_stopwords=False ):
    """Break a description into sentences, each given as a list of words.

    The description is stripped of surrounding whitespace and handed to
    ``tokenizer.tokenize``; every non-empty sentence it yields is converted
    with ``description_to_wordlist``.

    Args:
        description: The paragraph of text to split.
        tokenizer: Object exposing ``tokenize(text)`` that returns the
            sentences found in ``text``.
        remove_stopwords: Forwarded to ``description_to_wordlist``.

    Returns:
        A list of sentences, where each sentence is a list of words.
    """
    stripped = description.strip()
    return [
        description_to_wordlist(piece, remove_stopwords)
        for piece in tokenizer.tokenize(stripped)
        if len(piece) > 0  # skip empty sentences
    ]


def train_w2v(sentences_list, num_features=300, min_word_count=10,
              num_workers=4, context=5, downsampling=1e-3):
    """Train a word2vec model on tokenized sentences and save it to disk.

    The model is written to the current working directory under the name
    ``"{num_features}features_{min_word_count}minwords_{context}context"``.

    Args:
        sentences_list: Iterable of sentences, each a list of words.
        num_features: Word vector dimensionality.
        min_word_count: Minimum word count for a word to be kept.
        num_workers: Number of worker threads.
        context: Context window size.
        downsampling: Downsampling setting for frequent words.

    Returns:
        The trained ``Word2Vec`` model (also saved to disk), so callers do
        not have to reload it from the file.
    """
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("Training model...")
    model = word2vec.Word2Vec(
        sentences_list,
        workers=num_workers,
        size=num_features,
        min_count=min_word_count,
        window=context,
        sample=downsampling,
    )

    # NOTE(review): per the gensim docs, init_sims(replace=True) keeps only the
    # normalised vectors, which saves memory but means the model should not be
    # trained further -- confirm for the installed gensim version.
    model.init_sims(replace=True)

    model_name = "{}features_{}minwords_{}context".format(
        num_features, min_word_count, context)
    model.save(model_name)
    return model

Using gpu device 0: GeForce GTX 970


In [2]:
# Load both splits; missing values become empty strings so that every
# product description is a string.
train = pd.read_csv('data/train.csv').fillna("")
test = pd.read_csv('data/test.csv').fillna("")
idx = test.id.values.astype(int)
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)
y = train.median_relevance.values
train = train.drop(['median_relevance', 'relevance_variance'], axis=1)


def _as_text(value):
    # Decode UTF-8 byte strings; leave already-decoded text untouched so the
    # cell works whether pandas hands back bytes (Python 2) or str (Python 3).
    return value.decode("utf8") if isinstance(value, bytes) else value


# Extract sentences from descriptions of both training and test sets
sentences = []

for message, frame in (
    ("Parsing sentences from training set", train),
    ("Parsing sentences from unlabeled test set", test),
):
    print(message)
    for description in frame["product_description"]:
        sentences += description_to_sentences(_as_text(description), tokenizer)

# Train Model
train_w2v(sentences)

  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


Parsing sentences from training set
Parsing sentences from unlabeled test set

  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Training model...


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


In [3]:
# Load and test trained model.
# The file name is the one train_w2v() writes when called with its default
# arguments (num_features=300, min_word_count=10, context=5); it must be
# updated here if the model is trained with different settings.
model = word2vec.Word2Vec.load("300features_10minwords_5context")
# Sanity check: list the words the model scores as most similar to 'lenovo'.
model.most_similar('lenovo')

[(u'thinkpad', 0.8476001620292664),
 (u'ibm', 0.7556716799736023),
 (u'intel', 0.6887473464012146),
 (u'notebook', 0.6659045815467834),
 (u'uhs', 0.6368834972381592),
 (u'sdxc', 0.6269737482070923),
 (u'cf', 0.6235103607177734),
 (u'rw', 0.6204118132591248),
 (u'verbatim', 0.6137969493865967),
 (u'chipset', 0.5992798805236816)]

In [4]:
# Sanity check: list the words the model scores as most similar to 'winter'.
model.most_similar('winter')

[(u'summer', 0.7962464094161987),
 (u'nights', 0.7220483422279358),
 (u'rainy', 0.6389271020889282),
 (u'fall', 0.6318002939224243),
 (u'weekend', 0.6239671111106873),
 (u'rides', 0.6066479682922363),
 (u'lounge', 0.6060628294944763),
 (u'layering', 0.6030182242393494),
 (u'season', 0.6007533073425293),
 (u'formal', 0.5823827385902405)]

In [5]:
# Sanity check: list the words the model scores as most similar to 'red'.
model.most_similar('red')

[(u'blue', 0.8068062663078308),
 (u'white', 0.7729544043540955),
 (u'yellow', 0.7679733633995056),
 (u'gray', 0.7630442380905151),
 (u'purple', 0.7610870599746704),
 (u'pink', 0.7597286105155945),
 (u'navy', 0.7231534719467163),
 (u'fuchsia', 0.6991884708404541),
 (u'black', 0.6961089372634888),
 (u'tan', 0.6957448124885559)]