In [None]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
from gensim.models import word2vec
import pandas as pd

# The Punkt sentence-tokenizer model must be available locally before this runs:
#  wget http://www.nltk.org/nltk_data/packages/tokenizers/punkt.zip
# Extract it in ~/nltk_data/packages/tokenizers/
# NOTE(review): nltk.data.load resolves 'tokenizers/punkt/...' relative to each
# directory on nltk.data.path, so the archive may actually need to be extracted
# under ~/nltk_data/tokenizers/ -- confirm against the local NLTK setup.
# NOTE(review): the resource is a pickle file; only load archives obtained from
# a trusted source.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

_STOPWORD_CACHE = {}


def _stopword_set(language="english"):
    # Load the NLTK stop-word list for `language` once and reuse it afterwards;
    # stopwords.words() would otherwise be re-read for every call of the caller.
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def description_to_wordlist( description, remove_stopwords=False ):
    """Convert a document to a list of lower-case words.

    Markup is stripped with BeautifulSoup, every character that is not an
    ASCII letter is replaced by a space, and the remainder is lower-cased
    and split on whitespace.

    Parameters
    ----------
    description : str
        Raw text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When true, words found in NLTK's English stop-word list are dropped.

    Returns
    -------
    list of str
        The words of the document, in their original order.
    """
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so results could differ between environments.
    des_text = BeautifulSoup(description, "html.parser").get_text()

    # Digits, punctuation and non-ASCII characters all become separators.
    des_text = re.sub("[^a-zA-Z]", " ", des_text)
    words = des_text.lower().split()

    if remove_stopwords:
        stops = _stopword_set()
        words = [w for w in words if w not in stops]

    return words

def description_to_sentences( description, tokenizer, remove_stopwords=True ):
    """Split a paragraph into parsed sentences.

    Parameters
    ----------
    description : str
        Text to split. An empty or None description yields an empty list.
    tokenizer : object
        Sentence tokenizer exposing ``tokenize(text)`` that returns a list of
        sentence strings (callers in this notebook pass the Punkt tokenizer
        loaded via ``nltk.data.load``).
    remove_stopwords : bool, optional
        Forwarded to ``description_to_wordlist`` for every sentence.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in document order.
    """
    if not description:
        return []

    # Split into sentences with the tokenizer. Iterating over the stripped
    # string itself would walk it one character at a time and never use the
    # tokenizer at all.
    raw_sentences = tokenizer.tokenize(description.strip())

    return [
        description_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]


def train_w2v(sentences_list, num_features=300, min_word_count=10,
              num_workers=4, context=5, downsampling=1e-3):
    """Train a word2vec model on tokenised sentences, save it and return it.

    Parameters
    ----------
    sentences_list : iterable of list of str
        Training corpus, one word list per sentence.
    num_features : int
        Word vector dimensionality.
    min_word_count : int
        Minimum word count; passed to gensim as ``min_count``.
    num_workers : int
        Number of worker threads.
    context : int
        Context window size.
    downsampling : float
        Downsampling setting for frequent words.

    Returns
    -------
    The trained model. It is also saved to the current working directory as
    ``"<num_features>features_<min_word_count>minwords_<context>context"``.
    """
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("Training model...")

    # NOTE(review): `size` is the keyword used by older gensim releases; newer
    # releases call it `vector_size` -- confirm against the installed version.
    model = word2vec.Word2Vec(sentences_list, workers=num_workers,
                              size=num_features, min_count=min_word_count,
                              window=context, sample=downsampling)

    # NOTE(review): per the gensim docs, replace=True keeps only the normalised
    # vectors, which saves memory but presumably prevents further training.
    model.init_sims(replace=True)

    model_name = "{}features_{}minwords_{}context".format(
        num_features, min_word_count, context)
    model.save(model_name)

    # Hand the model back so callers do not have to reload it from disk.
    return model

In [None]:
# Load both splits, replacing missing values with empty strings.
train = pd.read_csv('data/train.csv').fillna("")
test = pd.read_csv('data/test.csv').fillna("")

# Set aside the test ids and the training target before dropping the
# columns that are not used as features.
idx = test.id.values.astype(int)
y = train.median_relevance.values
train = train.drop(['id', 'median_relevance', 'relevance_variance'], axis=1)
test = test.drop('id', axis=1)

# Gather the parsed sentences of every product description, training set
# first and test set second, into one corpus.
sentences = []

print("Parsing sentences from training set")
for description in train["product_description"]:
    sentences.extend(
        description_to_sentences(description.decode("utf8"), tokenizer))

print("Parsing sentences from unlabeled test set")
for description in test["product_description"]:
    sentences.extend(
        description_to_sentences(description.decode("utf8"), tokenizer))

# Train the word2vec model on the combined corpus (train_w2v also saves it).
train_w2v(sentences)

In [None]:
# Load the pre-trained GoogleNews word2vec vectors (binary word2vec format).
# This is a separate file from the model that train_w2v saves.
# NOTE(review): newer gensim releases expose this loader on KeyedVectors
# rather than Word2Vec -- confirm against the installed version.
model = word2vec.Word2Vec.load_word2vec_format("data/GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# Reload both splits, replacing missing values with empty strings.
train = pd.read_csv('data/train.csv').fillna("")
test = pd.read_csv('data/test.csv').fillna("")

# Set aside the test ids and the training target before dropping the
# columns that are not used as features.
idx = test.id.values.astype(int)
y = train.median_relevance.values
train = train.drop(['id', 'median_relevance', 'relevance_variance'], axis=1)
test = test.drop('id', axis=1)

# Parse the product descriptions, keeping training and test sentences apart.
sentences_des_train = []
print("Parsing sentences from training set")
for description in train["product_description"]:
    sentences_des_train.extend(
        description_to_sentences(description.decode("utf8"), tokenizer))

sentences_des_test = []
print("Parsing sentences from unlabeled test set")
for description in test["product_description"]:
    sentences_des_test.extend(
        description_to_sentences(description.decode("utf8"), tokenizer))


In [None]:
# Parse the search queries the same way as the descriptions, keeping the
# training and test phrases in separate lists.
phrases_que_train = []
print("Parsing query phrases from training set")
for query in train["query"]:
    phrases_que_train.extend(
        description_to_sentences(query.decode("utf8"), tokenizer))

phrases_que_test = []
print("Parsing query phrases from test set")
for query in test["query"]:
    phrases_que_test.extend(
        description_to_sentences(query.decode("utf8"), tokenizer))

In [None]:
# Sanity check: print one "Hey!" for every parsed training query phrase
# whose length is 7 or more.
for q in phrases_que_train:
    if len(q) < 7:
        continue
    print("Hey!")

In [None]:
# Display the entry at index 5 of the parsed training-description sentences
# as a spot check of the parsing output.
sentences_des_train[5]