In [37]:
import pandas as pd

train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter='\t',quoting=3)
test = pd.read_csv("data/testData.tsv", header=0, delimiter='\t', quoting=3)

unlabeled_train = pd.read_csv("data/unlabeledTrainData.tsv", header=0, delimiter='\t', quoting=3)

print(train.size)
print(test.size)
print(unlabeled_train.size)

75000
50000
100000


In [38]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [39]:
def review_to_wordlist(review, remove_stopwords=False):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-z]", " ", review_text)
    words = review_text.lower().split()
    if(remove_stopwords):
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]
    return words

In [40]:
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

In [41]:
sentences = []

print("Parsing sentences from training set")

for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)
    
print ("Parsing sentences from unlabeled set")

for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [42]:
len(sentences)

795538

In [23]:
print(sentences[1])

['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


In [27]:
import logging
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count = min_word_count,
                          window=context, sample=downsampling)

model.init_sims(replace=True)

model_name = "300features_40minwords_10context"
model.save(model_name)

2019-07-10 09:50:20,532 : INFO : collecting all words and their counts
2019-07-10 09:50:20,533 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-07-10 09:50:20,642 : INFO : PROGRESS: at sentence #10000, processed 227352 words, keeping 18345 word types
2019-07-10 09:50:20,709 : INFO : PROGRESS: at sentence #20000, processed 455015 words, keeping 26112 word types


Training model...


2019-07-10 09:50:20,804 : INFO : PROGRESS: at sentence #30000, processed 675833 words, keeping 31668 word types
2019-07-10 09:50:20,875 : INFO : PROGRESS: at sentence #40000, processed 903705 words, keeping 36428 word types
2019-07-10 09:50:20,938 : INFO : PROGRESS: at sentence #50000, processed 1124382 words, keeping 40273 word types
2019-07-10 09:50:20,999 : INFO : PROGRESS: at sentence #60000, processed 1347279 words, keeping 43577 word types
2019-07-10 09:50:21,062 : INFO : PROGRESS: at sentence #70000, processed 1571925 words, keeping 46571 word types
2019-07-10 09:50:21,122 : INFO : PROGRESS: at sentence #80000, processed 1792766 words, keeping 49300 word types
2019-07-10 09:50:21,182 : INFO : PROGRESS: at sentence #90000, processed 2018456 words, keeping 52045 word types
2019-07-10 09:50:21,241 : INFO : PROGRESS: at sentence #100000, processed 2241761 words, keeping 54429 word types
2019-07-10 09:50:21,300 : INFO : PROGRESS: at sentence #110000, processed 2462827 words, keeping 

2019-07-10 09:50:28,563 : INFO : PROGRESS: at sentence #750000, processed 16885099 words, keeping 135708 word types
2019-07-10 09:50:28,633 : INFO : PROGRESS: at sentence #760000, processed 17105901 words, keeping 136444 word types
2019-07-10 09:50:28,724 : INFO : PROGRESS: at sentence #770000, processed 17334591 words, keeping 137324 word types
2019-07-10 09:50:29,060 : INFO : PROGRESS: at sentence #780000, processed 17566230 words, keeping 138131 word types
2019-07-10 09:50:29,310 : INFO : PROGRESS: at sentence #790000, processed 17794892 words, keeping 138912 word types
2019-07-10 09:50:29,465 : INFO : collected 139407 word types from a corpus of 17918782 raw words and 795538 sentences
2019-07-10 09:50:29,470 : INFO : Loading a fresh vocabulary
2019-07-10 09:50:29,798 : INFO : effective_min_count=40 retains 16612 unique words (11% of original 139407, drops 122795)
2019-07-10 09:50:29,799 : INFO : effective_min_count=40 leaves 17306744 word corpus (96% of original 17918782, drops 612

2019-07-10 09:51:26,300 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-10 09:51:26,311 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-10 09:51:26,324 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-10 09:51:26,358 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-10 09:51:26,359 : INFO : EPOCH - 2 : training on 17918782 raw words (12771129 effective words) took 27.0s, 473005 effective words/s
2019-07-10 09:51:27,386 : INFO : EPOCH 3 - PROGRESS: at 4.56% examples, 578864 words/s, in_qsize 6, out_qsize 1
2019-07-10 09:51:28,404 : INFO : EPOCH 3 - PROGRESS: at 9.28% examples, 583042 words/s, in_qsize 7, out_qsize 0
2019-07-10 09:51:29,408 : INFO : EPOCH 3 - PROGRESS: at 11.99% examples, 502841 words/s, in_qsize 7, out_qsize 0
2019-07-10 09:51:30,413 : INFO : EPOCH 3 - PROGRESS: at 16.40% examples, 516935 words/s, in_qsize 7, out_qsize 0
2019-07-10 09:51:31,420 : INFO : EPOCH 3 - PRO

2019-07-10 09:52:28,896 : INFO : EPOCH 5 - PROGRESS: at 30.23% examples, 540027 words/s, in_qsize 7, out_qsize 0
2019-07-10 09:52:29,909 : INFO : EPOCH 5 - PROGRESS: at 33.10% examples, 516484 words/s, in_qsize 7, out_qsize 0
2019-07-10 09:52:30,927 : INFO : EPOCH 5 - PROGRESS: at 36.71% examples, 509552 words/s, in_qsize 8, out_qsize 1
2019-07-10 09:52:31,943 : INFO : EPOCH 5 - PROGRESS: at 40.31% examples, 504173 words/s, in_qsize 7, out_qsize 0
2019-07-10 09:52:32,945 : INFO : EPOCH 5 - PROGRESS: at 42.79% examples, 487563 words/s, in_qsize 7, out_qsize 0
2019-07-10 09:52:33,974 : INFO : EPOCH 5 - PROGRESS: at 44.07% examples, 459853 words/s, in_qsize 7, out_qsize 0
2019-07-10 09:52:35,004 : INFO : EPOCH 5 - PROGRESS: at 47.12% examples, 453554 words/s, in_qsize 7, out_qsize 0
2019-07-10 09:52:36,018 : INFO : EPOCH 5 - PROGRESS: at 50.56% examples, 452229 words/s, in_qsize 7, out_qsize 0
2019-07-10 09:52:37,024 : INFO : EPOCH 5 - PROGRESS: at 54.56% examples, 455961 words/s, in_qsiz

In [44]:
model.doesnt_match("france england germany berlin".split())

  """Entry point for launching an IPython kernel.


'berlin'

In [45]:
model.most_similar("queen")

  """Entry point for launching an IPython kernel.


[('princess', 0.6686308979988098),
 ('latifah', 0.6347348690032959),
 ('bride', 0.6235207319259644),
 ('maid', 0.6224480867385864),
 ('stepmother', 0.6088650226593018),
 ('victoria', 0.6076782941818237),
 ('belle', 0.593917191028595),
 ('angela', 0.5797292590141296),
 ('mistress', 0.5797204375267029),
 ('aurora', 0.5775083303451538)]

In [52]:
import numpy as np

def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    
    nwords = 0
    index2word_set = set(model.wv.index2word)
    
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[word])
    
    featureVec = np.divide(featureVec, nwords)
    return featureVec

def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    
    for review in reviews:
        if counter%1000 == 0:
            print ("Review %d of %d" % (counter, len(reviews)))
            
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        
        counter = counter + 1
    return reviewFeatureVecs

In [53]:
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(review_to_wordlist(review,remove_stopwords=True))
    
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

print("Creating average feature vecs for test reviews")
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_to_wordlist(review, remove_stopwords=True))

testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 25000


  if sys.path[0] == '':


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Creating average feature vecs for test reviews
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 

In [57]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)

print("Fitting a random forest to labeled training data...")
forest = forest.fit(trainDataVecs, train["sentiment"])

result = forest.predict(testDataVecs)

output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3)

Fitting a random forest to labeled training data...
