In [1]:
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from functools import partial

from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer

In [69]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier

In [12]:
from gensim.models import Word2Vec

In [15]:
def review_to_wordlist(review, remove_stopwords=True):
    review_text = BeautifulSoup(review, "lxml").get_text()
    # Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
   
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    
    return words

def clean_review(review, remove_stopwords=True):
    return ' '.join(review_to_wordlist(review, remove_stopwords))

def review_to_sentences(review, tokenizer, remove_stopwords=True):
    raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())

    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) == 0:
            continue
        sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))

    return sentences

In [4]:
data_folder = '/home/agrigorev/tmp/data/bagofpopcorn'

In [5]:
train = pd.read_csv(data_folder + '/labeledTrainData.tsv', delimiter="\t", quoting=3)
train_unlab = pd.read_csv(data_folder + '/unlabeledTrainData.tsv', delimiter="\t", quoting=3)
test = pd.read_csv(data_folder + '/testData.tsv', delimiter="\t", quoting=3)

In [6]:
tokenizer = PunktSentenceTokenizer()
stokenizer = partial(review_to_sentences, tokenizer=tokenizer)

In [7]:
train['review_clean'] = train.review.apply(stokenizer)
train_unlab['review_clean'] = train.review.apply(stokenizer)
test['review_clean'] = test.review.apply(stokenizer)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))
  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


In [9]:
sentences = []
for ss in train.review_clean:
    if isinstance(ss, list):
        sentences.extend(ss)

for ss in train_unlab.review_clean:
    if isinstance(ss, list):
        sentences.extend(ss)

Description:

- https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors

In [13]:
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

In [14]:
model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling, seed=1)

In [16]:
model.doesnt_match('man woman child kitchen'.split())

'kitchen'

In [18]:
model.most_similar('awful')

[(u'terrible', 0.7435693740844727),
 (u'dreadful', 0.7191959619522095),
 (u'horrible', 0.6926224231719971),
 (u'atrocious', 0.689213752746582),
 (u'abysmal', 0.6871772408485413),
 (u'lousy', 0.6616935133934021),
 (u'horrid', 0.6603978872299194),
 (u'horrendous', 0.6189523935317993),
 (u'laughable', 0.6157838106155396),
 (u'appalling', 0.6149518489837646)]

In [19]:
model.save('300_features_40_minwords_10context.bin')

In [29]:
w1 = 'awful'
w2 = 'terrible'
v1 = model[w1]
v2 = model[w2]
v1.dot(v2) / (np.sqrt(v1.dot(v1)) * np.sqrt(v2.dot(v2)))

0.74356931

## From words to paragaphs: 

- https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-3-more-fun-with-word-vectors
- just sum - worse than BoW
- weight by TF-IDF - only marginally better
- clustering

### Clustering

In [37]:
from sklearn.cluster import KMeans, MiniBatchKMeans

In [35]:
word_vectors = model.syn0
word_count = word_vectors.shape[0]
word_vectors.shape

(13026, 300)

In [36]:
num_clusters = word_count / 5

In [44]:
norms = np.sqrt((word_vectors ** 2).sum(axis=1, keepdims=True))
norm_vectors = word_vectors / norms

In [52]:
km = MiniBatchKMeans(init='random', n_clusters=num_clusters, init_size=(3*num_clusters))
idx = km.fit_predict(norm_vectors)

  init_size=init_size)


In [54]:
word_centroid_map = dict(zip(model.index2word, idx))

In [55]:
for cluster in xrange(0,10):
    print "Cluster %d" % cluster
    print 
    words = []
    for i in xrange(0, len(word_centroid_map.values())):
        if word_centroid_map.values()[i] == cluster:
            words.append(word_centroid_map.keys()[i])
    print words
    print

Cluster 0

[u'attain']

Cluster 1

[u'perform', u'performing']

Cluster 2

[u'dumps']

Cluster 3

[u'wai']

Cluster 4

[u'jameson']

Cluster 5

[u'pushed']

Cluster 6

[u'sterile']

Cluster 7

[u'ambitious', u'virtue', u'bhandarkar', u'moreover', u'roeg', u'method', u'storyteller', u'input', u'blending', u'flair', u'demonstration', u'expertise', u'professionalism', u'ek', u'uniquely', u'firmly', u'mastery', u'immense', u'skill', u'capable', u'magnetic', u'skills', u'astute', u'distinction', u'benefits', u'excesses', u'attributes', u'sophistication', u'strengths', u'genius', u'grandeur', u'characterized', u'precision', u'restraint', u'knack', u'abilities', u'moods', u'characteristic', u'demonstrates', u'ability', u'capabilities']

Cluster 8

[u'person', u'everybody', u'anyone', u'someone', u'everyone', u'everything', u'somebody', u'anybody']

Cluster 9

[u'mourning']



In [66]:
def create_bag_of_centroids(wordlist, word_centroid_map):
    cnt = Counter()
    cnt.update([word_centroid_map[w] for w in wordlist if w in word_centroid_map])
    return cnt

In [63]:
def concat_lists(lists):
    if not isinstance(lists, list):
        return []

    res = []
    for l in lists:
        res.extend(l)
    return res

In [67]:
boc = partial(create_bag_of_centroids, word_centroid_map=word_centroid_map)
train['boc'] = train.review_clean.apply(concat_lists).apply(boc)

In [72]:
vect = DictVectorizer()
X_boc = vect.fit_transform(train.boc)
X_boc.shape

(25000, 2514)

In [73]:
forest = RandomForestClassifier(n_estimators=100)
forest.fit(X_boc, train.sentiment)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [74]:
test['boc'] = test.review_clean.apply(concat_lists).apply(boc)
X_boc_test = vect.transform(test.boc)

In [75]:
y_pred = forest.predict(X_boc_test)

In [77]:
out = pd.DataFrame({'id': test['id'], 'sentiment': y_pred})
out.to_csv( "BagOfCentroids.csv", index=False, quoting=3 )