# Text Data

In [1]:
import nltk
from math import log

## Download some nltk data packages

In [2]:
# POS tagger model; presumably what the nltk.pos_tag / pos_tag_sents calls further down rely on.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/spr18/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# Punkt tokenizer data; the Word2vec section loads 'tokenizers/punkt/english.pickle' from this package.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/spr18/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Named-entity chunker model; presumably required by nltk.chunk.ne_chunk below.
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/spr18/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [5]:
# 'words' corpus; presumably a dependency of the named-entity chunker -- confirm against the NLTK docs.
nltk.download('words')

[nltk_data] Downloading package words to /home/spr18/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [6]:
# Stopword lists read later through nltk.corpus.stopwords.words("english").
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/spr18/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## TF-IDF

In [7]:
# Toy corpus for the TF-IDF walkthrough: three short documents keyed by a one-letter id.
corpus = {
    'a':"Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.",
    'b':"Professor Plum has a green plant in his study.",
    'c':"Miss Scarlett watered Professor Plum's green plant while he was away from his office last week."
}

In [8]:
# Lower-cased, whitespace-split tokens of each document.
# Punctuation stays attached to its word (e.g. 'study.'), because only split() is applied.
terms = dict(
    (doc_id, [token.lower() for token in corpus[doc_id].split()])
    for doc_id in ('a', 'b', 'c')
)

In [9]:
terms

{'a': ['mr.',
  'green',
  'killed',
  'colonel',
  'mustard',
  'in',
  'the',
  'study',
  'with',
  'the',
  'candlestick.',
  'mr.',
  'green',
  'is',
  'not',
  'a',
  'very',
  'nice',
  'fellow.'],
 'b': ['professor',
  'plum',
  'has',
  'a',
  'green',
  'plant',
  'in',
  'his',
  'study.'],
 'c': ['miss',
  'scarlett',
  'watered',
  'professor',
  "plum's",
  'green',
  'plant',
  'while',
  'he',
  'was',
  'away',
  'from',
  'his',
  'office',
  'last',
  'week.']}

In [10]:
# Query terms scored against each document below; 'mr.' keeps its trailing dot
# so it matches the whitespace-split tokens.
queryTerms = ['mr.','green']

In [11]:
def tf(term, doc, normalize=True):
    """Term frequency of `term` in `doc`.

    The document is lower-cased and split on whitespace; `term` is lower-cased
    before counting, so matching is case-insensitive but punctuation-sensitive.

    Parameters:
        term: the term to count (string).
        doc: the document text (string).
        normalize: when True, divide the raw count by the number of tokens in
            the document; when False, return the raw count as a float.

    Returns:
        A float. For an empty (or whitespace-only) document the normalized
        frequency is defined as 0.0 instead of raising ZeroDivisionError.
    """
    tokens = doc.lower().split()
    occurrences = tokens.count(term.lower())
    if not normalize:
        return occurrences / 1.0
    if not tokens:
        # No tokens at all: nothing can occur in the document, and 0/0 is undefined.
        return 0.0
    return occurrences / float(len(tokens))

In [12]:
def idf(term, corpus):
    """Inverse document frequency of `term` over `corpus`.

    `corpus` is a sized iterable of document strings. A document contains the
    term when the lower-cased term equals one of its lower-cased,
    whitespace-split tokens.

    Returns 1.0 + ln(N / n), where N is the number of documents and n the
    number of documents containing the term, or 1.0 when no document
    contains the term.
    """
    docs_with_term = sum(
        1 for text in corpus if term.lower() in text.lower().split()
    )
    corpus_size = float(len(corpus))
    if docs_with_term == 0:
        # Term absent from every document: fall back to the neutral weight.
        return 1.0
    return 1.0 + log(corpus_size / docs_with_term)

In [13]:
def tf_idf(term, doc, corpus):
    """TF-IDF weight of `term` for `doc`: normalized tf(term, doc) times idf(term, corpus)."""
    term_frequency = tf(term, doc)
    inverse_document_frequency = idf(term, corpus)
    return term_frequency * inverse_document_frequency

In [14]:
# Same three documents as the `corpus` defined at the start of this section; re-assigned here unchanged.
corpus = {
    'a':"Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.",
    'b':"Professor Plum has a green plant in his study.",
    'c':"Miss Scarlett watered Professor Plum's green plant while he was away from his office last week."
}

In [15]:
# Show each document next to its key, in key order.
for (k,v) in sorted(corpus.items()):
    print k, ':', v

a : Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.
b : Professor Plum has a green plant in his study.
c : Miss Scarlett watered Professor Plum's green plant while he was away from his office last week.


In [16]:
queryScores = {'a':0,'b':0,'c':0}
for term in [t.lower() for t in queryTerms]:
    for doc in sorted(corpus):
        print 'TF(%s): %s' % (doc,term), tf(term, corpus[doc])
    print 'IDF: %s' % (term,), idf(term,corpus.values())
    
    for doc in sorted(corpus):
        score = tf_idf(term,corpus[doc],corpus.values())
        print 'TF-IDF(%s): %s' % (doc,term), score
        queryScores[doc] += score
    
    print "Overall TF-IDF scores for query '%s'" % (' '.join(queryTerms),)
    for (doc, score) in sorted(queryScores.items()):
        print doc, score

TF(a): mr. 0.105263157895
TF(b): mr. 0.0
TF(c): mr. 0.0
IDF: mr. 2.09861228867
TF-IDF(a): mr. 0.220906556702
TF-IDF(b): mr. 0.0
TF-IDF(c): mr. 0.0
Overall TF-IDF scores for query 'mr. green'
a 0.220906556702
b 0.0
c 0.0
TF(a): green 0.105263157895
TF(b): green 0.111111111111
TF(c): green 0.0625
IDF: green 1.0
TF-IDF(a): green 0.105263157895
TF-IDF(b): green 0.111111111111
TF-IDF(c): green 0.0625
Overall TF-IDF scores for query 'mr. green'
a 0.326169714597
b 0.111111111111
c 0.0625


## NLP Pipeline

In [17]:
txt = "Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow."

### EOS detection

In [18]:
# End-of-sentence detection: split the raw text into a list of sentence strings.
sentences = nltk.tokenize.sent_tokenize(txt)
sentences

['Mr. Green killed Colonel Mustard in the study with the candlestick.',
 'Mr. Green is not a very nice fellow.']

### Tokenization

In [19]:
# Tokenize each sentence separately, giving one list of tokens per sentence.
tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
tokens

[['Mr.',
  'Green',
  'killed',
  'Colonel',
  'Mustard',
  'in',
  'the',
  'study',
  'with',
  'the',
  'candlestick',
  '.'],
 ['Mr.', 'Green', 'is', 'not', 'a', 'very', 'nice', 'fellow', '.']]

### POS tagging

In [20]:
# Tag every sentence once and keep the per-sentence results under their own name.
pos_tagged_sentences = [nltk.pos_tag(t) for t in tokens]
# The chunking step below is fed a single tagged sentence, so continue with the
# first one only (a list of (token, tag) pairs).
pos_tagged_tokens = pos_tagged_sentences[0]

pos_tagged_tokens

[('Mr.', 'NNP'),
 ('Green', 'NNP'),
 ('killed', 'VBD'),
 ('Colonel', 'NNP'),
 ('Mustard', 'NNP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('study', 'NN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('candlestick', 'NN'),
 ('.', '.')]

### Chunking and Extraction

In [21]:
# Named-entity chunking of the POS-tagged first sentence; the result is printed as a bracketed tree.
tree = nltk.chunk.ne_chunk(pos_tagged_tokens)
print tree

(S
  (PERSON Mr./NNP)
  (PERSON Green/NNP)
  killed/VBD
  (ORGANIZATION Colonel/NNP Mustard/NNP)
  in/IN
  the/DT
  study/NN
  with/IN
  the/DT
  candlestick/NN
  ./.)


## Engineer some Twitter data

In [22]:
from nltk.tag import pos_tag_sents
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer

In [23]:
import json

In [24]:
# Each line of the file is parsed as one JSON object; only its 'text' field is kept.
tweets = []
# The with-block closes the file handle once all lines are read.
with open('data/positive_tweets.json', 'r') as tweets_file:
    for line in tweets_file:
        tweets.append(json.loads(line)['text'])

In [25]:
tweets

[u'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 u'@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 u'@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 u'@97sides CONGRATS :)',
 u'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days',
 u'@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM',
 u"We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI",
 u'@Impatientraider On second thought, there\u2019s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.',
 u'Jgh , but we have to go to Bayan :D bye',
 u'As an act of mischievousness, am calling the ETL layer of our in-hou

In [26]:
# Tokenization with the generic word tokenizer: split every tweet into
# sentences, then every sentence into tokens (one token list per sentence).
tweets_tokens = [
    word_tokenize(s)
    for tweet in tweets
    for s in sent_tokenize(tweet)
]

In [27]:
tweets_tokens

[[u'#',
  u'FollowFriday',
  u'@',
  u'France_Inte',
  u'@',
  u'PKuchly57',
  u'@',
  u'Milipol_Paris',
  u'for',
  u'being',
  u'top',
  u'engaged',
  u'members',
  u'in',
  u'my',
  u'community',
  u'this',
  u'week',
  u':',
  u')'],
 [u'@', u'Lamb2ja', u'Hey', u'James', u'!'],
 [u'How',
  u'odd',
  u':',
  u'/',
  u'Please',
  u'call',
  u'our',
  u'Contact',
  u'Centre',
  u'on',
  u'02392441234',
  u'and',
  u'we',
  u'will',
  u'be',
  u'able',
  u'to',
  u'assist',
  u'you',
  u':',
  u')',
  u'Many',
  u'thanks',
  u'!'],
 [u'@',
  u'DespiteOfficial',
  u'we',
  u'had',
  u'a',
  u'listen',
  u'last',
  u'night',
  u':',
  u')',
  u'As',
  u'You',
  u'Bleed',
  u'is',
  u'an',
  u'amazing',
  u'track',
  u'.'],
 [u'When', u'are', u'you', u'in', u'Scotland', u'?'],
 [u'!'],
 [u'@', u'97sides', u'CONGRATS', u':', u')'],
 [u'yeaaaah', u'yippppy', u'!', u'!', u'!'],
 [u'my',
  u'accnt',
  u'verified',
  u'rqst',
  u'has',
  u'succeed',
  u'got',
  u'a',
  u'blue',
  u'tick',
  u'

In [28]:
# Tokenization using TweetTokenizer (one token list per sentence).
# The tokenizer is built once and reused instead of being constructed for every sentence.
tweet_tokenizer = TweetTokenizer()
tweets_tokens2 = []
for tweet in tweets:
    for s in sent_tokenize(tweet):
        tweets_tokens2.append(tweet_tokenizer.tokenize(s))

In [29]:
tweets_tokens2

[[u'#FollowFriday',
  u'@France_Inte',
  u'@PKuchly57',
  u'@Milipol_Paris',
  u'for',
  u'being',
  u'top',
  u'engaged',
  u'members',
  u'in',
  u'my',
  u'community',
  u'this',
  u'week',
  u':)'],
 [u'@Lamb2ja', u'Hey', u'James', u'!'],
 [u'How',
  u'odd',
  u':/',
  u'Please',
  u'call',
  u'our',
  u'Contact',
  u'Centre',
  u'on',
  u'02392441234',
  u'and',
  u'we',
  u'will',
  u'be',
  u'able',
  u'to',
  u'assist',
  u'you',
  u':)',
  u'Many',
  u'thanks',
  u'!'],
 [u'@DespiteOfficial',
  u'we',
  u'had',
  u'a',
  u'listen',
  u'last',
  u'night',
  u':)',
  u'As',
  u'You',
  u'Bleed',
  u'is',
  u'an',
  u'amazing',
  u'track',
  u'.'],
 [u'When', u'are', u'you', u'in', u'Scotland', u'?'],
 [u'!'],
 [u'@97sides', u'CONGRATS', u':)'],
 [u'yeaaaah', u'yippppy', u'!', u'!', u'!'],
 [u'my',
  u'accnt',
  u'verified',
  u'rqst',
  u'has',
  u'succeed',
  u'got',
  u'a',
  u'blue',
  u'tick',
  u'mark',
  u'on',
  u'my',
  u'fb',
  u'profile',
  u':)',
  u'in',
  u'15',
  u

In [30]:
# POS-tag every sentence of the TweetTokenizer output in one call.
tweets_tagged = pos_tag_sents(tweets_tokens2)

In [31]:
tweets_tagged 

[[(u'#FollowFriday', 'JJ'),
  (u'@France_Inte', 'NNP'),
  (u'@PKuchly57', 'NNP'),
  (u'@Milipol_Paris', 'NNP'),
  (u'for', 'IN'),
  (u'being', 'VBG'),
  (u'top', 'JJ'),
  (u'engaged', 'VBN'),
  (u'members', 'NNS'),
  (u'in', 'IN'),
  (u'my', 'PRP$'),
  (u'community', 'NN'),
  (u'this', 'DT'),
  (u'week', 'NN'),
  (u':)', 'NN')],
 [(u'@Lamb2ja', 'NN'), (u'Hey', 'NNP'), (u'James', 'NNP'), (u'!', '.')],
 [(u'How', 'WRB'),
  (u'odd', 'JJ'),
  (u':/', 'JJ'),
  (u'Please', 'NNP'),
  (u'call', 'VB'),
  (u'our', 'PRP$'),
  (u'Contact', 'NNP'),
  (u'Centre', 'NNP'),
  (u'on', 'IN'),
  (u'02392441234', 'CD'),
  (u'and', 'CC'),
  (u'we', 'PRP'),
  (u'will', 'MD'),
  (u'be', 'VB'),
  (u'able', 'JJ'),
  (u'to', 'TO'),
  (u'assist', 'VB'),
  (u'you', 'PRP'),
  (u':)', 'VBP'),
  (u'Many', 'JJ'),
  (u'thanks', 'NNS'),
  (u'!', '.')],
 [(u'@DespiteOfficial', 'JJ'),
  (u'we', 'PRP'),
  (u'had', 'VBD'),
  (u'a', 'DT'),
  (u'listen', 'VBN'),
  (u'last', 'JJ'),
  (u'night', 'NN'),
  (u':)', 'NN'),
  (u'As'

In [32]:
# count POS tags
JJ_count = 0
NN_count = 0

for tweet in tweets_tagged:
    for pair in tweet:
        tag=pair[1]
        if tag == 'JJ':
            JJ_count += 1
        elif tag == 'NN':
            NN_count += 1

print JJ_count, NN_count

5941 13549


## Bag of Words

In [33]:
import pandas as pd

In [34]:
# Tab-separated file whose first row is the header. quoting=3 is csv.QUOTE_NONE,
# so double quotes in the file are kept as literal characters (visible in the output below).
train = pd.read_csv("data/labeledTrainData.tsv",header=0,delimiter="\t",quoting=3)
train

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
5,"""8196_8""",1,"""I dont know why people think this is such a b..."
6,"""7166_2""",0,"""This movie could have been very good, but com..."
7,"""10633_1""",0,"""I watched this video at a friend's house. I'm..."
8,"""319_1""",0,"""A friend of mine bought this film for £1, and..."
9,"""8713_10""",1,"""<br /><br />This movie is full of references...."


In [35]:
print train["review"][0]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

### Removing HTML markup using BeautifulSoup (bs4)

In [36]:
from bs4 import BeautifulSoup

In [37]:
# Parse the first review with the html5lib parser, then print its text with the markup removed.
example1 = BeautifulSoup(train["review"][0],"html5lib")

print example1.get_text()

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 mi

### Remove punctuation, numbers and stopwords

In [38]:
import re

In [39]:
# Replace every character that is not an ASCII letter (digits, punctuation, ...) with a space.
letters_only = re.sub("[^a-zA-Z]"," ",example1.get_text())
print letters_only

 With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    mi

In [40]:
# Lower-case the text and split it on whitespace into a list of words.
lower_case = letters_only.lower()
words = lower_case.split()

In [41]:
# deal with stopwords such as "a", "and", "is", "the"
from nltk.corpus import stopwords
print stopwords.words("english")

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u"you're", u"you've", u"you'll", u"you'd", u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u"she's", u'her', u'hers', u'herself', u'it', u"it's", u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u"that'll", u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'eac

In [42]:
words = [w for w in words if not w in stopwords.words("english")]
print words

[u'stuff', u'going', u'moment', u'mj', u'started', u'listening', u'music', u'watching', u'odd', u'documentary', u'watched', u'wiz', u'watched', u'moonwalker', u'maybe', u'want', u'get', u'certain', u'insight', u'guy', u'thought', u'really', u'cool', u'eighties', u'maybe', u'make', u'mind', u'whether', u'guilty', u'innocent', u'moonwalker', u'part', u'biography', u'part', u'feature', u'film', u'remember', u'going', u'see', u'cinema', u'originally', u'released', u'subtle', u'messages', u'mj', u'feeling', u'towards', u'press', u'also', u'obvious', u'message', u'drugs', u'bad', u'kay', u'visually', u'impressive', u'course', u'michael', u'jackson', u'unless', u'remotely', u'like', u'mj', u'anyway', u'going', u'hate', u'find', u'boring', u'may', u'call', u'mj', u'egotist', u'consenting', u'making', u'movie', u'mj', u'fans', u'would', u'say', u'made', u'fans', u'true', u'really', u'nice', u'actual', u'feature', u'film', u'bit', u'finally', u'starts', u'minutes', u'excluding', u'smooth', u'cri

In [43]:
def review_to_words( raw_review ):
    """Turn a raw review into a single string of cleaned words.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lower-case, split on whitespace, and drop English stopwords.

    Parameters:
        raw_review: the review text, possibly containing HTML markup.

    Returns:
        The remaining words joined by single spaces.
    """
    # Join text nodes with a space: with the default empty separator, words on
    # either side of a tag ("great<br />The") would be fused into one token.
    review_text = BeautifulSoup(raw_review,"html5lib").get_text(" ")
    letters_only = re.sub("[^a-zA-Z]"," ",review_text)
    words = letters_only.lower().split()
    # A set makes the per-word stopword lookup constant-time.
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)

In [44]:
# Try the cleaning function on the first review and print the result.
clean_review = review_to_words( train["review"][0] )
print clean_review

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

In [45]:
# Loop through and clean the whole training set at once.
num_reviews = train["review"].size
clean_train_reviews = []
for i in xrange(0, num_reviews):
    # Progress message after every 1000th review.
    if( (i+1)%1000 == 0):
        print "%d of %d\n" % (i+1, num_reviews)
    clean_train_reviews.append( review_to_words( train["review"][i] ) )

1000 of 25000

2000 of 25000

3000 of 25000

4000 of 25000

5000 of 25000

6000 of 25000

7000 of 25000

8000 of 25000

9000 of 25000

10000 of 25000

11000 of 25000

12000 of 25000

13000 of 25000

14000 of 25000

15000 of 25000

16000 of 25000

17000 of 25000

18000 of 25000

19000 of 25000

20000 of 25000

21000 of 25000

22000 of 25000

23000 of 25000

24000 of 25000

25000 of 25000



In [46]:
# Bag of words limited to the 5000 most frequent words (max_features=5000).
# The reviews were already cleaned above, so no tokenizer, preprocessor or
# stop-word list is passed to the vectorizer.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,stop_words=None,max_features=5000)
train_data_features = vectorizer.fit_transform(clean_train_reviews)
# Convert the fit_transform result into a dense numpy array.
train_data_features = train_data_features.toarray()

In [47]:
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [48]:
train_data_features.shape

(25000, 5000)

In [49]:
# Check the vocabulary: the feature names learned by the vectorizer.
vocab = vectorizer.get_feature_names()
print vocab



## Word2vec

In [50]:
import pandas as pd

In [51]:
train = pd.read_csv("data/labeledTrainData.tsv",header=0,delimiter="\t",quoting=3)

### To train word2vec we do not need to remove stopwords, since the algorithm relies on the broader context of the sentence to produce high-quality word vectors.

In [52]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [53]:
def review_to_wordlist( review, remove_stopwords=False ):
    """Convert a piece of review text into a list of lower-case words.

    HTML markup is stripped and every non-letter character is replaced by a
    space before the text is lower-cased and split on whitespace.

    Parameters:
        review: the text to convert, possibly containing HTML markup.
        remove_stopwords: when True, English stopwords are dropped from the result.

    Returns:
        A list of words.
    """
    # Join text nodes with a space: with the default empty separator, words on
    # either side of a tag ("great<br />The") would be fused into one token.
    review_text = BeautifulSoup(review,"html5lib").get_text(" ")
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words

### Word2vec expects single sentences, each one as a list of words (the input format is a list of lists)

In [54]:
import nltk.data

In [55]:
# Load the pickled English punkt sentence tokenizer from the NLTK data directory.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [56]:
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences and return each sentence as a list of words.

    Parameters:
        review: the review text; leading and trailing whitespace is stripped first.
        tokenizer: object whose tokenize(text) method returns the sentences.
        remove_stopwords: forwarded to review_to_wordlist.

    Returns:
        A list of word lists, one per non-empty sentence.
    """
    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in tokenizer.tokenize(review.strip())
        if len(raw_sentence) > 0
    ]

In [57]:
# Collect the word lists of every sentence of every review into one flat list.
sentences = []
for review in train["review"]:
    # Reviews are decoded from UTF-8 bytes before sentence splitting.
    sentences.extend(review_to_sentences(review.decode('utf-8'), tokenizer))

  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


In [58]:
print len(sentences)

266551


In [59]:
print sentences[0]

[u'with', u'all', u'this', u'stuff', u'going', u'down', u'at', u'the', u'moment', u'with', u'mj', u'i', u've', u'started', u'listening', u'to', u'his', u'music', u'watching', u'the', u'odd', u'documentary', u'here', u'and', u'there', u'watched', u'the', u'wiz', u'and', u'watched', u'moonwalker', u'again']


### Train the word2vec model

In [60]:
import logging

In [61]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [62]:
# Set values for parameters
num_features = 300 # word vector dimensionality
min_word_count = 40 # minimum word count
num_workers = 4 # number of threads to run in parallel
context = 10 # context window size
downsampling = 1e-3 # downsample setting for frequent words

In [63]:
from gensim.models import word2vec

2018-03-23 11:34:08,925 : INFO : 'pattern' package not found; tag filters are not available for English


In [64]:
# Train the Word2Vec model on the list of tokenized sentences, using the parameters set above.
# NOTE(review): no seed is passed, so the learned vectors presumably differ from run to run.
model = word2vec.Word2Vec(sentences, workers = num_workers, size=num_features, min_count=min_word_count, window=context,sample=downsampling)

2018-03-23 11:34:10,807 : INFO : collecting all words and their counts
2018-03-23 11:34:10,809 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-23 11:34:10,896 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2018-03-23 11:34:10,974 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2018-03-23 11:34:11,041 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2018-03-23 11:34:11,110 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2018-03-23 11:34:11,172 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2018-03-23 11:34:11,239 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2018-03-23 11:34:11,304 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2018-03-23 11:34:11,378 : INFO : PROGRESS: 

2018-03-23 11:34:31,881 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-03-23 11:34:31,884 : INFO : EPOCH - 5 : training on 5920725 raw words (4044208 effective words) took 3.7s, 1100049 effective words/s
2018-03-23 11:34:31,905 : INFO : training on a 29603625 raw words (20217623 effective words) took 18.9s, 1069041 effective words/s


In [65]:
# Precompute L2-normalized word vectors. With replace=True the normalized vectors
# presumably overwrite the originals, so the model should not be trained further
# -- confirm against the gensim docs.
model.init_sims(replace=True)

2018-03-23 11:36:24,881 : INFO : precomputing L2-norms of word weight vectors


In [66]:
# Save the model under a name that records its settings:
# 300 features, minimum word count 40, context window 10.
model_name="300features_40minwords_10context"
model.save(model_name)

2018-03-23 11:36:25,521 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2018-03-23 11:36:25,523 : INFO : not storing attribute vectors_norm
2018-03-23 11:36:25,525 : INFO : not storing attribute cum_table
2018-03-23 11:36:25,629 : INFO : saved 300features_40minwords_10context


### Explore the model results

In [67]:
from gensim.models import Word2Vec

In [68]:
# Load the previously saved model back from disk.
model = Word2Vec.load("300features_40minwords_10context")

2018-03-23 11:36:29,614 : INFO : loading Word2Vec object from 300features_40minwords_10context
2018-03-23 11:36:29,656 : INFO : loading vocabulary recursively from 300features_40minwords_10context.vocabulary.* with mmap=None
2018-03-23 11:36:29,658 : INFO : loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2018-03-23 11:36:29,659 : INFO : setting ignored attribute vectors_norm to None
2018-03-23 11:36:29,661 : INFO : loading trainables recursively from 300features_40minwords_10context.trainables.* with mmap=None
2018-03-23 11:36:29,662 : INFO : setting ignored attribute cum_table to None
2018-03-23 11:36:29,664 : INFO : loaded 300features_40minwords_10context


In [84]:
# deduce which word in a set is most dissimilar from the others
model.wv.doesnt_match("man woman child kitchen".split())

'kitchen'

In [77]:
model.wv.doesnt_match("france england germany berlin".split())

'berlin'

In [78]:
model.wv.doesnt_match("paris berlin london australia".split())

'paris'

In [79]:
# get insight into the model's word clusters
model.wv.most_similar("man")

[(u'woman', 0.6463024020195007),
 (u'soldier', 0.6306912302970886),
 (u'boy', 0.6249088048934937),
 (u'doctor', 0.6015763282775879),
 (u'murderer', 0.5756363868713379),
 (u'journalist', 0.5751552581787109),
 (u'cop', 0.5686773657798767),
 (u'guy', 0.564098596572876),
 (u'lady', 0.561547040939331),
 (u'businessman', 0.558123767375946)]

In [80]:
model.wv.most_similar("queen")

[(u'princess', 0.7676656246185303),
 (u'bride', 0.7588721513748169),
 (u'sophie', 0.7395970821380615),
 (u'victoria', 0.7277116179466248),
 (u'aunt', 0.7219737768173218),
 (u'betty', 0.7104591131210327),
 (u'harlow', 0.7082267999649048),
 (u'bach', 0.7048423290252686),
 (u'hannah', 0.7041159868240356),
 (u'mary', 0.7010382413864136)]

In [81]:
model.wv.most_similar("awful")

[(u'horrible', 0.8327822685241699),
 (u'terrible', 0.8104604482650757),
 (u'dreadful', 0.7363953590393066),
 (u'atrocious', 0.7141791582107544),
 (u'laughable', 0.6929225921630859),
 (u'lame', 0.6752660274505615),
 (u'pathetic', 0.6672459840774536),
 (u'horrendous', 0.652542233467102),
 (u'crappy', 0.6522139310836792),
 (u'bad', 0.6510164737701416)]

### Numeric representations of words
- The feature vector for each word is stored as one row of a NumPy array, exposed here as `model.wv.vectors` (called `syn0` in older gensim versions).

In [82]:
model.wv.vectors.shape

(8306, 300)

In [83]:
model.wv["flower"]

array([-0.04464524, -0.01729888, -0.06246409,  0.07163054,  0.08404221,
        0.02162345, -0.02939735,  0.00230872,  0.05055407,  0.01303973,
       -0.05600348, -0.11249622, -0.00242699,  0.07191952, -0.03745076,
        0.05204913, -0.04398435, -0.01262407, -0.06331637, -0.00094362,
       -0.07512797,  0.00120313,  0.09296246,  0.02870839,  0.01768998,
        0.00525393, -0.06947118, -0.10641769, -0.0238047 , -0.01047186,
       -0.01124007,  0.05279192, -0.01737494, -0.05213966,  0.05240201,
       -0.00250922,  0.04268034, -0.00825918, -0.02880999,  0.02016666,
        0.00337367,  0.03212991,  0.02401098, -0.08326312,  0.02394862,
       -0.01955871,  0.03970387,  0.03879419, -0.02596398,  0.07555451,
       -0.06086858,  0.0423749 ,  0.01572101, -0.03678506, -0.00331094,
        0.0477781 , -0.01207077, -0.04019843, -0.11505822, -0.02090936,
       -0.04566725, -0.04742807,  0.03130346, -0.08101349,  0.00338593,
        0.05322516,  0.0174869 ,  0.03101363,  0.10838465,  0.05