### Load Gensim Library

In [2]:
!pip install gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/27/a4/d10c0acc8528d838cda5eede0ee9c784caa598dbf40bd0911ff8d067a7eb/gensim-3.6.0-cp36-cp36m-manylinux1_x86_64.whl (23.6MB)
[K    100% |████████████████████████████████| 23.6MB 1.3MB/s 
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Collecting boto>=2.32 (from smart-open>=1.2.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/23/10/c0b78c27298029e4454a472a1919bde20cb182dab1662cec7f2ca1dcc523/boto-2.49.0-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 13.6MB/s 
[?25hCollecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Collecting boto3 (from smart-open>=1.2.1->gensim)
[?25l  Downlo

In [0]:
import gensim

In [0]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

In [6]:
!ls

adc.json  labeledTrainData.tsv	sample_data  unlabeledTrainData.tsv


### Load Text Data

In [7]:
import pandas as pd
df = pd.read_csv('unlabeledTrainData.tsv', header=0, delimiter="\t", quoting=3)

print('Number of examples in Dataset: ', df.shape)
df.head()

Number of examples in Dataset:  (50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


### Function to Clean up data

In [0]:
import re, string

def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:    
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)	
  except:
    return ""

### Clean the Data using routine above

In [9]:
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


### Convert Review to a Word List

In [10]:
#List to hold all words in each review
documents = []

#Iterate over each review
for doc in df['clean_review']:
    documents.append(doc.split(' '))

print(len(documents))
print(documents[0])

50000
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


### Build the Model

In [11]:
#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=10, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPUs
                               size=50,  #Embedding size
                               window=5, #Maximum Distance between current and predicted word
                               iter=10   #Number of iterations over the text corpus
                              )  

2018-11-03 07:35:30,883 : INFO : collecting all words and their counts
2018-11-03 07:35:30,885 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-03 07:35:31,566 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2018-11-03 07:35:32,270 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2018-11-03 07:35:32,973 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2018-11-03 07:35:33,646 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2018-11-03 07:35:34,307 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2018-11-03 07:35:34,309 : INFO : Loading a fresh vocabulary
2018-11-03 07:35:34,467 : INFO : effective_min_count=10 retains 28322 unique words (28% of original 100479, drops 72157)
2018-11-03 07:35:34,468 : INFO : effective_min_count=10 leaves 11910457 word cor

# Exploring the model

### How many words in the model

In [12]:
#Model size
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(28322, 50)

In [13]:
# Vocablury of the model
model.wv.vocab

{'watching': <gensim.models.keyedvectors.Vocab at 0x7f0243f89c18>,
 'time': <gensim.models.keyedvectors.Vocab at 0x7f0243a82ac8>,
 'chasers': <gensim.models.keyedvectors.Vocab at 0x7f0243a82a58>,
 'it': <gensim.models.keyedvectors.Vocab at 0x7f0243a82a20>,
 'obvious': <gensim.models.keyedvectors.Vocab at 0x7f0243a82a90>,
 'that': <gensim.models.keyedvectors.Vocab at 0x7f021c2b51d0>,
 'was': <gensim.models.keyedvectors.Vocab at 0x7f021c2b5160>,
 'made': <gensim.models.keyedvectors.Vocab at 0x7f021c2b50b8>,
 'by': <gensim.models.keyedvectors.Vocab at 0x7f021c2b5128>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f021c2b5240>,
 'bunch': <gensim.models.keyedvectors.Vocab at 0x7f021c2b52e8>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7f021c2b52b0>,
 'friends': <gensim.models.keyedvectors.Vocab at 0x7f021c2b5320>,
 'maybe': <gensim.models.keyedvectors.Vocab at 0x7f021c2b5358>,
 'they': <gensim.models.keyedvectors.Vocab at 0x7f021c2b5390>,
 'were': <gensim.models.keyedvectors.Vocab at 0x7f

### Get an embedding for a word

In [14]:
model.wv['flower']

array([-0.2856322 ,  1.8690357 ,  1.1131401 , -0.05964514, -1.2817084 ,
        0.81716233, -0.40908554, -0.631256  , -1.0271999 ,  0.80576074,
        1.0246943 , -0.8428521 ,  0.45067772,  0.41216603,  0.53160316,
       -0.61579573, -1.4384805 , -0.3435652 ,  1.1133881 ,  0.39138207,
       -0.17454813, -0.61728454,  0.36534303,  0.5923682 , -0.12844305,
        0.9723035 , -0.19784941, -0.41993052, -0.9005013 ,  0.5294631 ,
        1.1154263 , -0.40063667,  0.30552238, -0.37215015, -0.51981354,
       -0.88963205, -1.028535  ,  0.41966107, -0.44275188,  0.30857357,
        0.12868123,  1.098906  ,  0.2299486 ,  1.5636653 ,  0.09110276,
        0.61304325, -0.23170473,  0.11386333,  0.30175668, -0.13751759],
      dtype=float32)

### Finding Words which have similar meaning

In [15]:
model.wv.most_similar('great')

2018-11-03 07:43:49,411 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('terrific', 0.8819054961204529),
 ('fantastic', 0.8809152841567993),
 ('wonderful', 0.8730698823928833),
 ('fine', 0.849142849445343),
 ('good', 0.8334894776344299),
 ('brilliant', 0.8108375668525696),
 ('superb', 0.7947901487350464),
 ('perfect', 0.7504172921180725),
 ('nice', 0.7463917136192322),
 ('amazing', 0.7434860467910767)]

### Find the word which is not like others

In [16]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


'kitchen'

### Saving the model

In [0]:
model.save('word2vec-movie-50')

In [0]:
#Load model from memory
model = gensim.models.Word2Vec.load('word2vec-movie-50')

1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [17]:
model.most_similar(positive=['king','man'], negative=['queen'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('prophecy', 0.5854490399360657),
 ('soldier', 0.5454061627388),
 ('scientist', 0.5230584740638733),
 ('guy', 0.49423331022262573),
 ('master', 0.4940657913684845),
 ('enforcer', 0.4927676320075989),
 ('seed', 0.48684585094451904),
 ('freddy', 0.4807429313659668),
 ('mercenary', 0.4800286591053009),
 ('ryu', 0.4785825312137604)]