# Using Word2Vec Algorithm

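Word2Vec is an unsupervised algorithm: it learns word vectors from raw, unlabelled text, so it can be trained on both the labelled and the unlabelled reviews.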

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import spacy
import gensim
import cython

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
import warnings
warnings.filterwarnings('ignore')

In [0]:
# Read the three tab-separated review files from the mounted Drive folder.
lab_df = pd.read_csv(
    'drive/My Drive/Pytorch_DataSet/Bag Of Words Meets Bags of popcorn/labeledTrainData.tsv',
    sep='\t',
)
# quoting=3 is csv.QUOTE_NONE: double quotes in this file are kept as literal characters.
unlab_df = pd.read_csv(
    'drive/My Drive/Pytorch_DataSet/Bag Of Words Meets Bags of popcorn/unlabeledTrainData.tsv',
    sep='\t',
    quoting=3,
)
test_df = pd.read_csv(
    'drive/My Drive/Pytorch_DataSet/Bag Of Words Meets Bags of popcorn/testData.tsv',
    sep='\t',
)

In [5]:
lab_df.head(2)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."


In [6]:
unlab_df.head(2)

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."


In [7]:
len(lab_df), len(unlab_df), len(test_df)

(25000, 50000, 25000)

In [8]:
from nltk.corpus import stopwords # Import the stop word list
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [0]:
# Cleaning text
"""
def clean_text(text):
  text = text.lower()
  text = re.sub('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});','',text)  # for removal of html tags
  text = re.sub(r"[^a-zA-Z]"," ",text)
  text = re.sub('\W', ' ', text)  # If the comment/word does not contain any alphabets
  text = text.strip(' ') # Removing leading and trailing white spaces
  text = text.split()  # For words splitting
  return text
"""
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

# Lazily-built cache of NLTK's English stop words (filled by _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_wordlist(review, remove_stopwords=False):
    """Convert a raw review (or a single sentence) into a list of lower-case words.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML markup.
    remove_stopwords : bool, default False
        When true, words found in NLTK's English stop-word list are dropped.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    # 1. Strip HTML markup. The parser is named explicitly so the result does
    #    not depend on which optional parsers happen to be installed.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replace every character that is not an ASCII letter with a space
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Lower-case the text and split it on whitespace
    words = review_text.lower().split()

    # 4. Optionally drop stop words; the set is cached because this function
    #    is called once per sentence.
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    # 5. Return the list of words
    return words

In [10]:
"""
lab_df['review'] = lab_df['review'].apply(lambda text: clean_text(text))
unlab_df['review'] = unlab_df['review'].apply(lambda text: clean_text(text))
test_df['review'] = test_df['review'].apply(lambda text: clean_text(text))
"""

"\nlab_df['review'] = lab_df['review'].apply(lambda text: clean_text(text))\nunlab_df['review'] = unlab_df['review'].apply(lambda text: clean_text(text))\ntest_df['review'] = test_df['review'].apply(lambda text: clean_text(text))\n"

Next, we want a specific input format. Word2Vec expects single sentences, each one as a list of words. In other words, the input format is a list of lists.

Splitting a paragraph into sentences is not at all straightforward; natural language is full of gotchas. English sentences can end with "?", "!", a closing quotation mark ("), or ".", among other things, and spacing and capitalization are not reliable guides either. For this reason, we'll use NLTK's punkt tokenizer for sentence splitting. To use it, you need NLTK installed and must download the punkt model with nltk.download('punkt').

In [24]:
# Make sure NLTK's Punkt sentence-boundary data is available locally
# (the download must run before the load below).
import nltk.data
nltk.download('punkt')   

# Load the pre-trained English Punkt tokenizer; later cells pass it to
# review_to_sentences to split each review into sentences.
# NOTE(review): this resource is a pickle file -- only load it from a trusted NLTK data directory.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
tokenizer

<nltk.tokenize.punkt.PunktSentenceTokenizer at 0x7fbf215caeb8>

In [0]:
# Define a function to split a review into parsed sentences

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    Parameters
    ----------
    review : str
        Text of one review; surrounding whitespace is stripped before splitting.
    tokenizer
        Object whose ``tokenize(text)`` method returns the sentences of ``text``.
    remove_stopwords : bool, default False
        Passed through unchanged to ``review_to_wordlist``.

    Returns
    -------
    list of list
        One entry per non-empty sentence, each produced by ``review_to_wordlist``.
    """
    # Sentence boundaries come from the supplied tokenizer
    pieces = tokenizer.tokenize(review.strip())

    # Empty pieces are dropped; every other piece becomes a list of words
    return [
        review_to_wordlist(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0
    ]

In [31]:
# Flat list that will hold every tokenised sentence from both corpora
sentences = []

# Each pair is (progress message, frame whose reviews are parsed next)
for banner, corpus_df in (
    ("Parsing sentences from training set", lab_df),
    ("Parsing sentences from unlabeled set", unlab_df),
):
    print(banner)
    for review in corpus_df["review"]:
        sentences.extend(review_to_sentences(review, tokenizer))


Parsing sentences from training set
Parsing sentences from unlabeled set


In [32]:
print(len(sentences))

795872


In [34]:
' '.join(sentences[0])

'with all this stuff going down at the moment with mj i ve started listening to his music watching the odd documentary here and there watched the wiz and watched moonwalker again'

In [36]:
# Send gensim's progress messages through the standard logging module at INFO level
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Hyper-parameters handed to Word2Vec below
num_features = 300    # dimensionality of each word vector
min_word_count = 40   # minimum word count for the vocabulary
num_workers = 4       # number of worker threads
context = 10          # context window size
downsampling = 1e-3   # down-sampling setting for frequent words

# Build the vocabulary and train on the parsed sentences (slow on the full corpus).
# NOTE(review): `size=` and `init_sims` are gensim 3.x spellings; gensim 4 renamed
# `size` to `vector_size` -- confirm the installed version before upgrading.
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# No further training is planned, so let init_sims replace the vectors in place
# to make the model more memory-efficient.
model.init_sims(replace=True)

# Save under a name that records the settings; reload later with Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2020-05-24 15:33:20,179 : INFO : collecting all words and their counts
2020-05-24 15:33:20,184 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-05-24 15:33:20,244 : INFO : PROGRESS: at sentence #10000, processed 225664 words, keeping 17775 word types
2020-05-24 15:33:20,298 : INFO : PROGRESS: at sentence #20000, processed 451582 words, keeping 24944 word types
2020-05-24 15:33:20,349 : INFO : PROGRESS: at sentence #30000, processed 670632 words, keeping 30023 word types


Training model...


2020-05-24 15:33:20,406 : INFO : PROGRESS: at sentence #40000, processed 896478 words, keeping 34329 word types
2020-05-24 15:33:20,458 : INFO : PROGRESS: at sentence #50000, processed 1115469 words, keeping 37741 word types
2020-05-24 15:33:20,509 : INFO : PROGRESS: at sentence #60000, processed 1336692 words, keeping 40702 word types
2020-05-24 15:33:20,557 : INFO : PROGRESS: at sentence #70000, processed 1559365 words, keeping 43300 word types
2020-05-24 15:33:20,612 : INFO : PROGRESS: at sentence #80000, processed 1778623 words, keeping 45699 word types
2020-05-24 15:33:20,658 : INFO : PROGRESS: at sentence #90000, processed 2002603 words, keeping 48113 word types
2020-05-24 15:33:20,711 : INFO : PROGRESS: at sentence #100000, processed 2224101 words, keeping 50180 word types
2020-05-24 15:33:20,756 : INFO : PROGRESS: at sentence #110000, processed 2442894 words, keeping 52050 word types
2020-05-24 15:33:20,806 : INFO : PROGRESS: at sentence #120000, processed 2665092 words, keepin

In [37]:
model.doesnt_match("man woman child kitchen".split())

'kitchen'