## IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from nltk.tokenize import word_tokenize
import csv
import os

## IMPORT DATA

In [2]:
# Locations of the two CSV splits, relative to the notebook
TRAIN_CSV = 'data/train.csv'
TEST_CSV = 'data/test.csv'

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [3]:
# (rows, columns) of the training and test frames
train_data.shape, test_data.shape

((1306122, 3), (375806, 2))

In [4]:
# Preview the first 10 training rows
train_data.head(10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


In [5]:
# Column labels of the training frame
train_data.columns

Index(['qid', 'question_text', 'target'], dtype='object')

In [6]:
# Preview the first 10 test rows
test_data.head(10)

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?
5,000101884c19f3515c1a,How do you train a pigeon to send messages?
6,00010f62537781f44a47,What is the currency in Langkawi?
7,00012afbd27452239059,"What is the future for Pandora, can the busine..."
8,00014894849d00ba98a9,My voice range is A2-C5. My chest voice goes u...
9,000156468431f09b3cae,How much does a tutor earn in Bangalore?


In [7]:
# Column labels of the test frame
test_data.columns

Index(['qid', 'question_text'], dtype='object')

### Split the training data into features and target

In [8]:
# Feature columns (question id + text) and the label column, taken separately
X_train = train_data[['qid','question_text']]
y_train = train_data['target']

#### Train Data

In [9]:
# First five feature rows of the training set
X_train[:5]

Unnamed: 0,qid,question_text
0,00002165364db923c7e6,How did Quebec nationalists see their province...
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco..."
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...


In [10]:
# First five target labels of the training set
y_train[:5]

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

#### Test Data

In [11]:
# Same two feature columns as X_train, taken from the test frame
X_test = test_data[['qid','question_text']]

In [12]:
# First five feature rows of the test set
X_test[:5]

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


## TEXT PREPROCESSING

In [13]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Regexes used to strip unwanted patterns from the text.
# Raw strings keep the backslashes literal, so Python does not emit
# invalid-escape-sequence warnings for `\[`, `\]` and `\|`.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
BAD_SYMBOLS_RE = re.compile(r'[^a-zA-Z]')  # keep ASCII letters only, for word2vec
# NLTK's English stop-word list, stored as a set for fast membership tests
STOPWORDS = set(stopwords.words('english'))

In [15]:
def text_prepare(text, remove_stopwords = False):
    """
    Clean one piece of raw text and split it into lowercase word tokens.

    text: a string, possibly containing HTML markup
    remove_stopwords: when True, drop every token found in STOPWORDS

    return: list of lowercase tokens made of ASCII letters only
    """
    # Strip HTML markup. The parser is named explicitly so the result does
    # not depend on which optional parsers happen to be installed.
    text = BeautifulSoup(text, "html.parser").get_text()

    # Replace every character that is not an ASCII letter with a space
    text = BAD_SYMBOLS_RE.sub(" ", text)

    # Lowercase, then split on whitespace
    words = text.lower().split()

    # `remove_stopwords` is a boolean flag, not a callable: filter the
    # tokens against the module-level STOPWORDS set.
    if remove_stopwords:
        words = [word for word in words if word not in STOPWORDS]

    return words

In [18]:
# Punkt tokenizer models for sentence splitting.
# Request only the 'punkt' package: a bare nltk.download() opens the
# interactive downloader and stalls a Restart & Run All.
import nltk.data
nltk.download('punkt')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [19]:
# Resolve the Punkt model through NLTK's data search path rather than a
# machine-specific absolute path, so the notebook runs wherever the
# 'punkt' package is installed.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [20]:
"""
Function to split the text into parsed sentences
Returns a list of sentences, where each sentence is a list of words
"""
def text_to_senteces(text, tokenizer, remove_stopwords = False):
    # NlTK tokenizer to split the paragtraph into sentences
    raw_texts = tokenizer.tokenize(text.strip())
    
    # loop over each sentence
    sentences = []
    for raw_text in raw_texts:
        # If a sentence is empty, skip it
        if len(raw_text) > 0:
            # otherwise, call text_prepare to get a list of words
            sentences.append(text_prepare(raw_text))
    
    # Return the list of sentencecs (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences

In [21]:
# Start from an empty corpus, then append the parsed sentences of each split
sentences = []

for banner, frame in (
    ("Parsing sentences from labeled training set", X_train),
    ("Parsing sentences from unlabeled set", X_test),
):
    print(banner)
    for text in frame['question_text']:
        sentences.extend(text_to_senteces(text, tokenizer))

Parsing sentences from labeled training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  '

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  '

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [26]:
# Total number of parsed sentences collected from both splits
print(len(sentences))

1914724


In [27]:
# Word list of the first parsed sentence
sentences[0]

['how',
 'did',
 'quebec',
 'nationalists',
 'see',
 'their',
 'province',
 'as',
 'a',
 'nation',
 'in',
 'the',
 's']

In [29]:
# Word list of the last parsed sentence; a negative index stays valid when
# the corpus size changes, unlike a hard-coded position.
sentences[-1]

['i',
 'mean',
 'i',
 'don',
 't',
 'think',
 'humans',
 'will',
 'survive',
 'on',
 'this',
 'earth',
 'for',
 'another',
 'years',
 'what',
 'do',
 'you',
 'think']

## Word2Vec model

In [30]:
# Logging module creates nice output messages
import logging

In [31]:
# Emit INFO-level records as "timestamp : level : message"
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [32]:
"""Set values for various parameters"""

# Word vector dimensionality
num_features = 300
# Minimum word count
min_word_count = 40
# Number of threads to run in parallel
num_workers = 4
# Context window size
context = 10
# Downsample setting for frequent words
downsampling = 1e-4

In [33]:
# Initialize and train the model
from gensim.models import word2vec

2019-04-26 17:56:41,812 : INFO : 'pattern' package not found; tag filters are not available for English


In [34]:
print("Training model...")
model_word2vec = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
# Keep only the normalised vectors to make the model much more
# memory-efficient.
# NOTE(review): per the gensim docs, training cannot be continued after
# init_sims(replace=True) — confirm before re-enabling any later .train() call.
model_word2vec.init_sims(replace=True)

2019-04-26 17:56:45,662 : INFO : collecting all words and their counts
2019-04-26 17:56:45,663 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-04-26 17:56:45,693 : INFO : PROGRESS: at sentence #10000, processed 112571 words, keeping 13902 word types
2019-04-26 17:56:45,724 : INFO : PROGRESS: at sentence #20000, processed 225358 words, keeping 20140 word types
2019-04-26 17:56:45,757 : INFO : PROGRESS: at sentence #30000, processed 338365 words, keeping 24944 word types
2019-04-26 17:56:45,783 : INFO : PROGRESS: at sentence #40000, processed 451240 words, keeping 29015 word types
2019-04-26 17:56:45,813 : INFO : PROGRESS: at sentence #50000, processed 564335 words, keeping 32403 word types
2019-04-26 17:56:45,843 : INFO : PROGRESS: at sentence #60000, processed 677543 words, keeping 35611 word types


Training model...


2019-04-26 17:56:45,874 : INFO : PROGRESS: at sentence #70000, processed 791481 words, keeping 38555 word types
2019-04-26 17:56:45,904 : INFO : PROGRESS: at sentence #80000, processed 905229 words, keeping 41192 word types
2019-04-26 17:56:45,936 : INFO : PROGRESS: at sentence #90000, processed 1019487 words, keeping 43731 word types
2019-04-26 17:56:45,968 : INFO : PROGRESS: at sentence #100000, processed 1132870 words, keeping 46116 word types
2019-04-26 17:56:45,999 : INFO : PROGRESS: at sentence #110000, processed 1246878 words, keeping 48337 word types
2019-04-26 17:56:46,026 : INFO : PROGRESS: at sentence #120000, processed 1359392 words, keeping 50369 word types
2019-04-26 17:56:46,058 : INFO : PROGRESS: at sentence #130000, processed 1472178 words, keeping 52405 word types
2019-04-26 17:56:46,090 : INFO : PROGRESS: at sentence #140000, processed 1585505 words, keeping 54401 word types
2019-04-26 17:56:46,121 : INFO : PROGRESS: at sentence #150000, processed 1698087 words, keep

2019-04-26 17:56:48,211 : INFO : PROGRESS: at sentence #790000, processed 8947307 words, keeping 129628 word types
2019-04-26 17:56:48,244 : INFO : PROGRESS: at sentence #800000, processed 9059987 words, keeping 130466 word types
2019-04-26 17:56:48,273 : INFO : PROGRESS: at sentence #810000, processed 9172498 words, keeping 131246 word types
2019-04-26 17:56:48,303 : INFO : PROGRESS: at sentence #820000, processed 9285904 words, keeping 132073 word types
2019-04-26 17:56:48,333 : INFO : PROGRESS: at sentence #830000, processed 9398907 words, keeping 132852 word types
2019-04-26 17:56:48,364 : INFO : PROGRESS: at sentence #840000, processed 9512658 words, keeping 133663 word types
2019-04-26 17:56:48,397 : INFO : PROGRESS: at sentence #850000, processed 9624914 words, keeping 134487 word types
2019-04-26 17:56:48,427 : INFO : PROGRESS: at sentence #860000, processed 9738467 words, keeping 135317 word types
2019-04-26 17:56:48,459 : INFO : PROGRESS: at sentence #870000, processed 985200

2019-04-26 17:56:50,435 : INFO : PROGRESS: at sentence #1500000, processed 16987805 words, keeping 179656 word types
2019-04-26 17:56:50,467 : INFO : PROGRESS: at sentence #1510000, processed 17102047 words, keeping 180294 word types
2019-04-26 17:56:50,503 : INFO : PROGRESS: at sentence #1520000, processed 17215026 words, keeping 180899 word types
2019-04-26 17:56:50,535 : INFO : PROGRESS: at sentence #1530000, processed 17327051 words, keeping 181499 word types
2019-04-26 17:56:50,567 : INFO : PROGRESS: at sentence #1540000, processed 17440937 words, keeping 182134 word types
2019-04-26 17:56:50,597 : INFO : PROGRESS: at sentence #1550000, processed 17554401 words, keeping 182785 word types
2019-04-26 17:56:50,633 : INFO : PROGRESS: at sentence #1560000, processed 17668285 words, keeping 183419 word types
2019-04-26 17:56:50,664 : INFO : PROGRESS: at sentence #1570000, processed 17782336 words, keeping 184023 word types
2019-04-26 17:56:50,694 : INFO : PROGRESS: at sentence #1580000,

2019-04-26 17:57:09,819 : INFO : EPOCH 2 - PROGRESS: at 21.66% examples, 726252 words/s, in_qsize 7, out_qsize 0
2019-04-26 17:57:10,826 : INFO : EPOCH 2 - PROGRESS: at 29.22% examples, 735421 words/s, in_qsize 7, out_qsize 0
2019-04-26 17:57:11,829 : INFO : EPOCH 2 - PROGRESS: at 35.54% examples, 716624 words/s, in_qsize 7, out_qsize 0
2019-04-26 17:57:12,833 : INFO : EPOCH 2 - PROGRESS: at 41.80% examples, 703285 words/s, in_qsize 7, out_qsize 0
2019-04-26 17:57:13,833 : INFO : EPOCH 2 - PROGRESS: at 47.80% examples, 689993 words/s, in_qsize 7, out_qsize 0
2019-04-26 17:57:14,837 : INFO : EPOCH 2 - PROGRESS: at 54.39% examples, 687249 words/s, in_qsize 7, out_qsize 0
2019-04-26 17:57:15,841 : INFO : EPOCH 2 - PROGRESS: at 60.79% examples, 683110 words/s, in_qsize 7, out_qsize 0
2019-04-26 17:57:16,842 : INFO : EPOCH 2 - PROGRESS: at 67.89% examples, 686926 words/s, in_qsize 8, out_qsize 0
2019-04-26 17:57:17,844 : INFO : EPOCH 2 - PROGRESS: at 74.52% examples, 685829 words/s, in_qsiz

In [35]:
# model_word2vec.train(sentences, total_examples=len(sentences), epochs=10)

In [36]:
# Save the model. The file name is built from the hyperparameters so it
# cannot drift out of sync with the values actually used for training
# (currently "300features_40minwords_10context").
model_name = "{}features_{}minwords_{}context".format(
    num_features, min_word_count, context
)
model_word2vec.save(model_name)

2019-04-26 17:58:04,023 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2019-04-26 17:58:04,024 : INFO : not storing attribute vectors_norm
2019-04-26 17:58:04,025 : INFO : not storing attribute cum_table
2019-04-26 17:58:05,017 : INFO : saved 300features_40minwords_10context


In [38]:
# Number of entries in the trained model's vocabulary
len(model_word2vec.wv.vocab)

18154

In [63]:
# Odd-one-out query. It goes through `.wv` like the other similarity
# queries; the model-level doesnt_match is deprecated and emits a warning.
model_word2vec.wv.doesnt_match("cricket football swimming tennis none".split())

  """Entry point for launching an IPython kernel.


'none'

In [45]:
# Words whose vectors are most similar to "india"
model_word2vec.wv.most_similar("india")

[('bangladesh', 0.7272980213165283),
 ('nepal', 0.7265176773071289),
 ('kerala', 0.6324079036712646),
 ('indian', 0.6255563497543335),
 ('malaysia', 0.6192783117294312),
 ('punjab', 0.6165116429328918),
 ('pakistan', 0.6109528541564941),
 ('gujarat', 0.6100716590881348),
 ('bihar', 0.6086043119430542),
 ('karachi', 0.600983738899231)]

In [50]:
# Words whose vectors are most similar to "man"
model_word2vec.wv.most_similar("man")

[('woman', 0.7190712690353394),
 ('guy', 0.6376925706863403),
 ('men', 0.6064354181289673),
 ('lady', 0.558889627456665),
 ('boy', 0.5585207939147949),
 ('person', 0.5341476202011108),
 ('husband', 0.5148203372955322),
 ('wife', 0.49383699893951416),
 ('girl', 0.492467999458313),
 ('whore', 0.48396003246307373)]