In [1]:
# Import necessary packages
import pandas as pd
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import re
import matplotlib
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()



In [2]:
import nltk
#nltk.download('all', halt_on_error=False)
from nltk.corpus import stopwords # Import the stop word list

In [3]:
# Read dataset
questions = pd.read_csv('questions_data_for_assignment.csv') 

In [4]:
questions.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,40146,79967,79968,If there was one movie that you would suggest ...,What is that one movie you will recommend some...,1
1,157322,311105,311106,Is Rick Perry electable?,Where is Rick Perry?,0
2,76441,151885,151886,Why am I getting my period so much?,Why am I getting my period twice a month?,0
3,355648,696531,696532,Why absolute refractive index can never be les...,Why does a relative refractive index less than...,0
4,136457,270171,270172,"As kids develop into teenagers, do they someti...","Do psychopaths, as older children and teenager...",0


In [5]:
# Fill missing values
questions['question2'].fillna('',inplace =True) 

# Creating bag of words model

In [6]:
# Function to convert a raw question to a string of words
# The input is a single string (question), and the output is a single string (a preprocessed question)
def review_to_words( raw_review ):
    # 1. Remove HTML tags 
    review_text = BeautifulSoup(raw_review).get_text() 
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    # 4. Convert the stop words to a set (Searching is faster in a set than a list)
    stops = set(stopwords.words("english"))                  
    # 5. Remove stop words and lemmatize
    meaningful_words = [w for w in words if not w in stops]   
    #meaningful_words = [wordnet_lemmatizer.lemmatize(w) for w in words]
    # 6. Join the words back into one string separated by space, and return the result.
    return( " ".join( meaningful_words ))   

In [7]:
# Get the number of questions based on the dataframe column size
num_questions = questions['question1'].size

# Initialize an empty list to hold clean questions
clean_train_questions = []

# Loop over each question in column question1 ; create an index i that goes from 0 to the length of the movie review list 
for i in range( 0, num_questions ):
    # If the index is divisible by 10000, print a message
    if( (i+1)%10000 == 0 ):
        print("Question %d of %d\n" % ( i+1, num_questions ))                                                                    
    clean_train_questions.append( review_to_words( questions['question1'][i] ))

# Repeat same for the column question2
for i in range( 0, num_questions ):
    if( (i+1)%10000 == 0 ):
        print("Question %d of %d\n" % ( i+1, num_questions ))                                                                    
    clean_train_questions.append( review_to_words( questions['question2'][i] ))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Question 10000 of 300000

Question 20000 of 300000

Question 30000 of 300000

Question 40000 of 300000

Question 50000 of 300000

Question 60000 of 300000

Question 70000 of 300000

Question 80000 of 300000

Question 90000 of 300000

Question 100000 of 300000

Question 110000 of 300000

Question 120000 of 300000

Question 130000 of 300000

Question 140000 of 300000

Question 150000 of 300000

Question 160000 of 300000

Question 170000 of 300000

Question 180000 of 300000

Question 190000 of 300000

Question 200000 of 300000

Question 210000 of 300000

Question 220000 of 300000

Question 230000 of 300000

Question 240000 of 300000

Question 250000 of 300000

Question 260000 of 300000

Question 270000 of 300000

Question 280000 of 300000

Question 290000 of 300000

Question 300000 of 300000

Question 10000 of 300000

Question 20000 of 300000

Question 30000 of 300000

Question 40000 of 300000

Question 50000 of 300000

Question 60000 of 300000

Question 70000 of 300000

Question 80000 of

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object (scikit-learn's bag of words tool.)  
# Select 10000 most commonly occuring words
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 10000) 

# Transform training data into feature vectors by giving input as list of strings
train_data_features = vectorizer.fit_transform(clean_train_questions)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

"questions['question1'] = questions.apply(lambda x: x['question1'].split(), axis =1)\nquestions['question2'] = questions.apply(lambda x: x['question2'].split(), axis =1)"

In [10]:
train_data_features.shape

(600000, 10000)

# Computing the meta features

In [11]:
import math
# Initialize empty list to hold our meta features
angles = []
distances = []
for i in range(0,num_questions):
    angle = 0
    dist = 0
    for j in range(0,10000):
        # Find angle by multiplying values of corresponding elements and summing it up 
        angle = angle + train_data_features[i][j] * train_data_features[i+300000][j]
        # Find distance as root of sum of differneces between corresponding elements
        dist = dist + (train_data_features[i][j] - train_data_features[i+300000][j])**2
    angles.append(angle)
    distances.append(math.sqrt(dist))
    # Print after every 10000 elements
    if(i%10000==0):
        print(i)


0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000


In [12]:
questions['angles'] = angles
questions['distances'] = distances

In [13]:
# Popping the columns with questions here since the questions are now unicode encoded and hence can not be read when "read_csv" command is used
questions.pop('question1')
questions.pop('question2')
# Save final dataset as a new csv to file
questions.to_csv('questions_final_dataset.csv')