In [2]:
import pandas as pd
import numpy as np
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names, stopwords
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from sklearn.model_selection import train_test_split

# Importing File

In [3]:
data = pd.read_csv('Text_Similarity_Dataset.csv')

# Data Familiarization

In [4]:
data.head(10)

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...
5,5,india seeks to boost construction india has cl...,music mogul fuller sells company pop idol supr...
6,6,podcasters look to net money nasa is doing it...,ukip outspent labour on eu poll the uk indepen...
7,7,row over police power for csos the police fe...,ban on hunting comes into force fox hunting wi...
8,8,election could be terror target terrorists m...,nhs waiting time target is cut hospital waitin...
9,9,japan economy slides to recession the japanese...,optimism remains over uk housing the uk proper...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4023 entries, 0 to 4022
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Unique_ID  4023 non-null   int64 
 1   text1      4023 non-null   object
 2   text2      4023 non-null   object
dtypes: int64(1), object(2)
memory usage: 94.4+ KB


In [6]:
data.isna().sum()

Unique_ID    0
text1        0
text2        0
dtype: int64

In [7]:
print('Size of data', data.shape)

Size of data (4023, 3)


In [8]:
text_size1 = data['text1'].apply(lambda x: len(x.split(' '))).sum()
text_size2 = data['text2'].apply(lambda x: len(x.split(' '))).sum()
print('Total number of words in document', text_size1+text_size2)

Total number of words in document 3383889


There are total 4023 rows and 3 columns. columns are Unique_ID, text1 and text2. we will drop Unique_ID column as it is not relevant to us or we can simply make Unique_ID as index column for data.
This data set do not have any missing values. Total 33,83,889 words

In [9]:
Unique_ID = data['Unique_ID']

In [10]:
data.drop('Unique_ID', axis=1, inplace=True)

# Data Clean Up

Initializing stopwords, all names, English()

In [11]:
stop_words = set(stopwords.words('english'))

In [12]:
all_names = set(names.words())

In [13]:
lemma = WordNetLemmatizer()

Functions to clean data

In [14]:
def cleaned_string(string):
    """
    Function to clean each string of dataset
    """
    # Removing all the digits
    string = re.sub(r'\d', '', string)
    
    # Removing Mentions
    string = re.sub(r'@\w+', ' ', string)
    
    # Removing links 
    string = re.sub(r'(https?:\/\/)?([\da-zA-Z\.-\/\#\:]+)\.([\da-zA-Z\.\/\:\#]{0,9})([\/\w \.-\/\:\#]*)', ' ', string)
    
    # Removing all the digits special caharacters
    string = re.sub(r'\W', ' ', string)
        
  
   
    string = string.strip()
    
    #Removing all Single characters
    string = re.sub(r"\b[a-zA-Z]\b", "", string)
    
    
    # Lemmetizing the string and removing stop words
    string = string.split()
    string = [lemma.lemmatize(word) for word in string if word not in stop_words]
    string = ' '.join(string)
    
    # Lowercasing all data
    string = string.lower()
        
    return string

In [15]:
def clean_text(data):
    """
    Function to retrieve each value in dataset and pass it through cleaned_string function.
    """
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            data.iloc[i, j] = cleaned_string(data.iloc[i, j])
    return data
            
            

In [16]:
x = clean_text(data)

In [17]:
x.head()

Unnamed: 0,text1,text2
0,savvy searcher fail spot ad internet search en...,newcastle bolton kieron dyer smashed home winn...
1,million miss net uk population still without i...,nasdaq planning share sale owner technology do...
2,young debut cut short ginepri fifteen year old...,ruddock back yapp credential wale coach mike r...
3,diageo buy u wine firm diageo world biggest sp...,mci share climb takeover bid share u phone com...
4,careful code new european directive could put ...,medium gadget get moving pocket sized device l...


In [18]:
text_size1 = x['text1'].apply(lambda x: len(x.split(' '))).sum()
text_size2 = x['text2'].apply(lambda x: len(x.split(' '))).sum()
print('Total number of words in document after preocessing', text_size1+text_size2)

Total number of words in document after preocessing 461962


This is a relatively small dataset with only 4,48,910 words

# Data preparation

Data splited in array to label it because doc2vec requires labeled data

In [19]:
def process_text(text_list, texts, text_list_name):
# function to transform questions and display progress
    for text in texts:
        text_list.append(text)

In [20]:
text1 = []
process_text(text1, x['text1'], 'text1')

In [21]:
text2 = []
process_text(text2, x['text2'], 'text2')

In [22]:
text_labeled = []

In [23]:
for i in range(len(text1)):
    # Text strings need to be separated into words
    # Each Text string needs a unique label
    text_labeled.append(TaggedDocument(text1[i].split( ), data[data.index ==i]))
    text_labeled.append(TaggedDocument(text2[i].split( ), data[data.index ==i]))

In [24]:
text_labeled

[TaggedDocument(words=['savvy', 'searcher', 'fail', 'spot', 'ad', 'internet', 'search', 'engine', 'user', 'odd', 'mix', 'naive', 'sophisticated', 'suggests', 'report', 'search', 'searcher', 'usually', 'find', 'looking', 'using', 'search', 'result', 'organic', 'net', 'user', 'say', 'regularly', 'use', 'google', 'ask', 'jeeves', 'msn', 'yahoo', 'questioned', 'said', 'would', 'trust', 'search', 'engine', 'much', 'le', 'knew', 'information', 'paid', 'result', 'carry', 'search', 'least', 'weekly', 'asked', 'look', 'every', 'questioned', 'information', 'looking', 'critical', 'information', 'simply', 'searcher', 'use', 'single', 'search', 'engine', 'use', 'two', 'three', 'small', 'number', 'consult', 'three', 'said', 'really', 'miss', 'search', 'engine', 'said', 'could', 'live', 'without', 'search', 'third', 'questioned', 'said', 'thought', 'result', 'presented', 'fair', 'unbiased', 'selection', 'information', 'topic', 'found', 'result', 'many', 'search', 'engine', 'provide', 'alongside', 'li

# Model

Initializing model

In [25]:
model = Doc2Vec(dm = 1, min_count=1, window=20, vector_size=300, sample=1e-4, negative=10, workers=1, seed=42, dm_mean=1)
model.build_vocab(text_labeled)

Training model

In [26]:
for epoch in range(20):
    model.train(text_labeled,epochs=model.epochs,total_examples=model.corpus_count)
    print("Epoch #{} is complete.".format(epoch+1))

Epoch #1 is complete.
Epoch #2 is complete.
Epoch #3 is complete.
Epoch #4 is complete.
Epoch #5 is complete.
Epoch #6 is complete.
Epoch #7 is complete.
Epoch #8 is complete.
Epoch #9 is complete.
Epoch #10 is complete.
Epoch #11 is complete.
Epoch #12 is complete.
Epoch #13 is complete.
Epoch #14 is complete.
Epoch #15 is complete.
Epoch #16 is complete.
Epoch #17 is complete.
Epoch #18 is complete.
Epoch #19 is complete.
Epoch #20 is complete.


In [27]:
print('No. of vocabulary words in model:', len(model.wv.vocab))

No. of vocabulary words in model: 14233


Splitting each word in a row to match vocab during model evaluation

In [28]:
text1_split = []
for text in text1:
    text1_split.append(text.split())
    
text2_split = []
for text in text2:
    text2_split.append(text.split())

# Evaluating model

In [29]:
doc2vec_scores = []

In [30]:
for i in range(len(text1_split)):
    score = model.wv.n_similarity(text1_split[i], text2_split[i])
    doc2vec_scores.append(score)

In [31]:
mean_score = sum(doc2vec_scores) / len(doc2vec_scores)

In [32]:
print('Mean score', mean_score)

Mean score 0.42930078250523857


# Saving predictions

In [33]:
output = pd.DataFrame({'Unique_ID': Unique_ID,
                       'Similarity_Score': doc2vec_scores})

output.to_csv(r'submission.csv', index=False)

In [34]:
model.wv.n_similarity(text1_split[0], text2_split[0])

0.5068366