In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
from scipy.sparse import hstack
import os , pickle
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm

import spacy

In [None]:
# Mount Google Drive at /content/gdrive. The google.colab package is only
# importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Work from the project folder on the mounted Drive so that the relative
# "data/..." paths used by the cells below resolve against it.
os.chdir('/content/gdrive/MyDrive/Quora-question-pair-similarity-master')

In [None]:
# Load the pre-processed question pairs, then force both question columns to
# plain Python strings so that any non-string entry becomes text as well.
df = pd.read_csv("data/data_with_preprocess_2.csv")
for text_col in ('question1', 'question2'):
    df[text_col] = df[text_col].map(str)
print(df.shape)
df.head(2)

(404287, 32)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,1,1,66,57,...,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,4,2,51,88,...,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154


### For the ease of computation we will sample only 100k points

In [None]:
# Keep a random subset of 100,000 rows; random_state pins the draw so a rerun
# from a freshly loaded frame selects the same rows.
# NOTE(review): this rebinds df, so the full frame is only available again
# after re-running the load cell.
df = df.sample(n=100000,random_state=40)

In [None]:
# Coerce every column except the identifiers and the two question texts to a
# numeric dtype.
# pd.to_numeric is called once per column (vectorised) rather than once per
# cell through Series.apply; the resulting values are the same, and an
# unparseable entry still raises instead of being silently kept.
num_cols = df.drop(columns=['id', 'qid1', 'qid2', 'question1', 'question2']).columns
for i in num_cols:
    df[i] = pd.to_numeric(df[i])

In [None]:
# Target is the duplicate flag. Features are every remaining column except
# the identifier columns and the target itself; the two question text
# columns stay in X because they are vectorised later.
y = df['is_duplicate']
X = df.drop(columns=['id', 'qid1', 'qid2', 'is_duplicate'])
print(X.shape)
print(y.shape)

(100000, 28)
(100000,)


# Train test split

In [None]:
# 70/30 split, stratified on the label so both parts keep the same class
# balance; random_state fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=100,
)
print("Number of data points in train data :",X_train.shape)
print("Number of data points in test data :",X_test.shape)

Number of data points in train data : (70000, 28)
Number of data points in test data : (30000, 28)


# Handling text data

We have already cleaned the text data; now we have to vectorize it. We use two approaches:

1. TFIDF
2. TFIDF weighted glove vectorization

## TFIDF

In [None]:
# TF-IDF for question1: the vocabulary (capped at 20,000 features) is learned
# from the training split only and reused, unchanged, to encode the test split.
tfidf_vectorizer1 = TfidfVectorizer(max_features=20000, lowercase=False)
trainqs1_tfidf = tfidf_vectorizer1.fit_transform(X_train['question1'])
testqs1_tfidf = tfidf_vectorizer1.transform(X_test['question1'])
for encoded in (trainqs1_tfidf, testqs1_tfidf):
    print(encoded.shape)

(70000, 20000)
(30000, 20000)


In [None]:
# TF-IDF for question2, with its own vectorizer: fitted on the training split
# (capped at 20,000 features) and then applied as-is to the test split.
tfidf_vectorizer2 = TfidfVectorizer(max_features=20000, lowercase=False)
train_qs2_tfidf = tfidf_vectorizer2.fit_transform(X_train['question2'])
test_qs2_tfidf = tfidf_vectorizer2.transform(X_test['question2'])
for encoded in (train_qs2_tfidf, test_qs2_tfidf):
    print(encoded.shape)

(70000, 20000)
(30000, 20000)


In [None]:
# Place the question1 and question2 TF-IDF matrices side by side, so each row
# holds the features of both questions of a pair.
tfidf_train_vec = hstack([trainqs1_tfidf, train_qs2_tfidf])
tfidf_test_vec = hstack([testqs1_tfidf, test_qs2_tfidf])
print("train data shape",tfidf_train_vec.shape)
print("Test data shape ",tfidf_test_vec.shape)

train data shape (70000, 40000)
Test data shape  (30000, 40000)


In [None]:
# Everything except the raw question text: these are the remaining
# (non-text) feature columns of each split.
train_df = X_train.drop(['question1', 'question2'], axis=1)
test_df = X_test.drop(['question1', 'question2'], axis=1)

In [None]:
# Convert the dense feature frames to CSR sparse matrices so they can be
# horizontally stacked with the sparse TF-IDF matrices.
# The submodule is imported explicitly: a bare `import scipy` does not by
# itself guarantee that `scipy.sparse` is loaded.
import scipy.sparse
train_sparse = scipy.sparse.csr_matrix(train_df)
test_sparse = scipy.sparse.csr_matrix(test_df)

In [None]:
# Final TF-IDF design matrices: the non-text feature columns first, followed
# by the stacked question1/question2 TF-IDF columns.
tfidf_X_tr = hstack([train_sparse, tfidf_train_vec])
tfidf_X_test = hstack([test_sparse, tfidf_test_vec])
print("train data shape",tfidf_X_tr.shape)
print("Test data shape ",tfidf_X_test.shape)

train data shape (70000, 40026)
Test data shape  (30000, 40026)


In [None]:
# Save the combined matrices (non-text features + TF-IDF of both questions)
# under the file names that match them. Previously the TF-IDF-only matrices
# were written to these files, so the combined matrices were never persisted.
# `with` closes each file handle once the dump completes.
with open("data/tfidf_X_tr", "wb") as train_file:
    pickle.dump(tfidf_X_tr, train_file)
with open("data/tfidf_X_test", "wb") as test_file:
    pickle.dump(tfidf_X_test, test_file)

## TFIDF Weighted Glove Vectors

In [None]:
# use spacy embedding
# run this from a normal command line
# NOTE(review): this downloads en_core_web_md, but the cells below load
# en_core_web_sm — confirm which model is intended.
!python -m spacy download en_core_web_md

NotImplementedError: ignored

In [None]:
# Pool question1 and question2 of the training split and fit a TF-IDF model
# on them purely to learn a per-term IDF weight (the test split is not used).
questions = list(X_train['question1']) + list(X_train['question2'])
tfidf = TfidfVectorizer(lowercase=False)
# fit() is sufficient: only idf_ is read afterwards, so there is no need to
# build the document-term matrix that fit_transform() would return.
tfidf.fit(questions)

# dict key: term, value: that term's IDF weight (not a per-document tf-idf score)
word2tfidf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))


In [None]:
# Load the spacy model that you have installed
import en_core_web_sm
nlp = en_core_web_sm.load()

# each token vector has length 96 (see the shape displayed by this cell)
doc = nlp("This is some text that I am processing with Spacy")
# example: shape of the vector of the token at index 3
doc[3].vector.shape

(96,)

In [None]:
nlp = spacy.load('en_core_web_sm')

# One IDF-weighted average token vector per training question1.
vecs1 = []
# tqdm wraps the iterable to print a progress bar (https://github.com/noamraph/tqdm)
for qu1 in tqdm(list(X_train['question1'])):
    doc1 = nlp(qu1)
    # one row per token; 96 is the width of the en_core_web_sm token vectors
    mean_vec1 = np.zeros([len(doc1), 96])
    for i, word1 in enumerate(doc1):
        # IDF weight of this token; tokens missing from the TF-IDF vocabulary
        # get weight 0. dict.get avoids a bare `except`, which would also
        # hide unrelated errors.
        idf = word2tfidf.get(str(word1), 0)
        mean_vec1[i] = word1.vector * idf
    # Average over tokens. With zero tokens mean() would return NaNs, so a
    # zero vector is emitted for that case instead.
    mean_vec1 = mean_vec1.mean(axis=0) if len(doc1) else np.zeros(96)
    vecs1.append(mean_vec1)

X_train_glove_q1 = vecs1

100%|██████████| 70000/70000 [07:28<00:00, 156.01it/s]


In [None]:
nlp = spacy.load('en_core_web_sm')

# One IDF-weighted average token vector per training question2.
vecs1 = []
# tqdm wraps the iterable to print a progress bar (https://github.com/noamraph/tqdm)
for qu1 in tqdm(list(X_train['question2'])):
    doc1 = nlp(qu1)
    # one row per token; 96 is the width of the en_core_web_sm token vectors
    mean_vec1 = np.zeros([len(doc1), 96])
    for i, word1 in enumerate(doc1):
        # IDF weight of this token; tokens missing from the TF-IDF vocabulary
        # get weight 0. dict.get avoids a bare `except`, which would also
        # hide unrelated errors.
        idf = word2tfidf.get(str(word1), 0)
        mean_vec1[i] = word1.vector * idf
    # Average over tokens. With zero tokens mean() would return NaNs, so a
    # zero vector is emitted for that case instead.
    mean_vec1 = mean_vec1.mean(axis=0) if len(doc1) else np.zeros(96)
    vecs1.append(mean_vec1)

X_train_glove_q2 = vecs1

100%|██████████| 70000/70000 [07:39<00:00, 152.28it/s]


In [None]:
nlp = spacy.load('en_core_web_sm')

# One IDF-weighted average token vector per test question1. The IDF weights
# come from word2tfidf, which was fitted on the training questions only.
vecs1 = []
# tqdm wraps the iterable to print a progress bar (https://github.com/noamraph/tqdm)
for qu1 in tqdm(list(X_test['question1'])):
    doc1 = nlp(qu1)
    # one row per token; 96 is the width of the en_core_web_sm token vectors
    mean_vec1 = np.zeros([len(doc1), 96])
    for i, word1 in enumerate(doc1):
        # IDF weight of this token; tokens missing from the TF-IDF vocabulary
        # get weight 0. dict.get avoids a bare `except`, which would also
        # hide unrelated errors.
        idf = word2tfidf.get(str(word1), 0)
        mean_vec1[i] = word1.vector * idf
    # Average over tokens. With zero tokens mean() would return NaNs, so a
    # zero vector is emitted for that case instead.
    mean_vec1 = mean_vec1.mean(axis=0) if len(doc1) else np.zeros(96)
    vecs1.append(mean_vec1)

X_test_glove_q1 = vecs1

100%|██████████| 30000/30000 [03:17<00:00, 152.23it/s]


In [None]:
nlp = spacy.load('en_core_web_sm')

# One IDF-weighted average token vector per test question2. The IDF weights
# come from word2tfidf, which was fitted on the training questions only.
vecs1 = []
# tqdm wraps the iterable to print a progress bar (https://github.com/noamraph/tqdm)
for qu1 in tqdm(list(X_test['question2'])):
    doc1 = nlp(qu1)
    # one row per token; 96 is the width of the en_core_web_sm token vectors
    mean_vec1 = np.zeros([len(doc1), 96])
    for i, word1 in enumerate(doc1):
        # IDF weight of this token; tokens missing from the TF-IDF vocabulary
        # get weight 0. dict.get avoids a bare `except`, which would also
        # hide unrelated errors.
        idf = word2tfidf.get(str(word1), 0)
        mean_vec1[i] = word1.vector * idf
    # Average over tokens. With zero tokens mean() would return NaNs, so a
    # zero vector is emitted for that case instead.
    mean_vec1 = mean_vec1.mean(axis=0) if len(doc1) else np.zeros(96)
    vecs1.append(mean_vec1)

X_test_glove_q2 = vecs1

100%|██████████| 30000/30000 [03:16<00:00, 152.75it/s]


In [None]:
# Attach each question's averaged vector to its row as a single
# array-valued column (one column per question, per split).
# NOTE(review): the later drop removes only question1/question2, so these two
# columns are still in the frames that get written to CSV, next to the
# expanded g_* columns — confirm that this duplication is intended.
X_train['q1_glove'] = X_train_glove_q1
X_train['q2_glove'] = X_train_glove_q2
X_test['q1_glove'] = X_test_glove_q1
X_test['q2_glove'] = X_test_glove_q2

In [None]:
# Turn each list of per-question vectors into a 2-D array and place the
# question1 and question2 arrays side by side (one row per question pair).
train_glove = np.hstack((np.asarray(X_train_glove_q1), np.asarray(X_train_glove_q2)))
test_glove = np.hstack((np.asarray(X_test_glove_q1), np.asarray(X_test_glove_q2)))
train_glove.shape

(70000, 192)

In [None]:
# Wrap the stacked vectors in DataFrames whose columns are named g_0, g_1, ...
# (the default integer column labels with a "g_" prefix).
glove_train_df = pd.DataFrame(train_glove).add_prefix('g_')
glove_test_df = pd.DataFrame(test_glove).add_prefix('g_')
glove_train_df.head()

Unnamed: 0,g_0,g_1,g_2,g_3,g_4,g_5,g_6,g_7,g_8,g_9,...,g_182,g_183,g_184,g_185,g_186,g_187,g_188,g_189,g_190,g_191
0,-2.050485,-3.450913,1.804888,-0.333793,-0.511347,-1.038847,2.044998,0.655494,-1.631877,-0.436064,...,0.569483,-0.426833,-2.113042,3.020969,-0.561594,-2.389376,3.596926,3.236205,-0.898927,2.942675
1,0.59153,1.419579,-1.042543,2.517551,1.515871,-0.790258,3.517147,0.966838,1.293419,-0.452368,...,-1.632971,0.367683,-0.584169,0.131638,1.710234,1.530932,3.821884,1.693522,-1.979285,-0.438928
2,0.041486,-1.862883,-0.703097,0.165039,0.369193,0.270954,3.641336,0.654601,0.295264,-0.451292,...,-1.331164,1.256921,-1.664439,-1.071017,-0.240619,0.791694,4.352497,3.575399,0.340574,1.360705
3,-0.315613,-2.306215,0.136779,0.572128,-0.342839,-1.012532,2.478711,2.351752,-2.389072,1.900219,...,0.565011,-1.097035,-2.011983,1.337454,-1.757819,-0.0175,3.16296,2.653733,-1.957361,-0.218455
4,0.96924,-3.084247,0.97713,0.661493,-1.097367,0.372337,1.693871,-0.35423,-2.635398,0.268219,...,0.246333,-1.650251,-1.734734,0.310178,-0.528062,1.327808,5.137451,3.715774,-0.92518,-0.906171


In [None]:

# Remove the raw question text and renumber rows from 0 so the frames line up
# positionally with the freshly built glove DataFrames.
X_train = X_train.drop(['question1', 'question2'], axis=1).reset_index(drop=True)
X_test = X_test.drop(['question1', 'question2'], axis=1).reset_index(drop=True)
print(X_train.shape)
print(X_test.shape)

(70000, 28)
(30000, 28)


In [None]:
# Append the g_* vector columns to the remaining feature columns, aligned on
# the row index of each split.
X_train_d = pd.concat((X_train, glove_train_df), axis='columns')
X_test_d = pd.concat((X_test, glove_test_df), axis='columns')
print(X_train_d.shape)
print(X_test_d.shape)

(70000, 220)
(30000, 220)


In [None]:
# Write the final train/test feature tables to CSV; index=False leaves the
# row index out of the files.
X_train_d.to_csv('data/train_data.csv',index=False)
X_test_d.to_csv('data/test_data.csv',index=False)

In [None]:
# Write the train/test labels to CSV without their index, so rows match the
# feature files by position only.
# NOTE(review): this relies on y_train/y_test still being in the same row
# order as X_train/X_test from the split — confirm nothing reorders them.
y_train.to_csv('data/train_y.csv',index=False)
y_test.to_csv('data/test_y.csv',index=False)