# Featurizing text data with tfidf weighted word-vectors 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [10]:
# avoid decoding problems
df = pd.read_csv("train.csv")
 
# encode questions to unicode
# https://stackoverflow.com/a/6812069
# ----------------- python 2 ---------------------
# df['question1'] = df['question1'].apply(lambda x: unicode(str(x),"utf-8"))
# df['question2'] = df['question2'].apply(lambda x: unicode(str(x),"utf-8"))
# ----------------- python 3 ---------------------
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))

In [11]:
df.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

- After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores.
- here we use a pre-trained GLOVE model which comes free with "Spacy". 
- It is trained on Wikipedia and therefore, it is stronger in terms of word semantics. 

In [9]:
!python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
!python -m spacy download en

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.2.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
nlp = spacy.load('en_core_web_lg')
x=nlp('man')
len(x.vector)

300

In [15]:
x.vector

array([-1.7310e-01,  2.0663e-01,  1.6543e-02, -3.1026e-01,  1.9719e-02,
        2.7791e-01,  1.2283e-01, -2.6328e-01,  1.2522e-01,  3.1894e+00,
       -1.6291e-01, -8.8759e-02,  3.3067e-03, -2.9483e-03, -3.4398e-01,
        1.2779e-01, -9.4536e-02,  4.3467e-01,  4.9742e-01,  2.5068e-01,
       -2.0901e-01, -5.8931e-01,  6.1615e-02,  1.0434e-01,  2.4424e-01,
       -2.9120e-01,  3.0746e-01,  3.6276e-01,  7.1151e-01, -8.0523e-02,
       -5.9524e-01,  3.4834e-01, -3.3048e-01,  7.0316e-02,  5.3329e-01,
       -2.9081e-01,  1.3459e-01, -3.9856e-01, -3.2435e-01,  1.1867e-01,
       -1.4938e-01, -3.8256e-01,  3.3116e-01, -3.1488e-01, -9.4491e-02,
       -6.1319e-02,  1.5518e-01, -2.5523e-01, -1.1813e-01,  2.5296e-01,
       -9.5174e-02, -1.6596e-01, -1.0840e-01,  8.8803e-02,  2.0890e-01,
        4.3981e-01,  1.0476e-03, -4.0666e-02,  2.6487e-01, -6.1009e-01,
       -1.4405e-01, -8.1185e-02,  7.5475e-03,  2.3373e-01, -2.7772e-02,
       -2.9315e-01, -1.1744e-01, -8.3193e-02, -2.3768e-01,  1.57

In [16]:
vecs1 = []
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1) 
     # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)])
    for word1 in doc1:
         # word2vec
        vec1 = word1.vector
         # fetch df score
        try:
            idf = word2tfidf[str(word1)]
        except:
             idf = 0
         # compute final vec
        mean_vec1 += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)
df['q1_feats_m'] = list(vecs1)

100%|█████████████████████████████████████████████████████████████████████████| 404290/404290 [26:13<00:00, 256.96it/s]


In [19]:
vecs2 = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2) 
    mean_vec2 = np.zeros([len(doc2), len(doc2[0].vector)])
    for word2 in doc2:
        # word2vec
        vec2 = word2.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word2)]
        except:
            #print word
            idf = 0
        # compute final vec
        mean_vec2 += vec2 * idf
    mean_vec2 = mean_vec2.mean(axis=0)
    vecs2.append(mean_vec2)
df['q2_feats_m'] = list(vecs2)

100%|█████████████████████████████████████████████████████████████████████████| 404290/404290 [26:10<00:00, 257.42it/s]


In [21]:
#prepro_features_train.csv (Simple Preprocessing Feartures)
#nlp_features_train.csv (NLP Features)
if os.path.isfile('nlp_features_train.csv'):
    dfnlp = pd.read_csv("nlp_features_train.csv",encoding='latin-1')
else:
    print("download nlp_features_train.csv from drive or run previous notebook")

if os.path.isfile('df_bfe_without_preprocessing_train.csv'):
    dfppro = pd.read_csv("df_bfe_without_preprocessing_train.csv",encoding='latin-1')
else:
    print("download df_bfe_without_preprocessing_train.csv from drive or run previous notebook")

In [22]:
df1 = dfnlp.drop(['qid1','qid2','question1','question2'],axis=1)
df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index= df3.index)
df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index= df3.index)

In [23]:
# dataframe of nlp features
df1.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.833319,0.714276,0.99998,0.99998,0.833326,0.714281,0.0,1.0,2.0,13.0,100,93,93,98,0.965517
1,1,0,0.599988,0.299997,0.499975,0.333322,0.499994,0.30769,0.0,1.0,5.0,10.5,86,63,66,73,0.442308
2,2,0,0.333328,0.249997,0.249994,0.166664,0.299997,0.214284,0.0,1.0,4.0,12.0,63,63,43,41,0.15
3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,10.0,28,24,9,20,0.039216
4,4,0,0.166664,0.090908,0.9999,0.499975,0.28571,0.153845,0.0,1.0,6.0,10.0,67,47,35,54,0.15


In [24]:
# dataframe of basic features 
df2.head()

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,65,56,14,12,11.0,23.0,0.478261,2,0
1,1,4,1,50,87,12,17,8.0,26.0,0.307692,5,3
2,2,1,1,72,58,14,10,4.0,24.0,0.166667,2,0
3,3,1,1,49,58,12,16,1.0,22.0,0.045455,2,0
4,4,3,1,75,38,15,7,4.0,21.0,0.190476,4,2


In [25]:
# Questions 1 tfidf weighted word2vec
df3_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-5.856872,17.449559,4.86272,7.971019,20.345586,-5.514759,-4.0778,-2.820742,8.029026,146.599092,...,-17.370964,5.393082,0.384676,-8.362788,-1.88029,-10.799672,-12.999799,3.225858,1.256145,16.807275
1,9.356103,13.098566,18.945098,-2.079594,-15.703841,-2.173409,8.969065,-20.458267,-20.674299,13.760798,...,25.948247,0.603713,-10.516349,6.040723,30.476707,3.97689,-28.25461,12.613432,-7.770673,31.456654
2,0.90952,16.050299,-8.126856,-4.848289,-2.80619,9.75228,4.349992,-5.120332,6.785252,106.342974,...,-20.942061,2.398984,8.663028,-0.654124,16.220601,-2.719094,10.485332,-1.103132,-7.290877,19.31425
3,-4.950745,17.098874,-15.474965,1.04468,-2.392017,-0.051889,2.650595,-8.451192,2.584123,116.184408,...,-2.551312,-4.97148,-0.478381,-1.930166,9.336016,2.574459,4.803863,-1.182989,-2.962115,3.225704
4,-11.520302,19.769948,-4.510997,-6.548994,-20.835286,33.663909,-30.390504,0.826553,-19.571472,84.458577,...,-8.331733,-4.866335,18.828458,-40.357679,-10.336167,15.29463,-0.989347,-9.072091,-8.194567,23.84756


In [26]:
# Questions 2 tfidf weighted word2vec
df3_q2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.398579,13.991607,-0.504564,9.254431,13.906436,-4.777694,-5.274421,-0.201208,4.940558,134.73595,...,-17.810438,7.231024,1.531186,-7.528823,0.473802,-11.864658,-11.293788,1.866265,3.616046,11.971096
1,4.649688,9.974928,20.330103,-0.440372,-18.128566,-1.984671,4.906458,-27.797837,-21.262646,96.965297,...,23.015827,3.435464,-5.1696,7.102491,34.516881,6.177686,-27.770856,12.926435,-4.564559,33.919834
2,-17.305105,17.355614,-9.135664,-6.03855,-1.831651,4.547895,17.935764,-4.799029,3.100311,99.380095,...,-24.310109,-1.216773,11.909693,9.591573,11.846737,1.397859,6.454157,-0.27146,-12.500337,27.634567
3,3.897911,2.545857,-2.053792,3.38545,3.424216,-2.282545,-11.763825,6.692485,5.797674,94.978085,...,-5.435584,1.672591,-0.863278,-2.906553,-3.466688,-3.867892,-4.249463,-12.551012,4.494087,-6.223341
4,-5.391206,1.767221,1.810128,-4.097073,-3.623262,8.417368,-25.246265,7.47343,-2.789541,89.594627,...,-10.407441,-8.444207,-14.450059,-12.709382,-4.44905,12.563987,-11.721362,-16.4593,3.626297,-9.790615


In [27]:
print("Number of features in nlp dataframe :", df1.shape[1])
print("Number of features in preprocessed dataframe :", df2.shape[1])
print("Number of features in question1 w2v  dataframe :", df3_q1.shape[1])
print("Number of features in question2 w2v  dataframe :", df3_q2.shape[1])
print("Number of features in final dataframe  :", df1.shape[1]+df2.shape[1]+df3_q1.shape[1]+df3_q2.shape[1])

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 12
Number of features in question1 w2v  dataframe : 300
Number of features in question2 w2v  dataframe : 300
Number of features in final dataframe  : 629


In [28]:
# storing the final features to csv file
if not os.path.isfile('final_features.csv'):
    df3_q1['id']=df1['id']
    df3_q2['id']=df1['id']
    df1  = df1.merge(df2, on='id',how='left')
    df2  = df3_q1.merge(df3_q2, on='id',how='left')
    result  = df1.merge(df2, on='id',how='left')
    result.to_csv('final_features.csv')

In [29]:
result.head(2)

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,...,290_y,291_y,292_y,293_y,294_y,295_y,296_y,297_y,298_y,299_y
0,0,0,0.833319,0.714276,0.99998,0.99998,0.833326,0.714281,0.0,1.0,...,-17.810438,7.231024,1.531186,-7.528823,0.473802,-11.864658,-11.293788,1.866265,3.616046,11.971096
1,1,0,0.599988,0.299997,0.499975,0.333322,0.499994,0.30769,0.0,1.0,...,23.015827,3.435464,-5.1696,7.102491,34.516881,6.177686,-27.770856,12.926435,-4.564559,33.919834


In [30]:
result.shape

(404290, 628)