# ALTEGRAD Challenge - Feature generation

*Abderrahim AIT-AZZI, Sébastien OHLEYER, Mickael SUTTON*

In [6]:
import os
import pickle

import gensim
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from nltk import word_tokenize
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from scipy.stats import skew, kurtosis
from tqdm import tqdm
# English stop-word list from NLTK; read as a module-level global by sent2vec().
stop_words = stopwords.words('english')

## 1. WMD distance, Sent2vec, Glove

#### WMD Distance

In [17]:
def wmd(s1, s2):
    """Word Mover's Distance between two questions.

    Both inputs are converted with ``str()``, lower-cased and split on
    whitespace; English stop words are dropped and the two remaining token
    lists are passed to ``model.wmdistance``.

    Parameters
    ----------
    s1, s2 : object
        The two questions. Non-string values are stringified first.

    Returns
    -------
    The value returned by ``model.wmdistance`` for the two token lists.

    Notes
    -----
    ``model`` is the module-level embedding model, looked up at call time.
    The stop-word list is loaded only on the first call and cached on the
    function as a frozenset, instead of being re-read (and scanned as a list)
    for every pair of questions.
    """
    stop_set = getattr(wmd, '_stop_set', None)
    if stop_set is None:
        stop_set = wmd._stop_set = frozenset(stopwords.words('english'))
    tokens1 = [w for w in str(s1).lower().split() if w not in stop_set]
    tokens2 = [w for w in str(s2).lower().split() if w not in stop_set]
    return model.wmdistance(tokens1, tokens2)

#### Sentence embedding

In [18]:
def sent2vec(s):
    """Unit-normalised sum of the word vectors of a sentence.

    The input is stringified, lower-cased and tokenised with ``word_tokenize``.
    Tokens that are in the module-level ``stop_words`` list or are not purely
    alphabetic are skipped, as are tokens missing from the module-level
    ``model``.

    Parameters
    ----------
    s : object
        The sentence; non-string values are converted with ``str()``.

    Returns
    -------
    numpy.ndarray or float
        The summed word vectors divided by their L2 norm. If no token has a
        vector, NaN is returned (callers assign it to a whole row and later
        clean it with ``np.nan_to_num``).
    """
    excluded = set(stop_words)
    vectors = []
    for token in word_tokenize(str(s).lower()):
        if token in excluded or not token.isalpha():
            continue
        try:
            vectors.append(model[token])
        except KeyError:
            # Out-of-vocabulary token: skip it. Any other error (including a
            # keyboard interrupt during a long run) is allowed to propagate.
            continue
    if not vectors:
        # Same value the 0/0 division used to produce, without the warning.
        return np.nan
    v = np.array(vectors).sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

### Import embedding model

#### Glove

In [None]:
from gensim.models import KeyedVectors

# Stanford GloVe vectors, read as a text (non-binary) word2vec-format file.
filename = './glove.42B.300d.txt.word2vec'
word_embedding_model_glove = KeyedVectors.load_word2vec_format(
    fname=filename,
    binary=False,
)

#### W2V

In [5]:
# Location of the pre-trained GoogleNews word2vec binary. Set the W2V_PATH
# environment variable to point at a local copy; the fallback is the original
# machine-specific location so existing runs keep working.
w2v_path = os.environ.get(
    'W2V_PATH',
    'C:/Users/Amine/Desktop/MVA2017/ALTEGRAD/TP3/for moodle/code/GoogleNews-vectors-negative300.bin.gz',
)
model = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)

### Generate features for train data

We can generate some useful features:

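    - String-similarity ratios between question 1 and question 2, computed with the fuzz package.
    - The length of the questions (characters, distinct characters, words) and the number of words they share.
    - Word embeddings of the questions and different distances computed on them: WMD, cosine, cityblock, Jaccard, Canberra, Euclidean, Minkowski and Bray-Curtis.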

In [6]:
# Read the training pairs with explicit column names, then keep only the two
# question texts and the label.
data_train = pd.read_csv(
    'data/train.csv',
    sep=',',
    names=["id", "qid1", "qid2", "question1", "question2", "is_duplicate"],
)
data_train = data_train.drop(['id', 'qid1', 'qid2'], axis=1)

# Raw string lengths and their difference.
data_train['len_q1'] = data_train['question1'].apply(lambda q: len(str(q)))
data_train['len_q2'] = data_train['question2'].apply(lambda q: len(str(q)))
data_train['diff_len'] = data_train['len_q1'] - data_train['len_q2']

# Number of distinct characters, spaces excluded.
data_train['len_char_q1'] = data_train['question1'].apply(lambda q: len(set(str(q).replace(' ', ''))))
data_train['len_char_q2'] = data_train['question2'].apply(lambda q: len(set(str(q).replace(' ', ''))))

# Number of whitespace-separated tokens.
data_train['len_word_q1'] = data_train['question1'].apply(lambda q: len(str(q).split()))
data_train['len_word_q2'] = data_train['question2'].apply(lambda q: len(str(q).split()))

# Number of distinct lower-cased tokens present in both questions.
data_train['common_words'] = data_train.apply(
    lambda row: len(
        set(str(row['question1']).lower().split())
        & set(str(row['question2']).lower().split())
    ),
    axis=1,
)

# One column per fuzzywuzzy scorer, created in this order.
for _column, _scorer in (
    ('fuzz_qratio', fuzz.QRatio),
    ('fuzz_WRatio', fuzz.WRatio),
    ('fuzz_partial_ratio', fuzz.partial_ratio),
    ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
):
    data_train[_column] = data_train.apply(
        lambda row, score=_scorer: score(str(row['question1']), str(row['question2'])),
        axis=1,
    )

#### Embedding of the questions

In [8]:
# Use the GloVe vectors for every embedding-based feature of the training set.
model = word_embedding_model_glove
# NOTE: norm_model is bound to the very same object as model (no copy is made),
# so the init_sims(replace=True) call below also affects the vectors read by
# wmd() and sent2vec() through `model`.
norm_model = word_embedding_model_glove
norm_model.init_sims(replace=True)

# Word Mover's Distance for each question pair.
data_train['wmd'] = data_train.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

# One 300-dimensional sentence vector per question.
question1_vectors = np.zeros((data_train.shape[0], 300))
error_count = 0

# enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm wraps the array
# itself, so it knows the length and can display the total and an ETA.
for i, q in enumerate(tqdm(data_train.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors = np.zeros((data_train.shape[0], 300))
for i, q in enumerate(tqdm(data_train.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

#### Compute distances

In [9]:
# Replace NaN entries once per matrix (rows of questions without any known
# word are NaN) instead of re-copying both full matrices for every feature.
_q1_clean = np.nan_to_num(question1_vectors)
_q2_clean = np.nan_to_num(question2_vectors)

# Pairwise distances between the two sentence vectors of each row; the column
# order below is the order in which the columns are added to the frame.
_pair_metrics = [
    ('cosine_distance', cosine),
    ('cityblock_distance', cityblock),
    ('jaccard_distance', jaccard),
    ('canberra_distance', canberra),
    ('euclidean_distance', euclidean),
    ('minkowski_distance', lambda x, y: minkowski(x, y, 3)),  # order-3 Minkowski
    ('braycurtis_distance', braycurtis),
]
for _column, _metric in _pair_metrics:
    data_train[_column] = [_metric(x, y) for (x, y) in zip(_q1_clean, _q2_clean)]

# Skewness and kurtosis of each sentence vector taken on its own.
data_train['skew_q1vec'] = [skew(x) for x in _q1_clean]
data_train['skew_q2vec'] = [skew(x) for x in _q2_clean]
data_train['kur_q1vec'] = [kurtosis(x) for x in _q1_clean]
data_train['kur_q2vec'] = [kurtosis(x) for x in _q2_clean]

#### Save features for train data

In [10]:
# Persist the raw sentence vectors. The files are opened in `with` blocks so
# the handles are flushed and closed deterministically; a negative protocol
# (-1) selects the highest pickle protocol available.
with open('data/q1_glove.pkl', 'wb') as q1_file:
    pickle.dump(question1_vectors, q1_file, -1)
with open('data/q2_glove.pkl', 'wb') as q2_file:
    pickle.dump(question2_vectors, q2_file, -1)

# Persist the training frame with all features computed so far.
data_train.to_csv('data/train_features_glove.csv', index=False)

### Generate features for test data

In [11]:
# Read the test pairs with explicit column names; unlike the training frame,
# the id columns are kept and there is no label column.
data_test = pd.read_csv(
    'data/test.csv',
    sep=',',
    names=["id", "qid1", "qid2", "question1", "question2"],
)

In [12]:
# Raw string lengths and their difference.
data_test['len_q1'] = data_test['question1'].apply(lambda q: len(str(q)))
data_test['len_q2'] = data_test['question2'].apply(lambda q: len(str(q)))
data_test['diff_len'] = data_test['len_q1'] - data_test['len_q2']

# Number of distinct characters, spaces excluded.
data_test['len_char_q1'] = data_test['question1'].apply(lambda q: len(set(str(q).replace(' ', ''))))
data_test['len_char_q2'] = data_test['question2'].apply(lambda q: len(set(str(q).replace(' ', ''))))

# Number of whitespace-separated tokens.
data_test['len_word_q1'] = data_test['question1'].apply(lambda q: len(str(q).split()))
data_test['len_word_q2'] = data_test['question2'].apply(lambda q: len(str(q).split()))

# Number of distinct lower-cased tokens present in both questions.
data_test['common_words'] = data_test.apply(
    lambda row: len(
        set(str(row['question1']).lower().split())
        & set(str(row['question2']).lower().split())
    ),
    axis=1,
)

# One column per fuzzywuzzy scorer, created in this order.
for _column, _scorer in (
    ('fuzz_qratio', fuzz.QRatio),
    ('fuzz_WRatio', fuzz.WRatio),
    ('fuzz_partial_ratio', fuzz.partial_ratio),
    ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
):
    data_test[_column] = data_test.apply(
        lambda row, score=_scorer: score(str(row['question1']), str(row['question2'])),
        axis=1,
    )

In [13]:
# NOTE: `model` is whatever embedding model an earlier cell bound to that name,
# and norm_model is the same object (no copy), so init_sims(replace=True) acts
# on the vectors used by wmd() and sent2vec() as well.
norm_model = model
norm_model.init_sims(replace=True)

# Word Mover's Distance for each question pair.
data_test['wmd'] = data_test.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

# One 300-dimensional sentence vector per question.
question1_vectors = np.zeros((data_test.shape[0], 300))
error_count = 0

# enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm wraps the array
# itself, so it knows the length and can display the total and an ETA.
for i, q in enumerate(tqdm(data_test.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors = np.zeros((data_test.shape[0], 300))
for i, q in enumerate(tqdm(data_test.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

In [14]:
# Replace NaN entries once per matrix (rows of questions without any known
# word are NaN) instead of re-copying both full matrices for every feature.
_q1_clean = np.nan_to_num(question1_vectors)
_q2_clean = np.nan_to_num(question2_vectors)

# Pairwise distances between the two sentence vectors of each row; the column
# order below is the order in which the columns are added to the frame.
_pair_metrics = [
    ('cosine_distance', cosine),
    ('cityblock_distance', cityblock),
    ('jaccard_distance', jaccard),
    ('canberra_distance', canberra),
    ('euclidean_distance', euclidean),
    ('minkowski_distance', lambda x, y: minkowski(x, y, 3)),  # order-3 Minkowski
    ('braycurtis_distance', braycurtis),
]
for _column, _metric in _pair_metrics:
    data_test[_column] = [_metric(x, y) for (x, y) in zip(_q1_clean, _q2_clean)]

# Skewness and kurtosis of each sentence vector taken on its own.
data_test['skew_q1vec'] = [skew(x) for x in _q1_clean]
data_test['skew_q2vec'] = [skew(x) for x in _q2_clean]
data_test['kur_q1vec'] = [kurtosis(x) for x in _q1_clean]
data_test['kur_q2vec'] = [kurtosis(x) for x in _q2_clean]

#### Save features for test data

In [21]:
# Persist the raw sentence vectors. The files are opened in `with` blocks so
# the handles are flushed and closed deterministically; a negative protocol
# (-1) selects the highest pickle protocol available.
with open('data/q1_glove_test.pkl', 'wb') as q1_file:
    pickle.dump(question1_vectors, q1_file, -1)
with open('data/q2_glove_test.pkl', 'wb') as q2_file:
    pickle.dump(question2_vectors, q2_file, -1)

# Persist the test frame with all features computed so far.
data_test.to_csv('data/test_features_glove.csv', index=False)

# 2. Page Rank

In [1]:
from features_engineering.pagerank import generate_pagerank
# Data directory handed to the PageRank feature generator.
path = "./data"

In [2]:
# Project helper from features_engineering.pagerank; the log printed by this
# cell reports the train and test CSVs being written.
generate_pagerank(path)

Apply to train...
Apply to test...
Main PR generator...
Apply to train...
Writing train...
Apply to test...
Writing test...
CSV written !


# 3. Question frequency

In [3]:
from features_engineering.question_freq import generate_question_freq
# Data directory handed to the question-frequency feature generator.
path = "./data"

In [4]:
# Project helper from features_engineering.question_freq; the log printed by
# this cell reports train and test feature CSVs written under ./data.
generate_question_freq(path)

Writing train features...
Writing test features...
CSV written ! see:  ./data


# 4. Intersection of questions

In [5]:
from features_engineering.question_inter import generate_question_inter
# Data directory handed to the question-intersection feature generator.
path = "./data"

In [6]:
# Project helper from features_engineering.question_inter; the log printed by
# this cell reports train and test feature CSVs written under ./data.
generate_question_inter(path)

Writing train features...
Writing test features...
CSV written ! see:  ./data



# 5. K cores

In [3]:
from features_engineering.kcores import generate_kcores

In [4]:
# Define the data directory here instead of relying on a `path` variable left
# over from an earlier section, so this section also runs on a fresh kernel.
path = "./data"
# Project helper from features_engineering.kcores; the log printed by this
# cell reports CSVs written under ./data with the suffix _kcores.csv.
generate_kcores(path)

100%|██████████| 100279/100279 [00:00<00:00, 190659.69it/s]
100%|██████████| 100279/100279 [00:01<00:00, 86552.54it/s]


Writing train features...
Writing test features...
CSV written ! see:  ./data  | suffix:  _kcores.csv


In [4]:
from features_engineering.question_kcores import generate_question_kcores
# Data directory handed to the per-question k-cores feature generator.
path='./data'

In [5]:
# Project helper from features_engineering.question_kcores; the log reports
# CSVs written under ./data with the suffix _question_kcores.csv.
# NOTE(review): the recorded run emitted a pandas SettingWithCopyWarning from
# inside the helper (chained assignment `df['kcores'][...] = k`); consider
# switching that line to `.loc` in the helper module.
generate_question_kcores(path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['kcores'][df.question.isin(ck)] = k
100%|██████████| 29/29 [00:21<00:00,  1.33it/s]


Writing train features...
Writing test features...
CSV written ! see:  ./data  | suffix:  _question_kcores.csv


# 6. TF-IDF

In [1]:
from features_engineering.tfidf import generate_tfidf
# Data directory handed to the TF-IDF feature generator. Unlike the other
# sections, this value carries a trailing slash.
path = "./data/"

In [2]:
# Project helper from features_engineering.tfidf; the log lists the feature
# groups built (twice) and reports CSVs written with the suffix _tfidf.csv.
generate_tfidf(path)

Building Features
world_match
tfidf
tfidf_wm_stops
jaccard, wc_diff; wc_ratio, wc_diff_unique, wc_ratio_unique
wc_diff_unq_stop, wc_ratio_unique_stop
same_start, char_diff
char_diff_unq_stop
total_unique_words
total_unq_words_stop
char_ratio
world_match
tfidf
tfidf_wm_stops
jaccard, wc_diff; wc_ratio, wc_diff_unique, wc_ratio_unique
wc_diff_unq_stop, wc_ratio_unique_stop
same_start, char_diff
char_diff_unq_stop
total_unique_words
total_unq_words_stop
char_ratio
Writing train features...
Writing test features...
CSV written ! see:  ./data/  | suffix:  _tfidf.csv


# 7. Graph features

In [1]:
from features_engineering.graph_features import generate_graph_features
# Data directory handed to the graph feature generator.
path = "./data"

In [2]:
# Project helper from features_engineering.graph_features; the log reports
# CSVs written with the suffix _graph_feat.csv. In the recorded run the train
# pass took about 9 minutes and the test pass about 4 minutes.
generate_graph_features(path)

12it [00:00, 115.12it/s]

Number of nodes: 58940
Number of edges: 100279
Computing train features


80100it [09:02, 147.53it/s]


Writing train features...


7it [00:00, 69.11it/s]

Computing test features


20179it [03:59, 84.30it/s]


Writing test features...
CSV written ! see:  ./data  | suffix:  _graph_feat.csv


# 8. N-grams

In [1]:
from features_engineering.cooccurence_distinct_ngram import generate_cooccurence_distinct_ngram

In [2]:
# Data directory handed to the n-gram co-occurrence feature generator.
path = './data'
# The second argument is presumably the n-gram order (the log reports the
# output suffix _3gram_feat.csv) — confirm against the helper's signature.
# In the recorded run the train pass took about 18 minutes.
generate_cooccurence_distinct_ngram(path,3)

8it [00:00, 75.03it/s]

Applying to train...


80100it [18:10, 73.44it/s]


Writing train features...


11it [00:00, 103.50it/s]

Applying to test...


20179it [02:46, 121.05it/s]


Writing test features...
CSV written ! see:  ./data  | suffix:  _3gram_feat.csv


# 9. Word features

**TODO:** this section is not written yet — see with Abderrahim.

# 10. SpaCy

In [23]:
from features_engineering.spacy_features import generate_spacy_features

In [25]:
# Define the data directory here instead of relying on a `path` variable left
# over from an earlier section, so this section also runs on a fresh kernel.
path = './data'
# Project helper from features_engineering.spacy_features; the log printed by
# this cell reports CSVs written with the suffix _spacy_features.csv. In the
# recorded run the train pass took about 49 minutes.
generate_spacy_features(path)

4it [00:00, 32.93it/s]

Applying to train...


80100it [48:53, 27.31it/s]


Writing train features...


2it [00:00, 18.54it/s]

Applying to train...


20179it [11:35, 29.02it/s]


Writing test features...
CSV written ! see:  ./data  | suffix:  _spacy_features.csv
