# ALTEGRAD Challenge - Feature generation

*Abderrahim AIT-AZZI, Sébastien OHLEYER, Mickael SUTTON*

In [6]:
import pandas as pd
import numpy as np
import pickle
import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
# English stop-word list taken from NLTK's `stopwords` corpus reader.
# NOTE(review): `stop_words` is not referenced by any other cell visible in this
# notebook — presumably a leftover from before the feature code moved into the
# `features_engineering` package; confirm before removing.
stop_words = stopwords.words('english')

# 1. WMD distance, Sent2vec, GloVe

In [10]:
# Project helper for this section (header: WMD distance, Sent2vec, Glove).
from features_engineering.glove import generate_glove_features
# Data directory passed as the first argument to the generator call below.
path='./data'

In [None]:
# File locations handed to the generator as its 2nd and 3rd arguments.
# NOTE(review): both are empty-string placeholders and this cell has no recorded
# execution — presumably they must point to local pre-trained word-vector files
# (names suggest word2vec / GoogleNews) before running; confirm the expected
# files against `features_engineering.glove`.
word2vec_filepath = ''
googlenews_filepath = ''
# Runs the section's feature generation on the data directory with the two paths above.
generate_glove_features(path, word2vec_filepath, googlenews_filepath)

# 2. PageRank

In [1]:
# Project helper that produces the PageRank features for this section.
from features_engineering.pagerank import generate_pagerank
# Data directory passed to the generator in the next cell.
path = "./data"

In [2]:
# Runs the PageRank feature generation on the data directory.
# In the recorded run it logged an "Apply to train/test" pass, then a
# "Main PR generator" pass that wrote train and test, ending with "CSV written !".
generate_pagerank(path)

Apply to train...
Apply to test...
Main PR generator...
Apply to train...
Writing train...
Apply to test...
Writing test...
CSV written !


# 3. Question frequency

In [3]:
# Project helper that produces the question-frequency features for this section.
from features_engineering.question_freq import generate_question_freq
# Data directory passed to the generator in the next cell.
path = "./data"

In [4]:
# Runs the question-frequency feature generation; the recorded run reported
# writing train and test features as CSV under ./data.
generate_question_freq(path)

Writing train features...
Writing test features...
CSV written ! see:  ./data


# 4. Intersection of questions

In [5]:
# Project helper that produces the question-intersection features for this section.
from features_engineering.question_inter import generate_question_inter
# Data directory passed to the generator in the next cell.
path = "./data"

In [6]:
# Runs the question-intersection feature generation; the recorded run reported
# writing train and test features as CSV under ./data.
generate_question_inter(path)

Writing train features...
Writing test features...
CSV written ! see:  ./data



# 5. K-cores

In [3]:
# Project helper that produces the k-core features (first of the two generators in this section).
from features_engineering.kcores import generate_kcores
# Data directory passed to the generator in the next cell.
path = './data'

In [4]:
# Runs the k-core feature generation. The recorded run showed two progress bars
# over 100279 items (about a second each) and wrote files with suffix `_kcores.csv`.
generate_kcores(path)

100%|██████████| 100279/100279 [00:00<00:00, 190659.69it/s]
100%|██████████| 100279/100279 [00:01<00:00, 86552.54it/s]


Writing train features...
Writing test features...
CSV written ! see:  ./data  | suffix:  _kcores.csv


In [4]:
# Project helper that produces the per-question k-core features (second generator in this section).
from features_engineering.question_kcores import generate_question_kcores
# Data directory passed to the generator in the next cell.
path='./data'

In [5]:
# Runs the per-question k-core feature generation (29 iterations, ~21 s in the
# recorded run); output files use suffix `_question_kcores.csv`.
# NOTE(review): the recorded run emitted a pandas SettingWithCopyWarning for the
# chained assignment `df['kcores'][df.question.isin(ck)] = k` inside the helper.
# The assignment may not reach `df`; the fix belongs in the module, e.g.
# `df.loc[df.question.isin(ck), 'kcores'] = k`.
generate_question_kcores(path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['kcores'][df.question.isin(ck)] = k
100%|██████████| 29/29 [00:21<00:00,  1.33it/s]


Writing train features...
Writing test features...
CSV written ! see:  ./data  | suffix:  _question_kcores.csv


# 6. TF-IDF

In [1]:
# Project helper that produces the TF-IDF features for this section.
from features_engineering.tfidf import generate_tfidf
# Data directory passed to the generator in the next cell.
# NOTE(review): this is the only cell whose path carries a trailing slash
# ("./data/" vs "./data" elsewhere) — confirm whether `generate_tfidf` relies on it.
path = "./data/"

In [2]:
# Runs the TF-IDF feature generation. The recorded run logged the same list of
# feature groups twice (word match, tfidf, jaccard, word/char count diffs and
# ratios, ...) and then wrote train and test files with suffix `_tfidf.csv`.
generate_tfidf(path)

Building Features
world_match
tfidf
tfidf_wm_stops
jaccard, wc_diff; wc_ratio, wc_diff_unique, wc_ratio_unique
wc_diff_unq_stop, wc_ratio_unique_stop
same_start, char_diff
char_diff_unq_stop
total_unique_words
total_unq_words_stop
char_ratio
world_match
tfidf
tfidf_wm_stops
jaccard, wc_diff; wc_ratio, wc_diff_unique, wc_ratio_unique
wc_diff_unq_stop, wc_ratio_unique_stop
same_start, char_diff
char_diff_unq_stop
total_unique_words
total_unq_words_stop
char_ratio
Writing train features...
Writing test features...
CSV written ! see:  ./data/  | suffix:  _tfidf.csv


# 7. Graph features

## 7.1 Unweighted graph

In [None]:
# Project helper that produces the unweighted-graph features for section 7.1.
from features_engineering.graph_features import generate_graph_features
# Data directory passed to the generator in the next cell.
path = './data'

In [19]:
# Runs the unweighted-graph feature generation. Long-running: in the recorded run
# (graph of 58940 nodes / 100279 edges) the train pass over 80100 rows took about
# 10.5 minutes and the test pass over 20179 rows about 2 minutes.
# Output files use suffix `_graph_feat.csv`.
generate_graph_features(path)

9it [00:00, 89.32it/s]

Number of nodes: 58940
Number of edges: 100279
Computing train features


80100it [10:25, 128.15it/s]


Writing train features...


15it [00:00, 145.07it/s]

Computing test features


20179it [02:06, 159.67it/s]


Writing test features...
CSV written ! see:  ./data  | suffix:  _graph_feat.csv


## 7.2 Weighted graph

In [1]:
# Project helper that produces the weighted-graph features for section 7.2.
from features_engineering.weightedgraph_features import generate_weightedgraph_features
# Data directory passed to the generator in the next cell.
path = './data'

In [1]:
# Flag forwarded as the generator's second argument.
# NOTE(review): its meaning is defined in `features_engineering.weightedgraph_features`
# and is not visible here — presumably toggles a manual cross-validation mode; confirm.
manual_cv = False
# Runs the weighted-graph feature generation; no output was recorded for this cell.
generate_weightedgraph_features(path, manual_cv)

# 8. N-grams

In [1]:
# Project helper that produces the n-gram co-occurrence features for this section.
from features_engineering.cooccurence_distinct_ngram import generate_cooccurence_distinct_ngram
# Data directory passed to the generator in the next cell.
path = './data'

In [2]:
# Runs the n-gram feature generation. The second argument (3) is presumably the
# n-gram order: the recorded run wrote files with suffix `_3gram_feat.csv`.
# Long-running: about 14 minutes for train (80100 rows) and 3 minutes for test
# (20179 rows) in the recorded run.
generate_cooccurence_distinct_ngram(path,3)

9it [00:00, 85.61it/s]

Applying to train...


80100it [14:20, 93.08it/s]


Writing train features...


14it [00:00, 137.09it/s]

Applying to test...


20179it [03:00, 111.68it/s]


Writing test features...
CSV written ! see:  ./data  | suffix:  _3gram_feat.csv


# 9. Word features

**TODO:** confirm the details of these word features with Abderrahim.

In [1]:
# Project helper that produces the word-level features for this section.
from features_engineering.word_features import generate_word_features
# Data directory passed to the generator in the next cell.
path='./data'



In [2]:
# Runs the word-feature generation; output files use suffix `_word_feat.csv`.
# NOTE(review): the recorded output echoes source lines from the helper
# (`R1 = np.sum(shared_weights) / np.sum(total_weights)` and the `Rcosine`
# division). This looks like runtime-warning output, presumably divisions by
# zero when a pair has no weighted words — confirm and guard in the module.
generate_word_features(path)

  R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share
  Rcosine = np.dot(shared_weights, shared_weights)/Rcosine_denominator
  R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share


Writing train features...
Writing test features...
CSV written ! see:  ./data  | suffix:  _word_feat.csv


# 10. SpaCy

In [23]:
# Project helper that produces the spaCy-based features for this section.
from features_engineering.spacy_features import generate_spacy_features
# Data directory passed to the generator in the next cell.
path = './data'

In [25]:
# Runs the spaCy feature generation. This is the slowest cell in the recorded run:
# about 49 minutes for train (80100 rows) and 12 minutes for test (20179 rows).
# Output files use suffix `_spacy_features.csv`.
# NOTE(review): the log prints "Applying to train..." before the 20179-row pass
# as well — presumably a mislabelled message in the helper for the test pass; confirm.
generate_spacy_features(path)

4it [00:00, 32.93it/s]

Applying to train...


80100it [48:53, 27.31it/s]


Writing train features...


2it [00:00, 18.54it/s]

Applying to train...


20179it [11:35, 29.02it/s]


Writing test features...
CSV written ! see:  ./data  | suffix:  _spacy_features.csv


# 11. Letter count features

In [10]:
# Project helper that produces the letter-count features for this section.
from features_engineering.letters_count_feat import generate_letters_count_features
# Data directory passed to the generator in the next cell.
path = './data'

In [11]:
# Runs the letter-count feature generation; the recorded run wrote train and test
# files with suffix `_count_feat.csv` under ./data.
generate_letters_count_features(path)

Writing train features...
CSV written ! see:  ./data  | suffix:  _count_feat.csv
Writing test features...
CSV written ! see:  ./data  | suffix:  _count_feat.csv
