## Imports

In [None]:
from collections import Counter

In [None]:
import spacy

In [None]:
from scipy.spatial.distance import cosine, euclidean, jaccard

## Config

In [None]:
# Identifier under which this notebook's feature names and feature matrices
# are saved (passed to save_feature_names / save_feature_list below).
feature_list_id = 'nlp_tags'

## Read Data

In [None]:
# Read the raw train/test CSV files, replacing missing values with ''.
# NOTE(review): neither frame is referenced again in the visible part of
# this notebook -- confirm whether this load is still needed.
df_questions_train, df_questions_test = (
    pd.read_csv(data_folder + csv_name).fillna('')
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Load the spellchecked question tokens for both splits.
# Each entry is indexed by 'question1' / 'question2' further down.
question_tokens_train, question_tokens_test = (
    load_json(preproc_data_folder + tokens_file)
    for tokens_file in (
        'question_tokens_spellchecked_train.json',
        'question_tokens_spellchecked_test.json',
    )
)

In [None]:
# Re-join every token list into a space-separated string so spaCy can
# process it. Train pairs come first, test pairs follow; the train/test
# split at the end of the notebook relies on that order.
question_pairs = question_tokens_train + question_tokens_test
df = pd.DataFrame({
    'question1': [' '.join(pair['question1']) for pair in question_pairs],
    'question2': [' '.join(pair['question2']) for pair in question_pairs],
})
# Drop the temporary concatenated list; only `df` is needed from here on.
del question_pairs

In [None]:
# Load spaCy's 'en' model. Only POS tags (token.pos_) and named entities
# (doc.ents) are read below; parser=False presumably skips the dependency
# parser -- TODO confirm against the installed spaCy version.
nlp = spacy.load('en', parser=False)

## Build Features

In [None]:
# Part-of-speech tags (token.pos_) that each get a count feature.
# The list order fixes the column order of the POS part of X1 / X2.
pos_tags_whitelist = [
    'ADJ',
    'ADV',
    'NOUN',
    'PROPN',
    'NUM',
    'VERB',
]

# Named-entity labels (ent.label_) that each get a count feature.
# The list order fixes the column order of the NER part of X1 / X2.
ner_tags_whitelist = [
    'GPE', 'LOC', 'ORG', 'NORP', 'PERSON', 'PRODUCT',
    'DATE', 'TIME', 'QUANTITY', 'CARDINAL',
]

In [None]:
# One count column per whitelisted POS tag plus one per whitelisted NER tag.
num_raw_features = len(pos_tags_whitelist + ner_tags_whitelist)

In [None]:
# Tag-count matrices: one row per question pair, one column per raw feature.
# X1 holds the counts for question1, X2 the counts for question2.
raw_feature_shape = (len(df), num_raw_features)
X1 = np.zeros(raw_feature_shape)
X2 = np.zeros(raw_feature_shape)

In [None]:
# Sanity check: both matrices were allocated as (len(df), num_raw_features).
X1.shape, X2.shape

### Collect POS and NER tags

In [None]:
# Fill X1: for every question1 text, count its POS tags and entity labels
# and keep only the whitelisted ones (POS columns first, then NER columns).
for row, doc in enumerate(nlp.pipe(df['question1'].values, n_threads=os.cpu_count())):
    pos_counts = Counter(token.pos_ for token in doc)
    ner_counts = Counter(ent.label_ for ent in doc.ents)

    # A Counter yields 0 for tags that never occurred in this document.
    tag_counts = [pos_counts[pos_tag] for pos_tag in pos_tags_whitelist]
    tag_counts.extend(ner_counts[ner_tag] for ner_tag in ner_tags_whitelist)
    X1[row, :] = np.array(tag_counts)

In [None]:
# Fill X2: for every question2 text, count its POS tags and entity labels
# and keep only the whitelisted ones (POS columns first, then NER columns).
for row, doc in enumerate(nlp.pipe(df['question2'].values, n_threads=os.cpu_count())):
    pos_counts = Counter(token.pos_ for token in doc)
    ner_counts = Counter(ent.label_ for ent in doc.ents)

    # A Counter yields 0 for tags that never occurred in this document.
    tag_counts = [pos_counts[pos_tag] for pos_tag in pos_tags_whitelist]
    tag_counts.extend(ner_counts[ner_tag] for ner_tag in ner_tags_whitelist)
    X2[row, :] = np.array(tag_counts)

### Create tag feature sets

In [None]:
# POS tag counts for question1: the leading columns of X1, named
# 'pos_q1_<tag>' with the tag lower-cased.
pos_q1_feature_names = ['pos_q1_' + pos_tag.lower() for pos_tag in pos_tags_whitelist]
df_pos_1 = pd.DataFrame(X1[:, 0:len(pos_tags_whitelist)], columns=pos_q1_feature_names)

In [None]:
# POS tag counts for question2: the leading columns of X2, named
# 'pos_q2_<tag>' with the tag lower-cased.
pos_q2_feature_names = ['pos_q2_' + pos_tag.lower() for pos_tag in pos_tags_whitelist]
df_pos_2 = pd.DataFrame(X2[:, 0:len(pos_tags_whitelist)], columns=pos_q2_feature_names)

In [None]:
# NER label counts for question1: the trailing columns of X1, named
# 'ner_q1_<label>' with the label lower-cased.
ner_q1_feature_names = ['ner_q1_' + ner_tag.lower() for ner_tag in ner_tags_whitelist]
df_ner_1 = pd.DataFrame(X1[:, -len(ner_tags_whitelist):], columns=ner_q1_feature_names)

In [None]:
# NER label counts for question2: the trailing columns of X2, named
# 'ner_q2_<label>' with the label lower-cased.
ner_q2_feature_names = ['ner_q2_' + ner_tag.lower() for ner_tag in ner_tags_whitelist]
df_ner_2 = pd.DataFrame(X2[:, -len(ner_tags_whitelist):], columns=ner_q2_feature_names)

### Compute pairwise distances

In [None]:
# Number of pairwise features computed per question pair below:
# two from the POS counts and two from the NER counts.
num_distance_features = 4

In [None]:
# One row per question pair, one column per distance feature;
# filled row by row in the loop that follows.
X_dist = np.zeros((len(df), num_distance_features))

In [None]:
# Slice the POS and NER count columns once instead of on every iteration.
num_pos_tags = len(pos_tags_whitelist)
num_ner_tags = len(ner_tags_whitelist)
pos_counts_1, pos_counts_2 = X1[:, 0:num_pos_tags], X2[:, 0:num_pos_tags]
ner_counts_1, ner_counts_2 = X1[:, -num_ner_tags:], X2[:, -num_ner_tags:]

for row in progressbar(range(len(df))):
    pos_1, pos_2 = pos_counts_1[row], pos_counts_2[row]
    ner_1, ner_2 = ner_counts_1[row], ner_counts_2[row]

    # POS distances.
    # NOTE(review): cosine distance is undefined when either count vector
    # is all zeros (e.g. an empty question) -- confirm how downstream
    # consumers treat the resulting value.
    X_dist[row, 0] = cosine(pos_1, pos_2)
    X_dist[row, 1] = euclidean(pos_1, pos_2)

    # NER distances: Euclidean distance between the label counts, and the
    # absolute difference in the total number of whitelisted entities.
    X_dist[row, 2] = euclidean(ner_1, ner_2)
    X_dist[row, 3] = np.abs(np.sum(ner_1) - np.sum(ner_2))

In [None]:
# Column names follow the order in which the distance loop fills X_dist.
distance_feature_names = [
    'pos_tag_cosine',
    'pos_tag_euclidean',
    'ner_tag_euclidean',
    'ner_tag_count_diff',
]
df_dist = pd.DataFrame(X_dist, columns=distance_feature_names)

### Build master feature list

In [None]:
# Join the per-question tag counts and the distance features column-wise.
# Every part was built from an array with len(df) rows and the default row
# index, so the rows line up one-to-one.
#
# The column labels are deliberately kept (no ignore_index=True): with
# axis=1 that flag would replace every feature name with a positional
# integer, and the names would then have to be re-attached by hand in
# exactly the same order as this list.
df_master = pd.concat(
    [df_pos_1, df_ner_1, df_pos_2, df_ner_2, df_dist],
    axis=1,
)

In [None]:
# Label the master frame's columns with the source frames' feature names,
# walking the frames in the same order in which they were concatenated.
df_master.columns = [
    column_name
    for df_part in (df_pos_1, df_ner_1, df_pos_2, df_ner_2, df_dist)
    for column_name in df_part.columns
]

In [None]:
# Summary statistics for every feature, transposed so that each feature
# occupies one row of the displayed table.
df_master.describe().T

## Save Features

In [None]:
# Feature names in the same order as the columns of the saved matrices.
feature_names = list(df_master.columns)

In [None]:
# Store the feature names under this feature list's ID.
# save_feature_names is defined outside this file -- NOTE(review): confirm
# where and in which format it writes.
save_feature_names(feature_names, feature_list_id)

In [None]:
# `df` was built from the train pairs followed by the test pairs, so the
# number of train pairs is the row at which the master frame is split.
num_train_pairs = len(question_tokens_train)
df_train, df_test = df_master[:num_train_pairs], df_master[num_train_pairs:]

In [None]:
# Sanity check: display the shapes of the train and test feature frames.
df_train.shape, df_test.shape

In [None]:
# Save the train feature matrix (as a plain array, without column labels)
# under the 'train' split of this feature list. save_feature_list is
# defined outside this file.
save_feature_list(df_train.values, 'train', feature_list_id)

In [None]:
# Save the test feature matrix (as a plain array, without column labels)
# under the 'test' split of this feature list. save_feature_list is
# defined outside this file.
save_feature_list(df_test.values, 'test', feature_list_id)