In [6]:
import pandas as pd
import numpy as np

from zp_ihlt_project.feature_extraction import generate_valid_permutations, apply_steps_and_compare
from zp_ihlt_project.load_data import load_train_data, load_test_data

In [2]:
# Load the train and test data through the project loaders. Later cells read the
# `s1` / `s2` columns of both objects and attach the generated feature columns to them.
all_train_dt = load_train_data()
all_test_dt = load_test_data()

In [3]:
# Step permutations to evaluate. Each entry is passed to `apply_steps_and_compare`
# in the feature cell, and is later iterated step by step to export each
# step's `__name__`.
valid_permutations = generate_valid_permutations()

In [19]:
# Similarity metrics evaluated for every step permutation.
metrics = ['jaccard', 'cosine', 'euclidean', 'manhattan']

# Parallel lists with one entry per generated feature (permutation x metric).
# `feature_steps` and `feature_metrics` are consumed by the export cell further down.
feature_names = []
feature_steps = []
features = []
test_features = []
feature_metrics = []
print(f"Generating {len(valid_permutations) * len(metrics)} features ({len(valid_permutations)} permutations * {len(metrics)} metrics)")
for i, perm in enumerate(valid_permutations):
    print(f"Generating features for permutation {i+1} of {len(valid_permutations)}")
    for metric in metrics:
        feature_names.append(f"score_{metric}_{i}")
        feature_steps.append(perm)
        feature_metrics.append(metric)
        features.append(apply_steps_and_compare(all_train_dt.s1, all_train_dt.s2, perm, metric))
        test_features.append(apply_steps_and_compare(all_test_dt.s1, all_test_dt.s2, perm, metric))


def _with_feature_columns(frame, names, columns):
    """Return a copy of `frame` with the named feature columns attached.

    The columns are attached with a single `pd.concat(axis=1)` instead of being
    inserted one by one (which is what `DataFrame.assign` does for each keyword);
    thousands of single-column inserts fragment the frame and trigger pandas'
    PerformanceWarning.

    Args:
        frame: DataFrame to extend; it is not modified.
        names: Column names, parallel to `columns`.
        columns: One sequence/array/Series of per-row values for each name.

    Returns:
        A new DataFrame holding the columns of `frame` followed by the feature columns.
    """
    # Building against `frame.index` keeps array-like values positional and
    # aligns Series values by label, as column assignment would.
    feature_block = pd.DataFrame(dict(zip(names, columns)), index=frame.index)
    # Drop stale feature columns first so that re-running this cell replaces
    # them instead of producing duplicate column labels.
    base = frame.drop(columns=names, errors="ignore")
    return pd.concat([base, feature_block], axis=1)


# The names are rebound (rather than renamed) because later cells export
# `all_train_dt` / `all_test_dt` under exactly these names.
all_train_dt = _with_feature_columns(all_train_dt, feature_names, features)
all_test_dt = _with_feature_columns(all_test_dt, feature_names, test_features)
all_train_dt.head()

Generating 2080 features (520 permutations * 4 metrics)
Generating features for permutation 1 of 520
Generating features for permutation 2 of 520
Generating features for permutation 3 of 520
Generating features for permutation 4 of 520
Generating features for permutation 5 of 520
Generating features for permutation 6 of 520
Generating features for permutation 7 of 520
Generating features for permutation 8 of 520
Generating features for permutation 9 of 520
Generating features for permutation 10 of 520
Generating features for permutation 11 of 520
Generating features for permutation 12 of 520
Generating features for permutation 13 of 520
Generating features for permutation 14 of 520
Generating features for permutation 15 of 520
Generating features for permutation 16 of 520
Generating features for permutation 17 of 520
Generating features for permutation 18 of 520
Generating features for permutation 19 of 520
Generating features for permutation 20 of 520
Generating features for permutati

Unnamed: 0,s1,s2,gs,dataset,score_jaccard_0,score_cosine_0,score_euclidean_0,score_manhattan_0,score_jaccard_1,score_cosine_1,...,score_euclidean_517,score_manhattan_517,score_jaccard_518,score_cosine_518,score_euclidean_518,score_manhattan_518,score_jaccard_519,score_cosine_519,score_euclidean_519,score_manhattan_519
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...,4.0,MSRpar,0.75,0.858116,0.366025,0.25,0.541667,0.705024,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...,3.75,MSRpar,0.583333,0.737865,0.309017,0.166667,0.315789,0.489898,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl...",2.8,MSRpar,0.545455,0.717137,0.309017,0.166667,0.272727,0.429669,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent...",3.4,MSRpar,0.818182,0.904534,0.414214,0.333333,0.73913,0.859727,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....,2.4,MSRpar,0.636364,0.777778,0.333333,0.2,0.363636,0.533333,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Persist the train and test frames as CSV, leaving the row index out of the files.
csv_targets = (
    (all_train_dt, "../data/processed/train_data_with_features.csv"),
    (all_test_dt, "../data/processed/test_data_with_features.csv"),
)
for frame_to_save, csv_path in csv_targets:
    frame_to_save.to_csv(csv_path, index=False)

In [22]:
# Export one row per generated feature: its metric followed by the `__name__`
# of every step in the permutation that produced it. Row order follows the
# order in which the features were generated.
step_rows = []
for steps, metric in zip(feature_steps, feature_metrics):
    step_names = [step.__name__ for step in steps]
    step_rows.append([metric, *step_names])

feature_steps_df = pd.DataFrame(step_rows)
# Every column after the first holds a step name; number them from zero.
n_step_columns = len(feature_steps_df.columns) - 1
feature_steps_df.columns = ['metric'] + [f'step_{i}' for i in range(n_step_columns)]
feature_steps_df.to_csv("../data/feature_steps.csv", index=False)