In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.stats import pearsonr

from zp_ihlt_project.config import TRAIN_DATA_WITH_FEATURES_PATH, TEST_DATA_WITH_FEATURES_PATH, FEATURE_STEPS_PATH
from zp_ihlt_project.feature_extraction import lexical_functions, preprocessing_functions, semantic_functions, ngram_functions, sentence_to_doc


In [2]:
all_train_df = pd.read_csv(TRAIN_DATA_WITH_FEATURES_PATH)
all_test_df = pd.read_csv(TEST_DATA_WITH_FEATURES_PATH)
feature_steps = pd.read_csv(FEATURE_STEPS_PATH)
feature_names = [col for col in all_train_df.columns if col.startswith("score_")]

In [6]:
results = []

def pearsonr_score(estimator, X, y):
    """Scorer function that matches sklearn's expected signature"""
    y_pred = estimator.predict(X)
    return pearsonr(y, y_pred)[0]

model = RandomForestRegressor()

scores = cross_val_score(
    model, 
    all_train_df[feature_names],
    all_train_df.gs,
    cv=5,
    n_jobs=-1,
    scoring=pearsonr_score
)
print(f"Cross-validation Pearson: {np.mean(scores)}+-{np.std(scores)}")


Cross-validation Pearson: 0.7323543248040615+-0.10546664171414358


In [5]:
train_idx, val_idx = train_test_split(range(len(all_train_df)), test_size=0.2, random_state=42)

In [20]:
model = RandomForestRegressor()
model.fit(all_train_df.iloc[train_idx][feature_names], all_train_df.iloc[train_idx].gs)


In [21]:
train_datasets = all_train_df.dataset.unique().tolist()
test_datasets = all_test_df.dataset.unique().tolist()

In [22]:
def get_X_y(df, selected_features, dataset, idx=None):
    if idx is None:
        idx = range(len(df))
    X = df.iloc[idx]
    X = X.loc[X.dataset == dataset][selected_features]
    y = df.iloc[idx]
    y = y.loc[y.dataset == dataset].gs
    return X, y

def evaluate_model_across_train_val_test(model, selected_features):
    results = []
    datasets = train_datasets
    for dataset in datasets:
        dataset_results = []
        dataset_results.append(dataset)
        
        X_train, y_train = get_X_y(all_train_df, selected_features, dataset, train_idx)
        X_val, y_val = get_X_y(all_train_df, selected_features, dataset, val_idx)
        X_test, y_test = get_X_y(all_test_df, selected_features, dataset)

        preds = model.predict(X_train)
        dataset_results.append(pearsonr(y_train, preds)[0])

        preds = model.predict(X_val)
        dataset_results.append(pearsonr(y_val, preds)[0])

        preds = model.predict(X_test)
        dataset_results.append(pearsonr(y_test, preds)[0])

        results.append(dataset_results)

    dataset_results = []
    dataset_results.append('all')
    preds = model.predict(all_train_df.iloc[train_idx][selected_features])
    dataset_results.append(pearsonr(all_train_df.iloc[train_idx].gs, preds)[0])

    preds = model.predict(all_train_df.iloc[val_idx][selected_features])
    dataset_results.append(pearsonr(all_train_df.iloc[val_idx].gs, preds)[0])

    preds = model.predict(all_test_df[all_test_df.dataset.isin(train_datasets)][selected_features])
    dataset_results.append(pearsonr(all_test_df[all_test_df.dataset.isin(train_datasets)].gs, preds)[0])

    results.append(dataset_results)

    results = pd.DataFrame(results, columns=["dataset", "train_pearson", "val_pearson", "test_pearson"])
    return results

evaluate_model_across_train_val_test(model, feature_names)

Unnamed: 0,dataset,train_pearson,val_pearson,test_pearson
0,MSRpar,0.974112,0.648178,0.642689
1,MSRvid,0.9811,0.772554,0.841235
2,SMTeuroparl,0.963739,0.787191,0.511434
3,all,0.984487,0.845604,0.810864


In [23]:
feature_importances = model.feature_importances_
ranked_feature_importance_indices = np.argsort(feature_importances)[::-1]
feature_steps['importance'] = feature_importances
feature_steps.sort_values(by='importance', ascending=False)

Unnamed: 0,name,metric,step_0,step_1,step_2,step_3,step_4,step_5,step_6,step_7,importance
660,score_jaccard_165,jaccard,sentence_to_doc,get_tokens,remove_stopwords,lemmatize_tokens,get_characters,get_2grams,,,0.166723
1029,score_cosine_257,cosine,sentence_to_doc,chunk_NEs,remove_stopwords,lemmatize_tokens,get_characters,get_2grams,,,0.087593
661,score_cosine_165,cosine,sentence_to_doc,get_tokens,remove_stopwords,lemmatize_tokens,get_characters,get_2grams,,,0.072686
1028,score_jaccard_257,jaccard,sentence_to_doc,chunk_NEs,remove_stopwords,lemmatize_tokens,get_characters,get_2grams,,,0.036327
1032,score_jaccard_258,jaccard,sentence_to_doc,chunk_NEs,remove_stopwords,lemmatize_tokens,get_characters,get_3grams,,,0.029695
...,...,...,...,...,...,...,...,...,...,...,...
1406,score_euclidean_351,euclidean,sentence_to_doc,get_tokens,remove_stopwords,get_stopwords,lemmatize_tokens,get_characters,get_4grams,,0.000000
1407,score_manhattan_351,manhattan,sentence_to_doc,get_tokens,remove_stopwords,get_stopwords,lemmatize_tokens,get_characters,get_4grams,,0.000000
1408,score_jaccard_352,jaccard,sentence_to_doc,get_tokens,remove_stopwords,get_stopwords,lemmatize_tokens,remove_non_alnum,,,0.000000
1409,score_cosine_352,cosine,sentence_to_doc,get_tokens,remove_stopwords,get_stopwords,lemmatize_tokens,remove_non_alnum,,,0.000000


In [24]:
feature_steps.sort_values(by='importance', ascending=False).to_csv("../data/feature_steps_with_importance.csv")

In [29]:
nonzero_feature_importances = feature_importances > 0
nonzero_importance_feature_indexes = ranked_feature_importance_indices[nonzero_feature_importances]
nonzero_importance_features = np.array(feature_names)[nonzero_importance_feature_indexes]
len(nonzero_importance_features)


1248

In [26]:
nonzero_importance_features_model = RandomForestRegressor()
nonzero_importance_features_model.fit(all_train_df.iloc[train_idx][nonzero_importance_features], all_train_df.iloc[train_idx].gs)

In [27]:
evaluate_model_across_train_val_test(nonzero_importance_features_model, nonzero_importance_features)

Unnamed: 0,dataset,train_pearson,val_pearson,test_pearson
0,MSRpar,0.973161,0.640263,0.639808
1,MSRvid,0.982024,0.771066,0.839934
2,SMTeuroparl,0.966586,0.785362,0.505607
3,all,0.984993,0.842846,0.810006


In [37]:
top_500_features_indices = ranked_feature_importance_indices[:500]
top_500_features = np.array(feature_names)[top_500_features_indices]

In [38]:
top_500_features_model = RandomForestRegressor()
top_500_features_model.fit(all_train_df.iloc[train_idx][top_500_features], all_train_df.iloc[train_idx].gs)

In [39]:
evaluate_model_across_train_val_test(top_500_features_model, top_500_features)

Unnamed: 0,dataset,train_pearson,val_pearson,test_pearson
0,MSRpar,0.973319,0.644597,0.640942
1,MSRvid,0.981965,0.767494,0.840509
2,SMTeuroparl,0.962743,0.773713,0.498033
3,all,0.984732,0.841256,0.807643


In [92]:
def filter_features_by_steps(
    feature_steps,
    allowed_steps,
):
    allowed_step_names = [step.__name__ for step in [sentence_to_doc, *allowed_steps]]
    step_col_names = [col for col in feature_steps.columns if col.startswith("step_")]
        
    selected_features = []
    for _, row in feature_steps.iterrows():
        if all(pd.isna(step) or step in allowed_step_names for step in row[step_col_names]):
            feature_name = row["name"]
            selected_features.append(feature_name)
    return selected_features


In [93]:
lexical_features = filter_features_by_steps(feature_steps, preprocessing_functions + lexical_functions + ngram_functions)
len(lexical_features)

320

In [94]:
lexical_model = RandomForestRegressor()
lexical_model.fit(all_train_df.iloc[train_idx][lexical_features], all_train_df.iloc[train_idx].gs)
evaluate_model_across_train_val_test(lexical_model, lexical_features)

Unnamed: 0,dataset,train_pearson,val_pearson,test_pearson
0,MSRpar,0.973151,0.631444,0.639595
1,MSRvid,0.980202,0.740311,0.816779
2,SMTeuroparl,0.964546,0.766081,0.527034
3,all,0.98393,0.828754,0.805494


In [95]:
semantic_features = filter_features_by_steps(feature_steps, preprocessing_functions + semantic_functions + ngram_functions)
len(semantic_features)

240

In [96]:
semantic_model = RandomForestRegressor()
semantic_model.fit(all_train_df.iloc[train_idx][semantic_features], all_train_df.iloc[train_idx].gs)
evaluate_model_across_train_val_test(semantic_model, semantic_features)

Unnamed: 0,dataset,train_pearson,val_pearson,test_pearson
0,MSRpar,0.963386,0.551995,0.540465
1,MSRvid,0.9667,0.76003,0.807006
2,SMTeuroparl,0.959444,0.826645,0.427814
3,all,0.976902,0.834958,0.767331


In [98]:
without_ngrams_features = filter_features_by_steps(feature_steps, preprocessing_functions + semantic_functions + lexical_functions)
len(without_ngrams_features)

520

In [99]:
without_ngrams_model = RandomForestRegressor()
without_ngrams_model.fit(all_train_df.iloc[train_idx][without_ngrams_features], all_train_df.iloc[train_idx].gs)
evaluate_model_across_train_val_test(without_ngrams_model, without_ngrams_features)


Unnamed: 0,dataset,train_pearson,val_pearson,test_pearson
0,MSRpar,0.968827,0.642555,0.585175
1,MSRvid,0.979774,0.733175,0.806594
2,SMTeuroparl,0.959558,0.746136,0.467594
3,all,0.983058,0.822749,0.77159


In [100]:
fully_trained_model = RandomForestRegressor()
fully_trained_model.fit(all_train_df[feature_names], all_train_df.gs)

In [101]:
def evaluate_model_across_test(model, selected_features):
    results = []
    datasets = test_datasets
    for dataset in datasets:
        dataset_results = []
        dataset_results.append(dataset)

        preds = model.predict(all_test_df[all_test_df.dataset == dataset][selected_features])
        dataset_results.append(pearsonr(all_test_df[all_test_df.dataset == dataset].gs, preds)[0])

        results.append(dataset_results)

    preds = model.predict(all_test_df[selected_features])
    results.append(['all', pearsonr(all_test_df.gs, preds)[0]])
    results = pd.DataFrame(results, columns=["dataset", "test_pearson"])
    return results

test_results = evaluate_model_across_test(fully_trained_model, feature_names)
test_results

Unnamed: 0,dataset,test_pearson
0,MSRpar,0.637078
1,MSRvid,0.841245
2,SMTeuroparl,0.494833
3,OnWN,0.649861
4,SMTnews,0.443494
5,all,0.731028


In [19]:
results_to_beat = pd.DataFrame(np.array([[.683, .873, .528, .664, .493, 0.823]]).T, index=[*test_datasets, 'all'], columns=["pearson_to_beat"])
results_to_beat

Unnamed: 0,pearson_to_beat
MSRpar,0.683
MSRvid,0.873
SMTeuroparl,0.528
OnWN,0.664
SMTnews,0.493
all,0.823


order of demos:

1. cross-validation
2. split train/val/test
3. evaluate model across train/val/test
4. feature selection
5. split by step types
6. evaluate for lexical and semantic in isolation
7. ~~leave one out analysis~~
8. train final largest model, and test on test set
9. discussion