In [1]:
import spacy
from typing import Tuple
import nltk
import numpy as np
import pandas as pd
from functools import partial
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import clone
from scipy.stats import pearsonr

from zp_ihlt_project.config import TRAIN_DATA_WITH_FEATURES_PATH, TEST_DATA_WITH_FEATURES_PATH, FEATURE_STEPS_PATH

In [2]:
all_train_df = pd.read_csv(TRAIN_DATA_WITH_FEATURES_PATH)
all_test_df = pd.read_csv(TEST_DATA_WITH_FEATURES_PATH)
feature_steps = pd.read_csv(FEATURE_STEPS_PATH)
feature_names = [col for col in all_train_df.columns if col.startswith("score_")]

In [3]:
train_datasets = all_train_df.dataset.unique().tolist()
test_datasets = all_test_df.dataset.unique().tolist()

In [4]:
feature_names

['score_jaccard_0',
 'score_cosine_0',
 'score_euclidean_0',
 'score_manhattan_0',
 'score_jaccard_1',
 'score_cosine_1',
 'score_euclidean_1',
 'score_manhattan_1',
 'score_jaccard_2',
 'score_cosine_2',
 'score_euclidean_2',
 'score_manhattan_2',
 'score_jaccard_3',
 'score_cosine_3',
 'score_euclidean_3',
 'score_manhattan_3',
 'score_jaccard_4',
 'score_cosine_4',
 'score_euclidean_4',
 'score_manhattan_4',
 'score_jaccard_5',
 'score_cosine_5',
 'score_euclidean_5',
 'score_manhattan_5',
 'score_jaccard_6',
 'score_cosine_6',
 'score_euclidean_6',
 'score_manhattan_6',
 'score_jaccard_7',
 'score_cosine_7',
 'score_euclidean_7',
 'score_manhattan_7',
 'score_jaccard_8',
 'score_cosine_8',
 'score_euclidean_8',
 'score_manhattan_8',
 'score_jaccard_9',
 'score_cosine_9',
 'score_euclidean_9',
 'score_manhattan_9',
 'score_jaccard_10',
 'score_cosine_10',
 'score_euclidean_10',
 'score_manhattan_10',
 'score_jaccard_11',
 'score_cosine_11',
 'score_euclidean_11',
 'score_manhattan_1

In [5]:
train_idx, val_idx = train_test_split(range(len(all_train_df)), test_size=0.2, random_state=42)
X_train = all_train_df[feature_names].iloc[train_idx]
X_val = all_train_df[feature_names].iloc[val_idx]
y_train = all_train_df.gs.iloc[train_idx]
y_val = all_train_df.gs.iloc[val_idx]
X_test = all_test_df[feature_names]
y_test = all_test_df.gs

In [6]:
results = []

def pearsonr_score(estimator, X, y):
    """Scorer function that matches sklearn's expected signature"""
    y_pred = estimator.predict(X)
    return pearsonr(y, y_pred)[0]

model = RandomForestRegressor()

scores = cross_val_score(
    model, 
    all_train_df[feature_names],
    all_train_df.gs,
    cv=5,
    n_jobs=-1,
    scoring=pearsonr_score
)
print(f"Cross-validation Pearson: {np.mean(scores)}+-{np.std(scores)}")


Validation Pearson: 0.86462909278209+-0.018397283792106187


In [7]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [8]:
feature_importances = model.feature_importances_
ranked_feature_importance_indices = np.argsort(feature_importances)[::-1]
feature_steps['importance'] = feature_importances
feature_steps.sort_values(by='importance', ascending=False)

Unnamed: 0,name,metric,step_0,step_1,step_2,step_3,step_4,step_5,step_6,step_7,importance
1029,score_cosine_257,cosine,sentence_to_doc,chunk_NEs,remove_stopwords,lemmatize_tokens,get_characters,get_2grams,,,0.133096
660,score_jaccard_165,jaccard,sentence_to_doc,get_tokens,remove_stopwords,lemmatize_tokens,get_characters,get_2grams,,,0.088100
1033,score_cosine_258,cosine,sentence_to_doc,chunk_NEs,remove_stopwords,lemmatize_tokens,get_characters,get_3grams,,,0.063004
661,score_cosine_165,cosine,sentence_to_doc,get_tokens,remove_stopwords,lemmatize_tokens,get_characters,get_2grams,,,0.062342
1032,score_jaccard_258,jaccard,sentence_to_doc,chunk_NEs,remove_stopwords,lemmatize_tokens,get_characters,get_3grams,,,0.047072
...,...,...,...,...,...,...,...,...,...,...,...
1406,score_euclidean_351,euclidean,sentence_to_doc,get_tokens,remove_stopwords,get_stopwords,lemmatize_tokens,get_characters,get_4grams,,0.000000
1407,score_manhattan_351,manhattan,sentence_to_doc,get_tokens,remove_stopwords,get_stopwords,lemmatize_tokens,get_characters,get_4grams,,0.000000
1408,score_jaccard_352,jaccard,sentence_to_doc,get_tokens,remove_stopwords,get_stopwords,lemmatize_tokens,remove_non_alnum,,,0.000000
1409,score_cosine_352,cosine,sentence_to_doc,get_tokens,remove_stopwords,get_stopwords,lemmatize_tokens,remove_non_alnum,,,0.000000


In [9]:
feature_steps.sort_values(by='importance', ascending=False).to_csv("../data/feature_steps_with_importance.csv")

In [10]:
nonzero_feature_importances = feature_importances > 0
top_indexes = ranked_feature_importance_indices[nonzero_feature_importances]
top_features = np.array(feature_names)[top_indexes]
top_features
len(top_features)

1248

In [21]:
model = RandomForestRegressor()
model.fit(all_train_df[top_features], all_train_df.gs)

In [22]:
def evaluate_model_across_train_val_test(model, selected_features):
    results = []
    datasets = train_datasets
    for dataset in datasets:
        dataset_results = []
        dataset_results.append(dataset)
        
        X_train = all_train_df.iloc[train_idx]
        X_train = X_train.loc[X_train.dataset == dataset][selected_features]
        y_train = all_train_df.iloc[train_idx]
        y_train = y_train.loc[y_train.dataset == dataset].gs

        X_val = all_train_df.iloc[val_idx]
        X_val = X_val.loc[X_val.dataset == dataset][selected_features]
        y_val = all_train_df.iloc[val_idx]
        y_val = y_val.loc[y_val.dataset == dataset].gs
        
        X_test = all_test_df
        X_test = X_test.loc[X_test.dataset == dataset][selected_features]
        y_test = all_test_df.loc[all_test_df.dataset == dataset].gs

        preds = model.predict(X_train)
        dataset_results.append(pearsonr(y_train, preds)[0])

        preds = model.predict(X_val)
        dataset_results.append(pearsonr(y_val, preds)[0])

        preds = model.predict(X_test)
        dataset_results.append(pearsonr(y_test, preds)[0])

        results.append(dataset_results)

    dataset_results = []
    dataset_results.append('all')
    preds = model.predict(all_train_df.iloc[train_idx][top_features])
    dataset_results.append(pearsonr(all_train_df.iloc[train_idx].gs, preds)[0])

    preds = model.predict(all_train_df.iloc[val_idx][top_features])
    dataset_results.append(pearsonr(all_train_df.iloc[val_idx].gs, preds)[0])

    preds = model.predict(all_test_df[all_test_df.dataset.isin(train_datasets)][top_features])
    dataset_results.append(pearsonr(all_test_df[all_test_df.dataset.isin(train_datasets)].gs, preds)[0])

    results.append(dataset_results)

    results = pd.DataFrame(results, columns=["dataset", "train_pearson", "val_pearson", "test_pearson"])
    return results

results = evaluate_model_across_train_val_test(model, top_features)
results


Unnamed: 0,dataset,train_pearson,val_pearson,test_pearson
0,MSRpar,0.973246,0.96765,0.643555
1,MSRvid,0.983355,0.980284,0.840581
2,SMTeuroparl,0.965687,0.970322,0.504185
3,all,0.98539,0.983708,0.806516


In [17]:
test_results = []
datasets = test_datasets
for dataset in datasets:
    dataset_results = []
    dataset_results.append(dataset)

    preds = selected_features_model.predict(all_test_df[all_test_df.dataset == dataset][selected_features])
    dataset_results.append(pearsonr(all_test_df[all_test_df.dataset == dataset].gs, preds)[0])

    test_results.append(dataset_results)

preds = selected_features_model.predict(all_test_df[selected_features])
test_results.append(['all', pearsonr(all_test_df.gs, preds)[0]])
test_results = pd.DataFrame(test_results, columns=["dataset", "test_pearson"])
test_results

Unnamed: 0,dataset,test_pearson
0,MSRpar,0.640475
1,MSRvid,0.843576
2,SMTeuroparl,0.512235
3,OnWN,0.641766
4,SMTnews,0.436355
5,all,0.733967


In [18]:
test_results = []
datasets = test_datasets
for dataset in datasets:
    dataset_results = []
    dataset_results.append(dataset)

    preds = selected_features_model.predict(all_test_df[all_test_df.dataset == dataset][selected_features])
    dataset_results.append(pearsonr(all_test_df[all_test_df.dataset == dataset].gs, preds)[0])

    test_results.append(dataset_results)

preds = selected_features_model.predict(all_test_df[selected_features])
test_results.append(['all', pearsonr(all_test_df.gs, preds)[0]])
test_results = pd.DataFrame(test_results, columns=["dataset", "test_pearson"])
test_results

Unnamed: 0,dataset,test_pearson
0,MSRpar,0.640475
1,MSRvid,0.843576
2,SMTeuroparl,0.512235
3,OnWN,0.641766
4,SMTnews,0.436355
5,all,0.733967


In [19]:
results_to_beat = pd.DataFrame(np.array([[.683, .873, .528, .664, .493, 0.823]]).T, index=[*test_datasets, 'all'], columns=["pearson_to_beat"])
results_to_beat

Unnamed: 0,pearson_to_beat
MSRpar,0.683
MSRvid,0.873
SMTeuroparl,0.528
OnWN,0.664
SMTnews,0.493
all,0.823
