In [1]:
import spacy
from typing import Tuple
import nltk
import numpy as np
import pandas as pd
from functools import partial
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from scipy.stats import pearsonr

from zp_ihlt_project.config import TRAIN_DATA_DIR, TEST_DATA_DIR

In [2]:
import spacy
from typing import Tuple
import nltk
import numpy as np
import pandas as pd
from functools import partial
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from scipy.stats import pearsonr

from zp_ihlt_project.config import TRAIN_DATA_DIR, TEST_DATA_DIR

In [3]:
all_train_dt = pd.read_csv("../data/processed/train_data_with_features.csv")
all_test_dt = pd.read_csv("../data/processed/test_data_with_features.csv")
feature_steps_df = pd.read_csv("../data/feature_steps.csv")
feature_names = [col for col in all_train_dt.columns if col.startswith("score_")]

In [4]:
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

param_grid = {
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

model = DecisionTreeRegressor()

grid_search = GridSearchCV(
    model, 
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(all_train_dt[feature_names], all_train_dt.gs)
best_dt_model = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [5]:

feature_selector = SelectFromModel(best_dt_model, threshold=0.000000001).fit(all_train_dt[feature_names], all_train_dt.gs)
selected_features_idx = feature_selector.get_support()
selected_features = np.array(feature_names)[selected_features_idx]
selected_features
feature_steps_df[selected_features_idx]
# selected_features.shape

Unnamed: 0,metric,step_0,step_1,step_2,step_3,step_4,step_5,step_6,step_7
32,jaccard,sentence_to_doc,get_tokens,get_token_text,,,,,
53,cosine,sentence_to_doc,get_tokens,get_synsets,get_2grams,,,,
89,cosine,sentence_to_doc,chunk_NEs,lemmatize_tokens,get_3grams,,,,
132,jaccard,sentence_to_doc,get_tokens,lemmatize_tokens,get_characters,get_2grams,,,
133,cosine,sentence_to_doc,get_tokens,lemmatize_tokens,get_characters,get_2grams,,,
154,euclidean,sentence_to_doc,get_tokens,lemmatize_tokens,remove_non_alnum,get_3grams,,,
211,manhattan,sentence_to_doc,get_tokens,get_synsets,remove_non_alnum,,,,
238,euclidean,sentence_to_doc,get_tokens,remove_stopwords,get_pos_tags,get_4grams,,,
342,euclidean,sentence_to_doc,get_tokens,get_stopwords,get_synsets,get_2grams,,,
356,jaccard,sentence_to_doc,chunk_NEs,lemmatize_tokens,get_characters,get_2grams,,,


In [6]:
all_train_dt[selected_features].to_csv("../data/selected/train_data_with_features_selected.csv")
all_test_dt[selected_features].to_csv("../data/selected/test_data_with_features_selected.csv")

In [7]:
train_datasets = all_train_dt.dataset.unique().tolist()
test_datasets = all_test_dt.dataset.unique().tolist()

In [8]:
from sklearn.ensemble import RandomForestRegressor

dt = all_train_dt

X_train, X_val, y_train, y_val = train_test_split(dt[selected_features], dt.gs, test_size=0.2, random_state=42)
X_test = all_test_dt[selected_features]
y_test = all_test_dt.gs

results = []

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
}

model = RandomForestRegressor()

# Perform grid search with 5-fold cross validation
grid_search = GridSearchCV(
    model, 
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", np.sqrt(-grid_search.best_score_))

best_model = grid_search.best_estimator_
preds = best_model.predict(X_train)
results.append(pearsonr(y_train, preds)[0])

preds = best_model.predict(X_val)
results.append(pearsonr(y_val, preds)[0])

preds = best_model.predict(X_test)
results.append(pearsonr(y_test, preds)[0])

results = pd.DataFrame([results], columns=["train_pearson", "val_pearson", "test_pearson"], index=["all"])
results

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters: {'n_estimators': 200}
Best RMSE: 0.7291677824676293


Unnamed: 0,train_pearson,val_pearson,test_pearson
all,0.984887,0.834242,0.712135


In [9]:
best_model.fit(dt[selected_features], dt.gs)

In [10]:
train_results = []
datasets = train_datasets
for dataset in datasets:
    dataset_results = []
    dataset_results.append(dataset)
    dt = all_train_dt[all_train_dt.dataset == dataset]
    test_dt = all_test_dt[all_test_dt.dataset == dataset]

    X_train, X_val, y_train, y_val = train_test_split(dt[selected_features], dt.gs, test_size=0.2, random_state=42)
    X_test = test_dt[selected_features]
    y_test = test_dt.gs

    preds = best_model.predict(X_train)
    dataset_results.append(pearsonr(y_train, preds)[0])

    preds = best_model.predict(X_val)
    dataset_results.append(pearsonr(y_val, preds)[0])

    preds = best_model.predict(X_test[selected_features])
    dataset_results.append(pearsonr(y_test, preds)[0])

    train_results.append(dataset_results)

train_results = pd.DataFrame(train_results, columns=["dataset", "train_pearson", "val_pearson", "test_pearson"])
train_results

Unnamed: 0,dataset,train_pearson,val_pearson,test_pearson
0,MSRpar,0.972978,0.972423,0.622304
1,MSRvid,0.981475,0.984037,0.833832
2,SMTeuroparl,0.966676,0.956233,0.52899


In [11]:
test_results = []
datasets = test_datasets
for dataset in datasets:
    dataset_results = []
    dataset_results.append(dataset)
    test_dt = all_test_dt[all_test_dt.dataset == dataset]

    X_test = test_dt[selected_features]
    y_test = test_dt.gs

    preds = best_model.predict(X_test[selected_features])
    dataset_results.append(pearsonr(y_test, preds)[0])

    test_results.append(dataset_results)

preds = best_model.predict(all_test_dt[selected_features])
test_results.append(['all', pearsonr(all_test_dt.gs, preds)[0]])
test_results = pd.DataFrame(test_results, columns=["dataset", "test_pearson"])
test_results

Unnamed: 0,dataset,test_pearson
0,MSRpar,0.622304
1,MSRvid,0.833832
2,SMTeuroparl,0.52899
3,OnWN,0.647885
4,SMTnews,0.420387
5,all,0.715012


In [12]:
results_to_beat = pd.DataFrame(np.array([[.683, .873, .528, .664, .493, 0.823]]).T, index=[*test_datasets, 'all'], columns=["pearson_to_beat"])
results_to_beat

Unnamed: 0,pearson_to_beat
MSRpar,0.683
MSRvid,0.873
SMTeuroparl,0.528
OnWN,0.664
SMTnews,0.493
all,0.823
