In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from collections import Counter

from plotting_utils import *
from dataloader import *
from label_ranking import *

# Comparing RFR and RPC on the fragment data

In [2]:
fragment_dataset = ScienceDataset(False, "fragment", 1)
X_fp = fragment_dataset.X_fp
y_yield = fragment_dataset.y_yield
y_rank = fragment_dataset.y_ranking
print(X_fp.shape, y_yield.shape, y_rank.shape)

(383, 1024) (383, 4) (383, 4)


In [3]:
Counter(list(np.where(y_rank == 1)[1]))

Counter({0: 214, 2: 111, 3: 42, 1: 16})

In [4]:
### Baseline of choosing the top two frequent best condition
print("always choosing Cu:", round(np.mean(np.reciprocal(y_rank[:,0])), 3))
print("always choosing Pd:", round(np.mean(np.reciprocal(y_rank[:,2])), 3))

always choosing Cu: 0.559
always choosing Pd: 0.29


In [4]:
def rr(rank_true, rank_pred):
    rr = np.array([
        1 / rank_true[a, np.argmin(x)]
        for a, x in enumerate(rank_pred)
    ]).mean()
    return rr

variance_sum = 0.0
first_diff = None

rf_mrr_scores = []
rpc_mrr_scores = []
# Conducting the 5x2cv paired t-test as implemented in mlxtend
for seed in range(42,47):
    print(seed)
    X_1, X_2, y_rank_1, y_rank_2, y_yield_1, y_yield_2 = train_test_split(X_fp, y_rank, y_yield, test_size=0.5, random_state=seed)
    
    # RFR
    gcv = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid={"n_estimators":[50,100,200], 
                    "max_depth": [5, 10, None]},
        scoring="r2",
        n_jobs=-1,
        cv=4
    )
    first_set_of_preds = []
    second_set_of_preds = []
    for i in range(4) :
        gcv.fit(X_1, y_rank_1[:, i].flatten())
        first_set_of_preds.append(gcv.predict(X_2).reshape(-1, 1))
        gcv.fit(X_2, y_rank_2[:, i].flatten())
        second_set_of_preds.append(gcv.predict(X_1).reshape(-1, 1))
    y_rank_rfr_pred_1 = yield_to_ranking(np.hstack(tuple(first_set_of_preds)))
    y_rank_rfr_pred_2 = yield_to_ranking(np.hstack(tuple(second_set_of_preds)))
    rf_mrr1 = rr(y_rank_2, y_rank_rfr_pred_1)
    rf_mrr2 = rr(y_rank_1, y_rank_rfr_pred_2)
    rf_mrr_scores.extend([rf_mrr1, rf_mrr2])
    # RPC
    rpc1 = RPC()
    rpc2 = RPC()
    rpc1.fit(X_1, y_rank_1)
    rpc_mrr1 = rr(y_rank_2, rpc1.predict(X_2))
    rpc2.fit(X_2, y_rank_2)
    rpc_mrr2 = rr(y_rank_1, rpc2.predict(X_1))
    rpc_mrr_scores.extend([rpc_mrr1, rpc_mrr2])
    # Getting statistics
    score_diff_1 = rf_mrr1 - rpc_mrr1
    score_diff_2 = rf_mrr2 - rpc_mrr2
    score_mean = (score_diff_1 + score_diff_2) / 2.0
    score_var = (score_diff_1 - score_mean) ** 2 + (score_diff_2 - score_mean) ** 2
    variance_sum += score_var
    if first_diff is None:
        first_diff = score_diff_1

numerator = first_diff
denominator = np.sqrt(1 / 5.0 * variance_sum)
t_stat = numerator / denominator

pvalue = stats.t.sf(np.abs(t_stat), 5) * 2.0
t = float(t_stat)
p = float(pvalue)

42


43
44
45
46


In [5]:
print(t, p)
print("RFR Mean Reciprocal Rank:", round(np.mean(np.array(rf_mrr_scores)), 3))
print("RPC Mean Reciprocal Rank:", round(np.mean(np.array(rpc_mrr_scores)), 3))

-20.03530002169009 5.725349637123246e-06
RFR Mean Reciprocal Rank: 0.359
RPC Mean Reciprocal Rank: 0.722
