# Feature evaluation testing

In [6]:
import rank_eval_pipeline as rep
from helper_functions import get_true_baseline, area_under_the_curve, corrected_performance_time_metric
import numpy as np

def evaluate_ranking(features, scores, avg_exec_time=np.inf):
    RE = rep.RankEval("","")
    RE.ranking = features
    RE.scores = scores
    RE.evaluate_ranking()
    t = 0

    baseline = get_true_baseline()

    auc_first_gen = np.mean(RE.eval_res_first_gen[0] - baseline)/(1 - area_under_the_curve(baseline))
    auc_singles = np.mean(RE.eval_res_singles[0] - baseline)/(1 - area_under_the_curve(baseline))

    return {"auc_first_gen": auc_first_gen, "auc_singles": auc_singles, "exec_time": t}

In [51]:
import json
from collections import defaultdict
from operator import itemgetter

def average_ranking(rankings, times):
    """
    Using the borda count method, average the rankings in the list of rankings
    """
    scores = defaultdict(int)
    for ranking in rankings:
        for i, feature in enumerate(ranking):
            scores[feature] += len(ranking) - i

    # sort by score, highest first
    average_ranking = sorted(scores.items(), key=itemgetter(1), reverse=True)

    # extract the features and their scores from the (feature, score) pairs
    average_features = [feature for feature, score in average_ranking]
    average_scores = [score for feature, score in average_ranking]
    average_time = np.mean(times)

    return average_features, average_scores, average_time

def read_ranking(filename):
    with open(filename) as f:
        ranking = json.load(f)
    return ranking["results"]["features"]

In [15]:
import os

def get_file_lists(subsample_proportions, directory, hash='all'):
    """
    Returns a dictionary of lists of files for each subsampling proportion.

    If subsample_

    """
    # initialize dictionary to subsample_proportions as keys and empty lists as values
    files = {subsample_proportion: [] for subsample_proportion in subsample_proportions}
    
    all_files = os.listdir(directory)
    for subsample_proportion in subsample_proportions:
        for file in all_files:
            if (f"sub{subsample_proportion}_" in file and f"features-{hash}" in file):
                files[subsample_proportion].append(directory + file)
    return files

In [50]:
rf_list = get_file_lists([0.1], "results/random_forest_0.1_batch/")
rf_list = rf_list[0.1]
rf_list = [read_ranking(file) for file in rf_list]
# this is an ensemble of all sub 0.1 random forest rankings
RF_ensemble_ranking = average_ranking(rf_list)
result1 = evaluate_ranking(RF_ensemble_ranking[0], RF_ensemble_ranking[1])
# this is the ranking for the total dataset
resutlt2 = corrected_performance_time_metric('results/random_forest_score_seed0_sub1.0_features-all.json')
print("Performance of ensemble ranking: ", result1)
print("Performance of total dataset ranking: ", resutlt2)

Performance of ensemble ranking:  {'auc_first_gen': 0.5214661177060711, 'auc_singles': 0.14547636923520285, 'exec_time': 0}
Performance of total dataset ranking:  {'auc_first_gen': 0.47056990830295586, 'auc_singles': 0.12423395028042686, 'exec_time': 276.79150891304016}


In [48]:
rf_list = get_file_lists([0.01], "results/random_forest_0.01_batch/")
rf_list = rf_list[0.01]
rf_list = [read_ranking(file) for file in rf_list]
# this is an ensemble of all sub 0.01 random forest rankings
RF_ensemble_ranking = average_ranking(rf_list)
result1 = evaluate_ranking(RF_ensemble_ranking[0], RF_ensemble_ranking[1])
# this is the ranking for the total dataset
resutlt2 = corrected_performance_time_metric('results/random_forest_score_seed0_sub1.0_features-all.json')
print("Performance of ensemble ranking: ", result1)
print("Performance of total dataset ranking: ", resutlt2)

Performance of ensemble ranking:  {'auc_first_gen': 0.5017874133348473, 'auc_singles': 0.14984106673748165, 'exec_time': 0}
Performance of total dataset ranking:  {'auc_first_gen': 0.47056990830295586, 'auc_singles': 0.12423395028042686, 'exec_time': 276.79150891304016}


In [49]:
rf_list = get_file_lists([0.001], "results/random_forest_optimization/")
rf_list = rf_list[0.001]
rf_list = [read_ranking(file) for file in rf_list]
# this is an ensemble of all sub 0.001 random forest rankings
RF_ensemble_ranking = average_ranking(rf_list)
result1 = evaluate_ranking(RF_ensemble_ranking[0], RF_ensemble_ranking[1])
# this is the ranking for the total dataset
resutlt2 = corrected_performance_time_metric('results/random_forest_score_seed0_sub1.0_features-all.json')
print("Performance of ensemble ranking: ", result1)
print("Performance of total dataset ranking: ", resutlt2)

Performance of ensemble ranking:  {'auc_first_gen': 0.47949814892709847, 'auc_singles': 0.14195247239119385, 'exec_time': 0}
Performance of total dataset ranking:  {'auc_first_gen': 0.47056990830295586, 'auc_singles': 0.12423395028042686, 'exec_time': 276.79150891304016}


In [53]:
subsampling = 0.001
rf_list = get_file_lists([subsampling], "results/random_forest_optimization/")
rf_list = rf_list[subsampling]
rf_list = [read_ranking(file) for file in rf_list]
# this is an ensemble of all sub 0.001 random forest rankings
RF_ensemble_ranking = average_ranking(rf_list)
result1 = evaluate_ranking(RF_ensemble_ranking[0], RF_ensemble_ranking[1])
# this is the ranking for the total dataset
resutlt2 = corrected_performance_time_metric('results/random_forest_score_seed0_sub1.0_features-all.json')
print(f"Performance of ensemble ranking for {subsampling} subsampling: \n", result1)
print("Performance of total dataset ranking: \n", resutlt2)

Performance of ensemble ranking for 0.001 subsampling: 
 {'auc_first_gen': 0.47949814892709847, 'auc_singles': 0.14195247239119385, 'exec_time': 0}
Performance of total dataset ranking: 
 {'auc_first_gen': 0.47056990830295586, 'auc_singles': 0.12423395028042686, 'exec_time': 276.79150891304016}


In [59]:
subsampling = 0.0004
rf_list = get_file_lists([subsampling], "results/random_forest_optimization/")
rf_list = rf_list[subsampling]
rf_list = [read_ranking(file) for file in rf_list]
# this is an ensemble of all sub 0.0001 random forest rankings
RF_ensemble_ranking = average_ranking(rf_list)
result1 = evaluate_ranking(RF_ensemble_ranking[0], RF_ensemble_ranking[1])
# this is the ranking for the total dataset
resutlt2 = corrected_performance_time_metric('results/random_forest_score_seed0_sub1.0_features-all.json')
print(f"Performance of ensemble ranking for {subsampling} subsampling: \n", result1)
print("Performance of total dataset ranking: \n", resutlt2)

Performance of ensemble ranking for 0.0004 subsampling: 
 {'auc_first_gen': 0.4687214436628085, 'auc_singles': 0.14498408132936036, 'exec_time': 0}
Performance of total dataset ranking: 
 {'auc_first_gen': 0.47056990830295586, 'auc_singles': 0.12423395028042686, 'exec_time': 276.79150891304016}


In [41]:
random_forest_ranking = read_ranking()
ran

['feature98',
 'feature99',
 'feature88',
 'feature89',
 'feature84',
 'feature87',
 'feature86',
 'feature83',
 'feature85',
 'feature23',
 'feature82',
 'feature22',
 'feature81',
 'feature5',
 'feature49',
 'feature48',
 'feature32',
 'feature56',
 'feature1',
 'feature90',
 'feature39',
 'feature91',
 'feature71',
 'feature15',
 'feature2',
 'feature12',
 'feature10',
 'feature3',
 'feature33',
 'feature50',
 'feature36',
 'feature21',
 'feature55',
 'feature0',
 'feature18',
 'feature37',
 'feature62',
 'feature59',
 'feature57',
 'feature42',
 'feature17',
 'feature8',
 'feature40',
 'feature60',
 'feature9',
 'feature51',
 'feature19',
 'feature20',
 'feature14',
 'feature58',
 'feature27',
 'feature7',
 'feature44',
 'feature26',
 'feature67',
 'feature52',
 'feature47',
 'feature16',
 'feature29',
 'feature4',
 'feature30',
 'feature11',
 'feature46',
 'feature64',
 'feature41',
 'feature66',
 'feature43',
 'feature28',
 'feature38',
 'feature69',
 'feature54',
 'feature53',
 

In [3]:
xgboost_ranking = read_ranking('results/xgboost_score_seed0_sub1.0_features-all.json')

pearson_correlation_ranking = read_ranking('results/pearson_correlation_score_seed0_sub1.0_features-all.json')

In [4]:
new_ranking = average_ranking([random_forest_ranking, pearson_correlation_ranking])

In [5]:
evaluate_ranking(new_ranking[0], new_ranking[1])

{'auc_first_gen': 0.4600267567954295,
 'auc_singles': 0.08264037262534994,
 'exec_time': 0}

In [10]:
corrected_performance_time_metric("results/pearson_correlation_score_seed0_sub1.0_features-all.json", shuffle_correction=True)

{'auc_first_gen': 0.39817829102262176,
 'auc_singles': 0.048860216981625515,
 'exec_time': 2.2313971519470215}

In [9]:
corrected_performance_time_metric('results/random_forest_score_seed0_sub1.0_features-all.json', shuffle_correction=True)

{'auc_first_gen': 0.47056990830295586,
 'auc_singles': 0.12423395028042686,
 'exec_time': 276.79150891304016}

In [7]:
corrected_performance_time_metric('results/xgboost_score_seed0_sub1.0_features-all.json', shuffle_correction=True)

{'auc_first_gen': 0.09015312352184,
 'auc_singles': 0.04834261482073889,
 'exec_time': 33.497220277786255}

In [19]:
data = pd.read_csv('data/full_data.csv')

In [21]:
# print out details about feature23
data['feature23'].unique()

array([2224352179, 1066182028, 3103603272, 3770660814,  429120973,
        553222847, 3552167677, 3071463162, 2282816406, 1625990768,
       4286417659, 3750766477, 2733500236, 1622987352, 1145370728,
       3898133265,  928562214, 1343824683, 2386995490, 2882519864,
       2360873032, 1709828517,  645747896, 1928497154,  152192149,
       1408821955, 1258086162, 2469287282, 4041503919, 3899963624,
       3259400523, 2106730950, 1675661237,  108838858,  413545197,
       4294832690, 1243073278], dtype=int64)

In [2]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

full_data = pd.read_csv('data/full_data.csv')

In [7]:
data = full_data.sample(frac=0.1, random_state=0)
X, y = full_data.drop(["info_click_valid"], axis=1), full_data['info_click_valid']
model = xgb.XGBClassifier()
model.fit(X, y)
model.feature_importances_

array([0.0000000e+00, 9.4202566e-09, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e

In [4]:
X, y = full_data.drop(["info_click_valid"], axis=1), full_data['info_click_valid']
correlations = X.apply(lambda x: x.corr(y))
correlations.sort_values(ascending=False)
print(correlations)

feature0    -0.026055
feature1    -0.035900
feature2    -0.014218
feature3     0.022289
feature4    -0.033770
               ...   
feature95         NaN
feature96         NaN
feature97         NaN
feature98    1.000000
feature99   -1.000000
Length: 100, dtype: float64


In [9]:
data = full_data.sample(frac=0.1, random_state=0)
data = data.drop(['feature99',"feature98"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data.drop(['info_click_valid'], axis=1), data['info_click_valid'], test_size=0.2, random_state=0) 

In [74]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)
print(log_loss(y_test, y_pred))

0.3773439756674521


In [10]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)
print(log_loss(y_test, y_pred))

0.43197243402039104


In [64]:
y_pred

array([0.2, 0.2, 0.2, ..., 0.2, 0.2, 0.2])

In [51]:
np.sum(full_data["info_click_valid"])/len(full_data["info_click_valid"])

0.1988247022337397

In [54]:
np.log(0.2/0.8)

-1.3862943611198906

In [49]:
best_features = []
X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

for i in range(len(X_train.columns)):
    best_score = np.inf  # Initialize to infinity
    best_feature = None
    for feature in X_train_copy.columns:
        if feature not in best_features:  # Only consider features not yet in best_features
            temp_features = best_features.copy()
            temp_features.append(feature)
            model = xgb.XGBClassifier()
            model.fit(X_train_copy[temp_features], y_train)
            predictions = model.predict_proba(X_test_copy[temp_features])[:, 1]  # Get probability estimates
            score = log_loss(y_test, predictions)
            if score < best_score:  # Change comparison to less than
                best_score = score
                best_feature = feature
    best_features.append(best_feature)
    print(best_feature, best_score)

feature23 0.38630625195049645


KeyboardInterrupt: 

In [43]:
best_features = []
X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

for i in range(len(X_train.columns)):
    best_score = np.inf  # Initialize to infinity
    best_feature = None
    for feature in X_train_copy.columns:
        if feature not in best_features:  # Only consider features not yet in best_features
            temp_features = best_features.copy()
            temp_features.append(feature)
            model = xgb.XGBClassifier()
            model.fit(X_train_copy[temp_features], y_train)
            predictions = model.predict_proba(X_test_copy[temp_features])[:, 1]  # Get probability estimates
            score = log_loss(y_test, predictions)
            if score < best_score:  # Change comparison to less than
                best_score = score
                best_feature = feature
    best_features.append(best_feature)
    print(best_feature, best_score)


KeyboardInterrupt: 

In [6]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
import pandas as pd

In [1]:
from rank_eval_pipeline import RankEval
import rank_algos
import pandas as pd

In [24]:
data = pd.read_csv('data/full_data.csv')
# # subsample the data
# data = data.sample(frac=0.01, random_state=42)
data = data.drop(columns=['feature98', 'feature99'])
# subsample the data
data = data.sample(frac=0.001, random_state=42)

In [25]:
X, y = data.drop("info_click_valid", axis=1), data["info_click_valid"]

# make stratified split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

In [26]:
# apply xgboost on training data
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)

In [27]:
y_proba_pred = model.predict_proba(X_test)

## Model performance on 0.1% of the dataset (without features 98,99)

In [28]:
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
logloss = log_loss(y_test, [i[0] for i in y_proba_pred])
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Logloss: %.2f" % (logloss))

Accuracy: 81.59%
Logloss: 3.72


## Model performance on entire dataset (without features 98,99)

In [22]:
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
logloss = log_loss(y_test, [i[0] for i in y_proba_pred])
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Logloss: %.2f" % (logloss))

Accuracy: 83.16%
Logloss: 2.09


In [23]:
model.feature_importances_

array([0.0067364 , 0.00763135, 0.00293042, 0.00312282, 0.06042694,
       0.04295708, 0.00168319, 0.00422322, 0.00311172, 0.00438067,
       0.00343244, 0.00571202, 0.00384746, 0.00536718, 0.07457934,
       0.00502488, 0.01268342, 0.00725955, 0.00482081, 0.0035692 ,
       0.00490443, 0.00314804, 0.00687523, 0.2657493 , 0.0026869 ,
       0.00228163, 0.00745443, 0.00318208, 0.00269152, 0.0021485 ,
       0.00198203, 0.00624343, 0.04997206, 0.00205908, 0.00262307,
       0.00297043, 0.00245738, 0.00285267, 0.00240105, 0.00234927,
       0.00257787, 0.00364806, 0.0026798 , 0.01090775, 0.00363707,
       0.00425268, 0.00335931, 0.00242065, 0.02436822, 0.00337768,
       0.00247684, 0.0028435 , 0.0025859 , 0.00254479, 0.00354356,
       0.00226208, 0.01080691, 0.0021332 , 0.00211958, 0.00252539,
       0.00216155, 0.00319993, 0.02726843, 0.00379415, 0.00245245,
       0.00216157, 0.00243531, 0.00232389, 0.00241675, 0.00752994,
       0.        , 0.00589758, 0.00314215, 0.        , 0.     

In [7]:
data.head()

Unnamed: 0,info_click_valid,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature90,feature91,feature92,feature93,feature94,feature95,feature96,feature97,feature98,feature99
0,1,3282490636,3723861767,592628169,2093237750,3617548381,2965721776,3543218850,1072184752,1418147852,...,3637550824,3790689556,2087688982,3899436442,3255798468,355094504,3479975775,3411024218,3656293508,2846559631
1,1,2736633149,3188050045,3599848058,2300041243,3503728323,1531702445,3543218850,2634671414,2266512440,...,3637550824,3790689556,2087688982,3899436442,3255798468,355094504,3479975775,3411024218,3656293508,2846559631
2,1,1067402948,108565582,2502341442,3474641266,3503728323,1531702445,3543218850,4108477053,870407079,...,147472623,3508206064,2087688982,3899436442,3255798468,355094504,3479975775,3411024218,3656293508,2846559631
3,1,3634365896,2567512859,753387811,2124887610,3503728323,2965721776,3543218850,4108477053,3981694603,...,147472623,3508206064,2087688982,3899436442,3255798468,355094504,3479975775,3411024218,3656293508,2846559631
4,1,3282490636,1241498584,99521842,868270843,3503728323,1531702445,3543218850,4108477053,3377404711,...,147472623,2450605253,2087688982,3899436442,3255798468,355094504,3479975775,3411024218,3656293508,2846559631


### reliefF

In [None]:
RE = RankEval(data, rank_method=rank_algos.ReliefF_score, seed=0, subsampling_proportion=subsampling)
RE.get_scores()

### SURF

In [None]:
RE = RankEval(data, rank_method=rank_algos.SURF_score, seed=0, subsampling_proportion=subsampling)
RE.get_scores()

### SURF*

In [None]:
RE = RankEval(data, rank_method=rank_algos.SURFstar_score, seed=0, subsampling_proportion=subsampling)
RE.get_scores()

### MultiSURF

In [None]:
RE = RankEval(data, rank_method=rank_algos.MultiSURF_score, seed=0, subsampling_proportion=subsampling)
RE.get_scores()

### MultiSURFstar

In [None]:
RE = RankEval(data, rank_method=rank_algos.MultiSURFstar_score, seed=0, subsampling_proportion=subsampling)
RE.get_scores()

### reliefE

In [None]:
RE = RankEval(data, rank_method=rank_algos.ReliefE_score, seed=0, subsampling_proportion=subsampling)
RE.get_scores()

### XGBoost

In [None]:
RE = RankEval(data, rank_method=rank_algos.xgboost_score, seed=0, subsampling_proportion=subsampling)
RE.get_scores()

### Random Forest Classifier

In [None]:
RE = RankEval(data, rank_method=rank_algos.random_forest_score, seed=0, subsampling_proportion=subsampling)
RE.get_scores()

### Chi-Squared

In [None]:
RE = RankEval(data, rank_method=rank_algos.chi2_score, seed=0, subsampling_proportion=subsampling)
RE.get_scores()

### Permutation Importance
Problem is we need to build a model, which means choosing a model.

In [None]:
# perm_model = RandomForestClassifier()
# perm_model.fit(X, y)
# perm_results = permutation_importance(perm_model, X, y, n_repeats=5, random_state=0, n_jobs=-1)
# perm_results["importances_mean"]

### Linear regression

In [None]:
# lr_ranker = LinearRegression()
# lr_ranker.fit(X, y)
# lr_results = {i:score for i, score in enumerate(lr_ranker.coef_)}