In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

bin_count = 171
def create_test_train(data_set_path, test_size=0.10):
    """ Splits a given csv file into testing and training. Target column is all the bins."""
    # Make sure the columns are set
    data_set = pd.read_csv(data_set_path)

    # Shuffle the data
    data_set = data_set.sample(frac=1, random_state=0)
 
    # Select all except output bins
    data_set_X = data_set.drop([f'Output_Bin_{i}' for i in range(bin_count)], axis=1)
    # Select only the output bins
    data_set_Y = data_set[[f'Output_Bin_{i}' for i in range(bin_count)]]

    #Split into training and test data
    return train_test_split(data_set_X,
                            data_set_Y,
                            test_size=test_size, 
                            random_state=300)

filename= "/project/SDS-capstones-kropko21/uva-astronomy/dust_training_data_all_bins_v2.csv"
X_train, X_test, y_train, y_test = create_test_train(filename, test_size=0.10)

def evaluate_fit(y_samples, y_test):
    ent = []
    js_list = []

    # Turn all negative preds to 0
    y_samples = np.clip(y_samples,0, a_max=None)
    
    y_samples_obs = y_samples
    y_test_obs = y_test
    
    # Small constant to prevent inf for 0s
    c = 1e-100
    
    y_test_obs += c
    y_samples_obs += c

    fits = []
    for i in range(len(y_test_obs)):
        # Calcuate the two entropy measures
        e = entropy(y_test_obs.iloc[i], y_samples_obs[i])
        js = jensenshannon(y_test_obs.iloc[i], y_samples_obs[i])
        ent.append(e if e != np.inf else 1000)
        js_list.append(js)
        # Add the index and the two entropy measures to an array to be used for plotting later
        fits.append((i, ent, js))
    #print("Entropy")
    #display(pd.DataFrame(ent).describe())
    print("Jensen-Shannon")
    display(pd.DataFrame(js_list).describe().apply(lambda s: s.apply('{0:.4f}'.format)))
    return fits, js_list

def fit_for_model(filename, X_test, y_test):
    """ Loads and fits model from file. Evaluates against X,y test"""
    
    rf = load(filename)
    preds = rf.predict(X_test)
    
    # Renormalize samples
    preds_normalized = []
    for s in preds:
        preds_normalized.append(np.divide(s,np.sum(s)))
        
    preds_df = pd.DataFrame(preds_normalized, columns=[f'Output_Bin_{i}' for i in range(bin_count)])
    preds_df.to_csv("preds_small_model.csv")
    
    return evaluate_fit(preds_normalized, y_test), preds_normalized

In [2]:
from joblib import dump, load
import numpy as np
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon

model_name = '/project/SDS-capstones-kropko21/uva-astronomy-models/rf-model-large.joblib'
m1_fit, model_1_preds = fit_for_model(model_name, X_test, y_test)
fits, model_1_js_list = m1_fit

  from numpy.core.umath_tests import inner1d


Jensen-Shannon


Unnamed: 0,0
count,142330.0
mean,0.0803
std,0.1214
min,0.0
25%,0.0052
50%,0.0195
75%,0.1081
max,0.8159


## Calculate Mode Bin Difference

In [3]:
#display(np.argmax(y_test.to_numpy(), axis=1) - np.argmax(model_1_preds, axis=1))
err_fits_arr = []
bin_error = np.abs(np.argmax(y_test.to_numpy(), axis=1) - np.argmax(model_1_preds, axis=1))


In [5]:
def loess2(xvals, yvals, data, alpha, poly_degree=1):
    all_data = sorted(zip(data[xvals].tolist(), data[yvals].tolist()), key=lambda x: x[0])
    xvals, yvals = zip(*all_data)
    evalDF = pd.DataFrame(columns=['v','g'])
    n = len(xvals)
    m = n + 1
    q = int(np.floor(n * alpha) if alpha <= 1.0 else n)
    avg_interval = ((max(xvals)-min(xvals))/len(xvals))
    v_lb = min(xvals)-(.5*avg_interval)
    v_ub = (max(xvals)+(.5*avg_interval))
    v = enumerate(np.linspace(start=v_lb, stop=v_ub, num=m), start=1)
    xcols = [np.ones_like(xvals)]
    for j in range(1, (poly_degree + 1)):
        xcols.append([i ** j for i in xvals])
    X = np.vstack(xcols).T
    for i in v:
        iterpos = i[0]
        iterval = i[1]
        iterdists = sorted([(j, np.abs(j-iterval)) for j in xvals], key=lambda x: x[1])
        _, raw_dists = zip(*iterdists)
        scale_fact = raw_dists[q-1]
        scaled_dists = [(j[0],(j[1]/scale_fact)) for j in iterdists]
        weights = [(j[0],((1-np.abs(j[1]**3))**3 if j[1]<=1 else 0)) for j in scaled_dists]
        _, weights      = zip(*sorted(weights,     key=lambda x: x[0]))
        _, raw_dists    = zip(*sorted(iterdists,   key=lambda x: x[0]))
        _, scaled_dists = zip(*sorted(scaled_dists,key=lambda x: x[0]))
        W         = np.diag(weights)
        b         = np.linalg.inv(X.T @ W @ X) @ (X.T @ W @ yvals)
        local_est = loc_eval(iterval, b)
        iterDF2   = pd.DataFrame({
                       'v'  :[iterval],
                       'g'  :[local_est]
                       })
        evalDF = pd.concat([evalDF, iterDF2])
    evalDF = evalDF[['v','g']]
    return(evalDF)

df = pd.DataFrame({"Xvalue" : bin_error,
                    "Yvalue" : model_1_js_list
                    })
evalDF = loess2("Xvalue", "Yvalue", data = df, alpha=0.7, poly_degree=2)


LinAlgError: Singular matrix

In [6]:
from tqdm import tqdm
def lowess(y, x, f=2.0 / 3.0, n_iter=3):
    """Lowess smoother (robust locally weighted regression).
    Fits a nonparametric regression curve to a scatterplot.
    Parameters
    ----------
    y, x : np.ndarrays
        The arrays x and y contain an equal number of elements;
        each pair (x[i], y[i]) defines a data point in the
        scatterplot.
    f : float
        The smoothing span. A larger value will result in a
        smoother curve.
    n_iter : int
        The number of robustifying iteration. Thefunction will
        run faster with a smaller number of iterations.
    Returns
    -------
    yest : np.ndarray
        The estimated (smooth) values of y.
    """
    n = len(x)
    r = int(np.ceil(f * n))
    h = np.array([np.sort(np.abs(x - x[i]))[r] for i in range(n)])
    w = np.minimum(1.0, np.maximum(np.abs((x.reshape((-1, 1)) - x.reshape((1, -1))) / h), 0.0))
    w = (1 - w ** 3) ** 3
    yest = np.zeros(n)
    delta = np.ones(n)

    for _ in range(n_iter):
        for i in tqdm(range(n)):
            weights = delta * w[:, i]
            b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
            A = np.array(
                [
                    [np.sum(weights), np.sum(weights * x)],
                    [np.sum(weights * x), np.sum(weights * x * x)],
                ]
            )

            beta = np.linalg.lstsq(A, b)[0]
            yest[i] = beta[0] + beta[1] * x[i]

        residuals = y - yest
        s = np.median(np.abs(residuals))
        #delta = np.clip(residuals / (6.0 * s), -1.0, 1.0)
        delta = np.minimum(1.0, np.maximum(residuals / (6.0 * s), -1.0))
        delta = (1 - delta ** 2) ** 2

    return yest

In [7]:
import matplotlib.pyplot as plt
#from statsmodels.nonparametric.smoothers_lowess import lowess

plt.figure(figsize=(10, 8))
#Lowess cannot fit with large number of ties https://github.com/statsmodels/statsmodels/issues/2449
ys = lowess(model_1_js_list, bin_error)
plt.plot(bin_error,ys,'red',linewidth=1)
#plt.plot(evalDF['v'], evalDF['g'], color='red', linewidth= 3, label="Fit")
plt.scatter(bin_error, model_1_js_list, alpha=0.2)
plt.title("JS Entropy vs Bin Error")
plt.xlabel('Bin Error')
plt.ylabel('Entropy')
plt.show()

MemoryError: Unable to allocate 151. GiB for an array with shape (142330, 142330) and data type int64

<Figure size 720x576 with 0 Axes>

## 2 label prediction

Threshold of 0.2

In [3]:
#two_label = lambda x: 1 if x < 0.2 else 0
classification_two_category = list(map(lambda x: 1 if x < 0.2 else 0, model_1_js_list))

X_two_train, X_two_test, y_two_train, y_two_test = train_test_split(X_test,
                            classification_two_category,
                            test_size=0.1, 
                            random_state=300)

In [23]:
import xgboost as xgb

xgb_two_model = xgb.XGBClassifier(objective="binary:logistic", 
                              random_state=42, max_depth=20, scale_pos_weight=10)
xgb_two_model.fit(X_two_train, y_two_train,
             eval_set=[(X_two_train, y_two_train)],
             eval_metric='error',verbose=True, early_stopping_rounds=5)

Parameters: { n_trees } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-error:0.01478
Will train until validation_0-error hasn't improved in 5 rounds.
[1]	validation_0-error:0.01044
[2]	validation_0-error:0.00742
[3]	validation_0-error:0.00574
[4]	validation_0-error:0.00435
[5]	validation_0-error:0.00347
[6]	validation_0-error:0.00255
[7]	validation_0-error:0.00191
[8]	validation_0-error:0.00155
[9]	validation_0-error:0.00125
[10]	validation_0-error:0.00102
[11]	validation_0-error:0.00093
[12]	validation_0-error:0.00076
[13]	validation_0-error:0.00064
[14]	validation_0-error:0.00055
[15]	validation_0-error:0.00045
[16]	validation_0-error:0.00038
[17]	validation_0-error:0.00032
[18]	validation_0-error:0.00027
[19]	validation_0-error:0.00023
[20]	validation_0-erro

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=0, n_trees=250, num_parallel_tree=1,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [7]:
preds_two = xgb_two_model.predict(X_two_test)

# Count the number of matches between predictions and labels
correct_two = np.sum(preds_two == y_two_test)

# Calculate accuracy
accuracy_two = correct_two / len(y_two_test)
print(f"Accuracy: {accuracy_two}")
xgb_two_model.save_model('pred_quality_two_class.model')

Accuracy: 0.9799058525960795


  if diff:


In [8]:
import xgboost as xgb
from sklearn.metrics import confusion_matrix

quality_model = xgb.XGBClassifier()
quality_model.load_model('pred_quality_two_class.model')
preds_two = quality_model.predict(X_two_test)
display(confusion_matrix(y_two_test, preds_two))

  if diff:


array([[ 1833,   132],
       [  134, 12134]])

## 3 label prediction

Thresholds of 0.2 and 0.3

In [25]:
classification_three_category = list(map(lambda x: 2 if x < 0.2 else 1 if x < 0.3 else 0, model_1_js_list))

X_three_train, X_three_test, y_three_train, y_three_test = train_test_split(X_test,
                            classification_three_category,
                            test_size=0.1, 
                            random_state=300)

In [36]:
display(len(X_three_train))
display(len(y_three_train))
xgb_three_model = xgb.XGBClassifier(objective="multi:softmax", 
                              random_state=42, num_class=3, max_depth=25, learning_rate=0.2, n_estimators=200)
xgb_three_model.fit(X_three_train, y_three_train,
             eval_set=[(X_three_train, y_three_train)],
             eval_metric='mlogloss',verbose=True, early_stopping_rounds=5)

128097

128097

[0]	validation_0-mlogloss:0.83822
Will train until validation_0-mlogloss hasn't improved in 5 rounds.
[1]	validation_0-mlogloss:0.65716
[2]	validation_0-mlogloss:0.52361
[3]	validation_0-mlogloss:0.42200
[4]	validation_0-mlogloss:0.34295
[5]	validation_0-mlogloss:0.28056
[6]	validation_0-mlogloss:0.23081
[7]	validation_0-mlogloss:0.19091
[8]	validation_0-mlogloss:0.15858
[9]	validation_0-mlogloss:0.13239
[10]	validation_0-mlogloss:0.11091
[11]	validation_0-mlogloss:0.09338
[12]	validation_0-mlogloss:0.07908
[13]	validation_0-mlogloss:0.06728
[14]	validation_0-mlogloss:0.05760
[15]	validation_0-mlogloss:0.04951
[16]	validation_0-mlogloss:0.04273
[17]	validation_0-mlogloss:0.03710
[18]	validation_0-mlogloss:0.03242
[19]	validation_0-mlogloss:0.02852
[20]	validation_0-mlogloss:0.02525
[21]	validation_0-mlogloss:0.02251
[22]	validation_0-mlogloss:0.02019
[23]	validation_0-mlogloss:0.01818
[24]	validation_0-mlogloss:0.01651
[25]	validation_0-mlogloss:0.01511
[26]	validation_0-mlogloss:0.013

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.2, max_delta_step=0, max_depth=25,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=200, n_jobs=0, num_class=3, num_parallel_tree=1,
       objective='multi:softprob', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1,
       tree_method='exact', validate_parameters=1, verbosity=None)

In [39]:
from sklearn.metrics import confusion_matrix
preds_three = xgb_three_model.predict(X_three_test)

# Count the number of matches between predictions and labels
correct_three = np.sum(preds_three == y_three_test)

# Calculate accuracy
accuracy_three = correct_three / len(y_three_test)
print(f"Accuracy: {accuracy_three}")
display(confusion_matrix(y_three_test, preds_three, labels=[0, 1, 2]))
xgb_three_model.save_model('pred_quality_three_class.model')

Accuracy: 0.9648001124148107


  if diff:


array([[  906,   114,    19],
       [  131,   656,   139],
       [   11,    87, 12170]])