In [None]:
# Load IPython's autoreload extension and enable mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload

%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Machine-specific folder that holds the pickled datasets.
file_place = r"C:\Users\Conor\DataSets"

# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
breast_pickle_path = file_place + r"\Breast_data_raw"
Breast_DF = pd.read_pickle(breast_pickle_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
Breast_DF.head()

Create the D_Matrix by first encoding the desired categorical labels to numbers.

# Preprocessing

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, RobustScaler

# Preprocessing pipeline with a single stage: RobustScaler.
steps = [("RobustScaler", RobustScaler())]
pipe = Pipeline(steps=steps)

In [None]:
import Tissue_Analysis_Tools as TAT

# Number of PCA components handed to TAT.clean_spectra.
PCA_NR_Components = 50

# Project helper; called with a (1340, 1490) `paraffin` range and `balance` set to "Type".
dataframe = TAT.process_data(
    Breast_DF,
    paraffin=(1340, 1490),
    balance="Type",
)

PCA_reduced = TAT.clean_spectra(dataframe, PCA_NR_Components)

# Fit the preprocessing pipeline on the cleaned spectra and transform them.
values = pipe.fit_transform(PCA_reduced)

# NOTE(review): reusing dataframe.index/columns assumes clean_spectra and the
# pipeline keep the original row and column count -- confirm.
Breast_DF_P = pd.DataFrame(
    data=values,
    index=dataframe.index,
    columns=dataframe.columns,
)

# Analysis

In [None]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

le = LabelEncoder()  # not used in this cell
lb = LabelBinarizer()

# Fixed seed so the random subsample (and every result derived from it) is
# reproducible across kernel restarts.
SAMPLE_SEED = 0
N_SAMPLES = 1000

# Cap the request at the number of available rows: DataFrame.sample raises
# when asked for more rows than exist (sampling is without replacement).
DATA = Breast_DF_P.sample(min(N_SAMPLES, len(Breast_DF_P)), random_state=SAMPLE_SEED)

# Name of the column (after reset_index) that holds the class labels.
y_label = "Type"

X = DATA
Y = DATA.reset_index()[y_label]

# Binarised labels; later cells index this as Y_b[:, i], one column per class.
Y_b = lb.fit_transform(Y)

In [None]:
# Number of sampled rows per "Type" label.
type_column = DATA.reset_index()["Type"]
type_column.value_counts()

In [None]:
from sklearn.metrics import confusion_matrix


def sensitivity_Score(y, y_pred, **kwargs):
    """Return the sensitivity, tp / (tp + fn), of binary predictions.

    Parameters
    ----------
    y : array-like
        True labels; must contain exactly two distinct values.
    y_pred : array-like
        Predicted labels.
    **kwargs
        Accepted and ignored.

    Raises
    ------
    AssertionError
        If ``y`` does not contain exactly two distinct values.
    """
    n_distinct = np.unique(y).size
    assert n_distinct == 2, "Non_binary sensitivity score"

    # Flattened 2x2 confusion matrix unpacks as tn, fp, fn, tp.
    _, _, false_neg, true_pos = confusion_matrix(y, y_pred).ravel()

    return true_pos / (true_pos + false_neg)


def specificity_Score(y, y_pred, **kwargs):
    """Return the specificity, tn / (tn + fp), of binary predictions.

    Parameters
    ----------
    y : array-like
        True labels; must contain exactly two distinct values.
    y_pred : array-like
        Predicted labels.
    **kwargs
        Accepted and ignored.

    Raises
    ------
    AssertionError
        If ``y`` does not contain exactly two distinct values.
    """
    # Message previously read "speificity"; spelling fixed.
    assert np.unique(y).size == 2, "Non_binary specificity score"

    # Flattened 2x2 confusion matrix unpacks as tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

    return tn / (tn + fp)

In [None]:
import xgboost as xgb

In [None]:
from scipy import stats

# Hyper-parameter search space for the randomized search.
# scipy's uniform(loc, scale) spans [loc, loc + scale]; randint(low, high)
# excludes `high`. Plain lists are sampled from as discrete choices.
param_dist = dict(
    n_estimators=stats.randint(50, 500),
    learning_rate=stats.uniform(0.01, 0.07),
    subsample=stats.uniform(0.3, 0.7),
    max_depth=list(range(3, 10)),
    colsample_bytree=stats.uniform(0.5, 0.45),
    min_child_weight=list(range(1, 4)),
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GroupKFold, GroupShuffleSplit, LeavePGroupsOut, LeaveOneGroupOut

In [None]:
# Unique "Core" index values present in the sample, per tissue type.
type_groups = DATA.groupby(level="Type")
{
    type_: np.unique(type_groups.get_group(type_).index.get_level_values("Core"))
    for type_ in ["Normal", "NAT", "Malignant"]
}

In [None]:
from sklearn.metrics import make_scorer, roc_auc_score, precision_score, recall_score, auc, accuracy_score

# Metrics evaluated for every cross-validation split; the keys become the
# suffixes of the cv_results_ columns (e.g. "mean_test_AUC").
scorers = {
    "Accuracy": make_scorer(accuracy_score, greater_is_better=True),
    "Specificity": make_scorer(specificity_Score),
    "Sensitivity": make_scorer(sensitivity_Score, greater_is_better=True),
    # Use the built-in scorer: make_scorer(roc_auc_score) would score the hard
    # predict() labels, which collapses the ROC curve to a single point.
    # "roc_auc" scores the classifier's continuous output instead.
    "AUC": "roc_auc",
}

In [None]:
# Classifier registry; the "XGB" entry is currently an empty tuple.
classifier_dict = dict(XGB=())

In [None]:
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

if __name__ == '__main__':
    
    % time

    n_Folds = 2

    results = dict()

    for i, name in enumerate(np.unique(Y)):

        # Find out the ratio of positive classes to negative to scale positive weights.
        # Allows for compensation of unbalanced classes without throwing away data.

        pos_ratio = 1/(np.sum(Y_b[:,i])/Y_b[:,i].shape[0])

        clf_xgb = xgb.XGBClassifier(objective = "binary:logistic", n_classes = 2, scale_pos_weight = pos_ratio)
        #clf_LDA = LinearDiscriminantAnalysis()

        clf = RandomizedSearchCV(clf_xgb, param_distributions = param_dist
                                 , n_iter = 1, scoring = scorers, refit = "AUC"
                                 , error_score = 0, verbose = 3
                                 , n_jobs = -1, return_train_score = True
                                 , cv = GroupKFold(n_Folds).split(X, Y_b[:,i], DATA.reset_index()["Core"])
                                )

        results[name] = clf.fit(X,Y_b[:,i])

In [None]:
import datetime

# Filesystem-safe, sortable timestamp. The previous "%d/%m/%Y_%H:%M:%S" put
# "/" and ":" into the file name, which cannot be written on Windows.
d = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Stack the cv_results_ table of every fitted search, keyed by class label.
output = pd.concat({label: pd.DataFrame(search.cv_results_) for label, search in results.items()})

save_name = r"C:\Users\Conor\Documents\Projects\Biospec_Analysis\Output\{}_Balanced_Data_{}.csv".format(y_label, d)

output.to_csv(save_name)

In [None]:
# Mean test AUC of the "Malignant" search against two sampled hyper-parameters.
df = pd.DataFrame(results["Malignant"].cv_results_)

# Create the 3D axes with add_subplot; gca(projection=...) was removed in
# Matplotlib 3.6. The old plt.clf() only produced an extra empty figure.
fig = plt.figure()
threedee = fig.add_subplot(111, projection='3d')
threedee.patch.set_facecolor([1,1,1])

# Cast the param_* columns to float in case they arrive as object dtype.
threedee.scatter(df["param_colsample_bytree"].astype(float)
                 , df["param_learning_rate"].astype(float)
                 , df["mean_test_AUC"])
threedee.set_xlabel("colsample_bytree")
threedee.set_ylabel("learning_rate")
threedee.set_zlabel("mean test AUC")
plt.show()