In [None]:
# DATA CLEANING METHODS

In [2]:
from imblearn.under_sampling import RandomUnderSampler

# Takes a pandas DataFrame and the name of its class column.
# If the classes are imbalanced, balances them by random undersampling. Prints an informative message.

def get_balanced_set_random(df, class_key, min_class_ratio=0.55):
    """Balance ``df`` by random undersampling when its classes are too uneven.

    The frame counts as imbalanced when its rarest class has fewer than
    ``min_class_ratio`` times the samples of its most frequent class.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing the class column. It is never modified.
    class_key : hashable
        Label of the column holding the class of each sample.
    min_class_ratio : float, default 0.55
        Smallest accepted ratio between the rarest and the most frequent class.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no balancing is needed. Otherwise a new frame built
        from the rows returned by ``RandomUnderSampler(random_state=0)``, with
        the same column order as ``df``.

    Notes
    -----
    Prints a one-line message saying whether balancing took place.
    """
    counts = df[class_key].value_counts()

    if not counts.min() < counts.max() * min_class_ratio:
        print("The set did not require balancing.")
        return df

    labels = df[class_key]
    feats = df.drop(columns=class_key)
    # Assumes the sampler hands back pandas objects for pandas input.
    feats_res, labels_res = RandomUnderSampler(random_state=0).fit_resample(
        feats, labels
    )

    # Build the result from what the sampler returned. Writing the labels back
    # into ``df`` would rely on index alignment: when the sampler resets the
    # index the labels end up on the wrong rows, and the caller's frame is
    # modified as a side effect.
    balanced = feats_res.copy()
    balanced[class_key] = (
        labels_res.to_numpy() if hasattr(labels_res, "to_numpy") else labels_res
    )
    balanced = balanced[list(df.columns)]  # restore the original column order

    print("The dataset has been balanced through random undersampling.")
    return balanced


# Receives a pandas DataFrame and an optional flag.
# Prepares the dataset for the training algorithms.
# If autobalance=True, balances the dataset according to its last column.

from sklearn.preprocessing import MinMaxScaler


def prepare_data(df, autobalance=False):
    """Clean, optionally balance, and min-max scale a data set.

    The last column of ``df`` is treated as the class column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data set. It is not modified: duplicates and rows with missing
        values are removed from a derived frame, not in place.
    autobalance : bool, default False
        When True, the cleaned frame is passed through
        ``get_balanced_set_random`` using the last column as class.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df`` and a fresh default index.
        Every column, the class column included, is transformed by
        ``MinMaxScaler``.

    Notes
    -----
    Prints the number of samples in the final data set.
    """
    keys = df.keys()
    class_key = keys[-1]

    # Chained, non-in-place cleaning keeps the caller's frame intact and makes
    # the function safe to call twice on the same input.
    cleaned = df.drop_duplicates(keep="first").dropna(axis=0, how="any")

    if autobalance:
        cleaned = get_balanced_set_random(cleaned, class_key)

    scaled = pd.DataFrame(MinMaxScaler().fit_transform(cleaned), columns=keys)

    print(f"The final dataset contains {len(scaled)} samples")

    return scaled


In [3]:
#DATA VISUALIZATION

In [None]:
def density_plot(df, gen_title, class_key=None):
    """Draw one per-class density plot for each feature on a 2x4 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set with the feature columns and the class column.
    gen_title : str
        Currently unused; only referenced by the disabled ``savefig`` call.
    class_key : hashable, optional
        Label of the class column used as ``hue``. Defaults to the last column
        of ``df``.

    Notes
    -----
    At most the first eight columns are plotted (the grid has eight cells);
    the class column itself is skipped. A dashed vertical line marks the mean
    of each feature.
    """
    if class_key is None:
        class_key = df.columns[-1]

    feature_names = [name for name in df.columns[:8] if name != class_key]

    plt.figure(figsize=(11, 5))

    for position, feature in enumerate(feature_names, start=1):
        plt.subplot(2, 4, position)
        sns.kdeplot(data=df, x=feature, hue=class_key, fill=True, common_norm=False)

        plt.title(feature)
        mean_value = df[feature].mean()
        # The feature lives on the x-axis of a density plot, so its mean has
        # to be a vertical line; a horizontal one would sit on the density axis.
        plt.axvline(
            mean_value,
            linestyle="dashed",
            label="Mean value = " + str(round(mean_value, 2)),
        )

    # Layout applies to the whole figure, so it only needs to be set once.
    plt.subplots_adjust(top=0.85, bottom=0.01, hspace=0.5, wspace=0.4)

    # plt.savefig(f"{gen_title}.jpg")
    plt.show()


def strip_plot(df, gen_title, class_key=None):
    """Draw one strip plot (class vs. feature value) per feature on a 2x4 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set with the feature columns and the class column.
    gen_title : str
        Currently unused; only referenced by the disabled ``savefig`` call.
    class_key : hashable, optional
        Label of the class column, used for the x-axis and as ``hue``.
        Defaults to the last column of ``df``.

    Notes
    -----
    At most the first eight columns are plotted (the grid has eight cells);
    the class column itself is skipped. A dashed horizontal line marks the
    mean of each feature.
    """
    if class_key is None:
        class_key = df.columns[-1]

    feature_names = [name for name in df.columns[:8] if name != class_key]

    plt.figure(figsize=(11, 5))

    for position, feature in enumerate(feature_names, start=1):
        plt.subplot(2, 4, position)
        sns.stripplot(data=df, x=class_key, y=feature, hue=class_key)

        plt.title(feature)
        mean_value = df[feature].mean()
        # Feature values are on the y-axis here, so the mean is horizontal.
        plt.axhline(
            mean_value,
            linestyle="dashed",
            label="Mean value = " + str(round(mean_value, 2)),
        )
        plt.legend(loc="best")

    # Layout applies to the whole figure, so it only needs to be set once.
    plt.subplots_adjust(top=0.85, bottom=0.01, hspace=0.5, wspace=0.4)

    # plt.savefig(f"{gen_title}.jpg")
    plt.show()


def violin_plot(df, gen_title, class_key=None):
    """Draw one violin plot (class vs. feature value) per feature on a 2x4 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set with the feature columns and the class column.
    gen_title : str
        Currently unused; only referenced by the disabled ``savefig`` call.
    class_key : hashable, optional
        Label of the class column, used for the x-axis. Defaults to the last
        column of ``df``.

    Notes
    -----
    At most the first eight columns are plotted (the grid has eight cells);
    the class column itself is skipped. A dashed horizontal line marks the
    mean of each feature.
    """
    if class_key is None:
        class_key = df.columns[-1]

    feature_names = [name for name in df.columns[:8] if name != class_key]

    plt.figure(figsize=(11, 5))

    for position, feature in enumerate(feature_names, start=1):
        plt.subplot(2, 4, position)
        sns.violinplot(data=df, x=class_key, y=feature)

        plt.title(feature)
        mean_value = df[feature].mean()
        # Feature values are on the y-axis here, so the mean is horizontal.
        plt.axhline(
            mean_value,
            linestyle="dashed",
            label="Mean value = " + str(round(mean_value, 2)),
        )
        plt.legend(loc="best")

    # Layout applies to the whole figure, so it only needs to be set once.
    plt.subplots_adjust(top=0.85, bottom=0.01, hspace=0.5, wspace=0.4)

    # plt.savefig(f"{gen_title}.jpg")
    plt.show()

In [59]:
#DATA BALANCING METHODS

In [60]:
#Undersampling method

In [61]:
# Accepts an undersampler (imblearn-style, exposing fit_resample), a pandas DataFrame, and a column name
# Applies the sampler to the DataFrame once, using that column as the class label
# Returns the resulting DataFrame


def single_run_undersampling(sampler, df, class_key):
    """Apply ``sampler`` once to ``df`` and return the resampled data set.

    Parameters
    ----------
    sampler : object
        Any object exposing ``fit_resample(X, y)``. It is expected to return
        a DataFrame for the features when given one.
    df : pandas.DataFrame
        Data set containing the class column. It is not modified.
    class_key : hashable
        Label of the column holding the class of each sample.

    Returns
    -------
    pandas.DataFrame
        New frame with the rows returned by the sampler, in the column order
        of ``df``. The index is whatever the sampler returned.
    """
    labels = df[class_key]
    feats = df.drop(columns=class_key)

    feats_res, labels_res = sampler.fit_resample(feats, labels)

    # Assemble the result from the sampler output. Writing the labels back
    # into ``df`` by index (and dropping the rows left as NaN) silently keeps
    # the wrong rows whenever the sampler returns a reset index.
    resampled = feats_res.copy()
    resampled[class_key] = (
        labels_res.to_numpy() if hasattr(labels_res, "to_numpy") else labels_res
    )

    return resampled[list(df.columns)]


# Accepts an undersampler (imblearn-style, exposing fit_resample), a pandas DataFrame, and a column name
# Recursively applies the sampler to the DataFrame until the stopping condition is met
# Returns the resulting DataFrame


def get_balanced_set_undersampling(
    sampler, df, class_key, accepted_imbalance_ratio=0.20
):
    """Apply ``sampler`` repeatedly until the class imbalance is acceptable.

    The imbalance ratio is ``(max - min) / (max + min)`` over the class
    counts. While it exceeds ``accepted_imbalance_ratio`` the sampler is run
    once more through ``single_run_undersampling``.

    Parameters
    ----------
    sampler : object
        Undersampler passed through to ``single_run_undersampling``.
    df : pandas.DataFrame
        Data set containing the class column.
    class_key : hashable
        Label of the column holding the class of each sample.
    accepted_imbalance_ratio : float, default 0.20
        Largest imbalance ratio that is considered balanced.

    Returns
    -------
    pandas.DataFrame
        The first frame in the sequence that satisfies one of:

        * its imbalance ratio is acceptable;
        * the run that produced it made the class counts exactly equal;
        * another run would overshoot (the previous frame is returned);
        * another run removes no rows (the previous frame is returned).
    """
    while True:
        counts = df[class_key].value_counts()
        imbalance_ratio = (counts.max() - counts.min()) / (
            counts.max() + counts.min()
        )

        # Written as "not >" so an empty frame (NaN ratio) is returned as is.
        if not imbalance_ratio > accepted_imbalance_ratio:
            return df

        # A copy is handed over so the current frame survives for the
        # "return the previous frame" branches below.
        new_df = single_run_undersampling(sampler, df.copy(), class_key)
        new_counts = new_df[class_key].value_counts()

        if new_counts.max() == new_counts.min():
            return new_df

        # The run cut the largest class down to (or well below) the size of
        # the former smallest class: keep the frame from before the run.
        if (
            counts.min() >= new_counts.max() * (1 + accepted_imbalance_ratio)
            or new_counts.max() == counts.min()
        ):
            return df

        # Cleaning-style samplers eventually stop removing rows. Without this
        # guard the loop (formerly a recursion) would never terminate.
        if len(new_df) >= len(df):
            return df

        df = new_df


In [69]:
# Accepts an oversampler (imblearn-style, exposing fit_resample), a pandas DataFrame, and a column name
# Balances the DataFrame by applying the oversampler
# Returns a pandas DataFrame


def get_balanced_set_oversampling(sampler, df, class_key):  # change name
    """Balance ``df`` by applying an oversampler once.

    Parameters
    ----------
    sampler : object
        Any object exposing ``fit_resample(X, y)``.
    df : pandas.DataFrame
        Data set containing the class column. It is not modified.
    class_key : hashable
        Label of the column holding the class of each sample.

    Returns
    -------
    pandas.DataFrame
        New frame with the resampled rows, in the column order of ``df``.
    """
    labels = df[class_key]
    feats = df.drop(columns=class_key)

    feats_res, labels_res = sampler.fit_resample(feats, labels)

    # Column names come from ``df`` itself rather than a notebook-level
    # ``keys`` variable, so the function works on a fresh kernel and on any
    # frame. Labels are attached by position, matching the sampler's row order.
    balanced = pd.DataFrame(feats_res, columns=feats.columns).copy()
    balanced[class_key] = pd.Series(labels_res).to_numpy()

    return balanced[list(df.columns)]

In [62]:
#PERFORMANCE VISUALIZATION METHODS

In [13]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Accepts a list of values v as returned by cross_classifier, and a sampler name
# Shows a plot of the confusion matrix with a title


def my_cross_confussion_matrix(
    values, sampler_name, display_labels=("false pulsar", "pulsar")
):
    """Plot a confusion matrix with a title naming sampler and classifier.

    Parameters
    ----------
    values : sequence
        ``values[0]`` is the confusion matrix, ``values[1]`` the classifier
        name and ``values[2]`` the parameter value it was built with.
    sampler_name : str
        Name of the sampler, shown in the title.
    display_labels : sequence of str, default ("false pulsar", "pulsar")
        Tick labels for the classes, in matrix order.
    """
    matrix, classifier_name, parameter_value = values[0], values[1], values[2]

    # The two sentences of the title are separated by a space; joining them
    # directly used to render as "...sampler X.From Y...".
    name_graph = (
        f"Best result for sampler {sampler_name}. "
        f"From {classifier_name} with parameter {parameter_value}"
    )

    cmd = ConfusionMatrixDisplay(matrix, display_labels=list(display_labels))
    cmd.plot()

    cmd.ax_.set(title=name_graph, xlabel="Predicted", ylabel="Actual")
    #    plt.savefig(f'{name_graph}.jpg')
    plt.show()

In [64]:
#CROSS VALIDATION

In [65]:
#Performance comparison 

In [12]:
# Accepts two pandas DataFrames: df, the original DataFrame, and df_bal, a balanced version of df
# Obtains the elements that were deleted from df to create df_bal
# Returns the attributes and labels of those elements as two ordered arrays


def calculate_pure_test_values(df, df_bal, class_key=None):
    """Return the samples of ``df`` that are absent from ``df_bal``.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Original data set. When None there is nothing to compare against and
        ``(None, None)`` is returned.
    df_bal : pandas.DataFrame
        Balanced version of ``df``; must contain every column of ``df``.
    class_key : hashable, optional
        Label of the class column. Defaults to the last column of ``df``.

    Returns
    -------
    tuple
        ``(x_pure_test, y_pure_test)``: the feature rows and the labels of the
        rows of ``df`` with no identical row in ``df_bal``, as NumPy arrays in
        the row order of ``df``; or ``(None, None)``.
    """
    if df is None:
        return None, None

    if class_key is None:
        class_key = df.columns[-1]

    cols = list(df.columns)

    # Left anti-join: keep the rows of ``df`` that have no match in
    # ``df_bal``. A symmetric "concat + drop every duplicate" would also
    # return rows that exist only in ``df_bal`` (for example synthetic samples
    # created by an oversampler), which were never removed from ``df``.
    marked = df.merge(
        df_bal[cols].drop_duplicates(), how="left", on=cols, indicator=True
    )
    df_pure_test = marked.loc[marked["_merge"] == "left_only", cols]

    y_pure_test = df_pure_test[class_key].to_numpy()
    x_pure_test = df_pure_test.drop(columns=class_key).to_numpy()

    return x_pure_test, y_pure_test

# TODO: return here (revisit this function)
# Accepts a 2D array, a 1D array of labels, two arrays with extra test cases, the fitted estimators, their test indices and a flag
# Calculates the confusion matrix over the predictions of all cross-validation runs
# Returns the score according to our scoring system and the corresponding confusion matrix
def cm_all_runs_crosval(x, y, x_pt, y_pt,estimators,indices_test, flag):
    """Score a set of fitted fold estimators with one pooled confusion matrix.

    For fold ``n`` the rows ``indices_test[n]`` of ``x``/``y`` form the test
    set; when both ``x_pt`` and ``y_pt`` are given they are appended to the
    test set of every fold. The predictions of ``estimators[n]`` on all folds
    are pooled and turned into a confusion matrix normalised over all cells.

    Returns
    -------
    tuple
        ``(score, cm)``. With ``flag == 0`` the score is flattened cell 2 of
        ``cm``; otherwise it is cell 1 plus five times cell 2. For a 2x2
        matrix those cells are the false-positive (1) and false-negative (2)
        rates, so false negatives weigh more.
    """
    use_extra = (x_pt is not None) and (y_pt is not None)

    # Seeded with an empty array so the pooled dtype is promoted exactly as
    # it would be when growing an initially empty array fold by fold.
    truth_chunks = [np.array([])]
    predicted_chunks = [np.array([])]

    for fold, estimator in enumerate(estimators):
        fold_x = np.take(x, indices_test[fold], axis=0)
        fold_y = np.take(y, indices_test[fold], axis=0)

        if use_extra:
            fold_x = np.concatenate((fold_x, x_pt), axis=0)
            fold_y = np.concatenate((fold_y, y_pt), axis=0)

        predicted_chunks.append(estimator.predict(fold_x))
        truth_chunks.append(fold_y)

    cm = confusion_matrix(
        np.concatenate(truth_chunks, 0),
        np.concatenate(predicted_chunks, 0),
        normalize='all',
    )

    cells = cm.flatten()
    if flag == 0:
        score = cells[2]
    else:
        # give more weight to false negatives
        score = cells[1] + 5 * cells[2]

    return score, cm

In [13]:
#Running classifiers through cross_validate

In [14]:
# Runs cross-validation trials for a single classifier and parameter
# Returns the score and confusion matrix


def run_cross(df_bal, classifier, flag, df, cv=4):
    """Cross-validate ``classifier`` on ``df_bal`` and score the pooled folds.

    Parameters
    ----------
    df_bal : pandas.DataFrame
        Balanced data set used for training and testing; its last column is
        taken as the class column.
    classifier : estimator
        Unfitted estimator handed to ``cross_validate``.
    flag : int
        Scoring mode forwarded to ``cm_all_runs_crosval``.
    df : pandas.DataFrame or None
        Original data set, forwarded to ``calculate_pure_test_values`` to get
        extra test samples.
    cv : int, default 4
        Number of folds passed to ``cross_validate``.

    Returns
    -------
    tuple
        ``(score, cm)`` as returned by ``cm_all_runs_crosval``.
    """
    # Taken from the frame being processed rather than a notebook-level
    # ``keys`` variable, so the function does not depend on hidden state.
    class_key = df_bal.columns[-1]

    y = df_bal[class_key].to_numpy()
    x = df_bal.drop(columns=class_key).to_numpy()

    x_pt, y_pt = calculate_pure_test_values(df, df_bal)

    # Only the fitted estimators and the test indices are used afterwards, so
    # training scores are not requested.
    crossed_results = cross_validate(
        classifier,
        x,
        y,
        cv=cv,
        return_estimator=True,
        return_indices=True,
    )

    return cm_all_runs_crosval(
        x,
        y,
        x_pt,
        y_pt,
        crossed_results["estimator"],
        crossed_results["indices"]["test"],
        flag,
    )


# Creates a classifier of the given type for each element of the parameter list
# Returns the score and confusion matrix of the best-performing option


def run_classifier(
    df_bal,
    classifier_cons,
    classifier_name,
    parameter,
    flag,
    df=None,
):
    """Try every candidate value of one hyper-parameter and keep the best.

    Parameters
    ----------
    df_bal : pandas.DataFrame
        Balanced data set forwarded to ``run_cross``.
    classifier_cons : callable
        Classifier constructor, called as ``classifier_cons(**{name: value})``.
    classifier_name : str
        Name stored alongside the result.
    parameter : sequence
        ``parameter[0]`` is the keyword name, ``parameter[1]`` the iterable of
        candidate values.
    flag : int
        Scoring mode forwarded to ``run_cross``.
    df : pandas.DataFrame, optional
        Original data set forwarded to ``run_cross``.

    Returns
    -------
    tuple
        ``(score, [cm, classifier_name, value])`` for the candidate with the
        lowest score. When several candidates tie, the last one tried wins.

    Raises
    ------
    ValueError
        If ``parameter[1]`` contains no candidate values.
    """
    # print(classifier_name)

    kw, candidates = parameter[0], parameter[1]

    best_score = None
    best_result = None

    for p in candidates:
        classifier = classifier_cons(**{kw: p})
        score, cm = run_cross(df_bal, classifier, flag, df)

        # "<=" keeps the most recent candidate on ties, which is what storing
        # results in a score-keyed mapping and taking its minimum would do.
        if best_result is None or score <= best_score:
            best_score = score
            best_result = [cm, classifier_name, p]

    if best_result is None:
        raise ValueError(
            f"No candidate values supplied for parameter '{kw}' of {classifier_name}"
        )

    return best_score, best_result
