In [None]:
# inputs:
#    X: pandas.DataFrame, features
#    y: pandas.Series, target variable
#    k: number of features to select
def min_redun_max_relev(X, y, k):
    """Greedy minimum-redundancy / maximum-relevance (mRMR) feature selection.

    At every step the candidate with the highest FCQ score is picked, where

        score(f) = F(f) / mean(|corr(f, s)| for s in already-selected features)

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; each column is a candidate feature.
    y : pandas.Series
        Target variable, forwarded unchanged to ``f_regression``.
    k : int
        Number of features to select. Values larger than the number of
        columns in ``X`` are clamped to that number; ``k <= 0`` selects nothing.

    Returns
    -------
    tuple
        ``(scores, selected, scores_ith, score_df, relevancy, redundancy)``:

        * ``scores``     - list of ``(feature, score)`` for the winner of each step
        * ``selected``   - list of selected feature names, in selection order
        * ``scores_ith`` - list of Series: the score of every candidate at each step
        * ``score_df``   - DataFrame view of ``scores`` with columns
          ``'mRMR'`` and ``'Highest_score_each_iteration'``
        * ``relevancy``  - list of Series: F-statistic of every candidate at each step
        * ``redundancy`` - list of Series: mean absolute correlation of every
          candidate with the selected set at each step

    Notes
    -----
    ``f_regression`` is not defined in this cell; it is presumably
    ``sklearn.feature_selection.f_regression`` imported elsewhere in the
    notebook (TODO confirm). Only element ``[0]`` of its return value is used.
    """
    # Lower bound used everywhere a correlation could be zero or undefined,
    # so the division in the score never hits 0 or NaN.
    floor = .00001

    # relevance of every feature, and a correlation matrix pre-filled with the floor
    F = pd.Series(f_regression(X, y)[0], index=X.columns)
    corr = pd.DataFrame(floor, index=X.columns, columns=X.columns)

    # features picked so far / features still available
    selected = []
    not_selected = X.columns.to_list()

    # per-iteration bookkeeping returned to the caller
    scores = []
    scores_ith = []
    redundancy = []
    relevancy = []

    # Never run more rounds than there are candidates: an extra round would
    # call argmax on an empty Series and raise.
    for i in range(min(k, len(not_selected))):
        # Lazily fill the correlation column of the most recently selected
        # feature; only rows of still-excluded features are ever read later.
        if i > 0:
            last_selected = selected[-1]
            corr.loc[not_selected, last_selected] = (
                X[not_selected].corrwith(X[last_selected]).abs().clip(floor)
            )

        # FCQ score (F-statistic / mean correlation) for every excluded feature.
        # On the first round `selected` is empty, the mean is NaN, and the floor
        # is substituted, so the ranking is by F-statistic alone.
        relevance_i = F.loc[not_selected]
        redundancy_i = corr.loc[not_selected, selected].mean(axis=1).fillna(floor)
        score = relevance_i / redundancy_i

        relevancy.append(relevance_i)
        redundancy.append(redundancy_i)
        scores_ith.append(score)

        # move the best candidate from not_selected to selected
        best = score.index[score.argmax()]
        selected.append(best)
        not_selected.remove(best)

        scores.append((best, score[best]))

    # Built once, after the loop, so it also exists when no round was executed.
    score_df = pd.DataFrame(scores, columns=['mRMR', 'Highest_score_each_iteration'])
    return scores, selected, scores_ith, score_df, relevancy, redundancy
    

In [2]:
def feature_extraction(data):
    """Build a table of first-order and GLCM texture features, one row per image.

    Parameters
    ----------
    data : iterable
        Images to process. Each element is passed to ``fos(image, None)`` and
        ``glcm_features(image, ignore_zeros=True)``. The iterable is consumed
        in a single pass, so generators are supported.

    Returns
    -------
    pandas.DataFrame
        One row per image, labelled ``"Image_1"``, ``"Image_2"``, ... in input
        order, and one column per feature: the 16 first-order labels followed
        by the 14 GLCM labels listed below.

    Raises
    ------
    ValueError
        Raised by pandas if the combined feature vector of an image does not
        have exactly one value per label.

    Notes
    -----
    ``fos`` and ``glcm_features`` are not defined in this cell (presumably the
    pyfeats functions imported elsewhere - TODO confirm). Element ``[0]`` of
    each return value is taken as the feature vector and is assumed to be
    ordered like the corresponding label list below.
    """
    labels_fof = ["Mean","Variance","Median","Mode","Skewness",
              "Kurtosis","Energy","EntropyFOS","MinimalGrayLevel",
              "MaximalGrayLevel","CoefficientOfVariation",
              "10Percentile","25Percentile","75Percentile",
              "90Percentile","HistogramWidth"]

    labels_glcm = ["ASM", "Contrast", "Correlation",
              "SumOfSquaresVariance", "InverseDifferenceMoment",
               "SumAverage", "SumVariance", "SumEntropy",
               "EntropyGLCM", "DifferenceVariance",
               "DifferenceEntropy", "Information1",
               "Information2", "MaximalCorrelationCoefficient"]

    # Compute both feature families in ONE pass over `data`; iterating twice
    # would leave the second family empty for one-shot iterables.
    per_image = {}
    for i, image in enumerate(data):
        fos_values = fos(image, None)[0]
        glcm_values = glcm_features(image, ignore_zeros=True)[0]
        per_image["Image_" + str(i+1)] = list(fos_values) + list(glcm_values)

    # Rows are feature labels, columns are images. The label axis carries an
    # empty-string name, which becomes the column-axis name after transposing.
    feature_index = pd.Index(labels_fof + labels_glcm, name='')
    df = pd.DataFrame(per_image, index=feature_index)

    # Transpose so that every image is a row and every feature a column
    df_T = df.T

    # Make sure the row index (image names) carries no name
    df_T.index.name = None

    return df_T