# Load precomputed embeddings

In [None]:
import pandas as pd
def load_features(train_path, test_path):
    """
    Read the precomputed training and testing feature tables from CSV.

    Both files are parsed as comma-separated values, and the 'Class_label'
    column of each resulting table is converted to strings.

    Args:
        train_path (str): path of the CSV file holding the training corpus features (after dimension reduction). Each row contains the precomputed features and the ground truth class label for one document.
        test_path (str): path of the CSV file holding the testing corpus features (after dimension reduction). Each row contains the precomputed features and the ground truth class label for one document.

    Returns:
        df (Dataframe): training corpus features
        df_test (Dataframe): testing corpus features
    """
    def _read_with_str_labels(csv_path):
        # Parse one feature file and cast its label column to str.
        frame = pd.read_csv(csv_path, sep=',')
        frame['Class_label'] = frame['Class_label'].astype(str)
        return frame

    # The training file is read first, then the testing file.
    df, df_test = (_read_with_str_labels(path) for path in (train_path, test_path))
    return df, df_test

# Overall grouping function (TODO)

# Correlation construction, grouping, and feature storage


In [None]:
def correlation_grouping(df, df_test, output_path_prefix, correlation_type='pearson', threshold_quantile=0.9, save_csv=True):
    """
    Group and merge training and testing features based on correlation between the dimensions.

    The pairwise correlation matrix of the training feature columns is
    computed, and the `threshold_quantile` quantile of all its entries is
    used as the cut-off. Walking the dimensions in column order, every
    dimension that has not yet been absorbed by an earlier group becomes the
    representative of a new group made of all dimensions whose correlation
    with it is >= the cut-off. Each group is replaced by the row-wise mean of
    its member columns, stored under the representative's name. The same
    groups (learned on the training data only) are applied to the test data.

    Args:
        df (Dataframe): training corpus features. Must contain a 'Class_label' column, which is excluded from the correlation and copied to the output unchanged.
        df_test (Dataframe): testing corpus features, with the same columns as `df`.
        output_path_prefix (str): prefix to the file path where the output files will be saved ("<prefix>_train.csv" and "<prefix>_test.csv").
        correlation_type (str): method to calculate correlation among the columns. Recognised names are ['pearson', 'kendall', 'biserial'], but grouping is only implemented for 'pearson' and 'kendall'. Default as 'pearson'.
        threshold_quantile (float): quantile of the correlation values used as the threshold above which dimensions are merged. Default as 0.9.
        save_csv (bool): whether to save the output csv files. Default as True.

    Returns:
        group_feats_df_train (Dataframe): training corpus features after merging
        group_feats_df_test (Dataframe): testing corpus features after merging

    Raises:
        ValueError: if `correlation_type` is not one of the recognised names.
        NotImplementedError: if `correlation_type` is 'biserial'. A feature/label
            correlation is a single vector, not a feature-by-feature matrix, so
            no grouping can be derived from it here.
    """
    label_column = 'Class_label'
    supported_types = ('pearson', 'kendall', 'biserial')

    # Explicit validation instead of `assert`, which is stripped under `python -O`.
    if correlation_type not in supported_types:
        raise ValueError(
            f"correlation_type must be one of {list(supported_types)}, got {correlation_type!r}"
        )
    if correlation_type == 'biserial':
        # Fail loudly: this option never produced grouped features, and
        # silently returning None breaks callers that unpack two dataframes.
        raise NotImplementedError(
            "Grouping with point biserial correlation is not implemented. "
            "Please try another compatible correlation method."
        )

    # Correlate the feature columns only. The label must not take part in the
    # grouping (it would otherwise be averaged into features when numeric),
    # and non-numeric columns make DataFrame.corr raise on pandas >= 2.0.
    feature_df = df.drop(columns=label_column).select_dtypes(include=['number', 'bool'])
    corr_df = feature_df.corr(method=correlation_type)

    # The threshold is taken over every entry of the matrix (diagonal and both
    # triangles); NaN entries (e.g. from constant columns) are ignored.
    threshold = corr_df.stack().quantile(threshold_quantile)
    is_related = corr_df >= threshold

    # Greedy pass in column order: a dimension already absorbed by an earlier
    # group does not start its own group. Note that a dimension may still be
    # a member of several groups.
    updated_groups = {}
    dims_included = set()
    for dim, related_row in is_related.iterrows():
        if dim in dims_included:
            continue
        members = list(related_row.index[related_row.to_numpy()])
        if not members:
            # No correlation reaches the threshold (all-NaN row): the
            # dimension neither forms nor joins a group.
            continue
        updated_groups[dim] = members
        dims_included.update(members)

    def _merge_groups(frame):
        # Build a fresh frame (group means followed by the label) instead of
        # writing float means into a slice of the input frame.
        merged = {
            representative: frame[members].mean(axis='columns')
            for representative, members in updated_groups.items()
        }
        merged[label_column] = frame[label_column]
        return pd.DataFrame(merged)

    group_feats_df_train = _merge_groups(df)
    group_feats_df_test = _merge_groups(df_test)

    if save_csv:
        group_feats_df_train.to_csv(f"{output_path_prefix}_train.csv", sep=',', index=False)
        group_feats_df_test.to_csv(f"{output_path_prefix}_test.csv", sep=',', index=False)
    return group_feats_df_train, group_feats_df_test

# K-means clustering, grouping, and feature storage


In [None]:
import sklearn
from sklearn.cluster import KMeans

def kmeans_grouping(df, df_test, output_path_prefix, cluster_number=10, random_state=20, save_csv=True):
    """
    Group and merge training and testing features based on k-means clustering of the dimensions.

    The training feature matrix is transposed so that every feature dimension
    becomes one sample, and the dimensions are clustered with k-means. Each
    non-empty cluster is replaced by the row-wise mean of its member columns,
    stored under the name of the cluster's first member (in column order).
    The same groups (learned on the training data only) are applied to the
    test data.

    Args:
        df (Dataframe): training corpus features. Must contain a 'Class_label' column, which is excluded from the clustering and copied to the output unchanged.
        df_test (Dataframe): testing corpus features, with the same columns as `df`.
        output_path_prefix (str): prefix to the file path where the output files will be saved ("<prefix>_train.csv" and "<prefix>_test.csv").
        cluster_number (int): define the number of dimension clusters. Default as 10.
        random_state (int): random state to start kmeans clustering. Default as 20.
        save_csv (bool): whether to save the output csv files. Default as True.

    Returns:
        group_feats_df_train (Dataframe): training corpus features after merging
        group_feats_df_test (Dataframe): testing corpus features after merging
    """
    label_column = 'Class_label'

    # Select the features by name so the label is never clustered, wherever
    # it sits in the frame (identical to "all but the last column" when the
    # label is the last column).
    feature_df = df.drop(columns=label_column)

    # One sample per feature dimension: rows of the transposed matrix.
    km = KMeans(n_clusters=cluster_number, init='k-means++', random_state=random_state)
    km.fit(feature_df.T.values)

    # Collect the dimensions of each cluster in column order. Only clusters
    # that actually received a dimension get an entry, so a cluster left
    # empty (e.g. with duplicated feature columns) cannot cause an IndexError
    # when its representative is picked.
    cluster_members = {}
    for dim, cluster_id in zip(feature_df.columns, km.labels_):
        cluster_members.setdefault(int(cluster_id), []).append(dim)

    # Output columns follow ascending cluster id; the first member of each
    # cluster lends its name to the merged column.
    updated_groups = {
        members[0]: members
        for _, members in sorted(cluster_members.items())
    }

    def _merge_groups(frame):
        # Build a fresh frame (group means followed by the label) instead of
        # writing float means into a slice of the input frame.
        merged = {
            representative: frame[members].mean(axis='columns')
            for representative, members in updated_groups.items()
        }
        merged[label_column] = frame[label_column]
        return pd.DataFrame(merged)

    group_feats_df_train = _merge_groups(df)
    group_feats_df_test = _merge_groups(df_test)

    if save_csv:
        group_feats_df_train.to_csv(f"{output_path_prefix}_train.csv", sep=',', index=False)
        group_feats_df_test.to_csv(f"{output_path_prefix}_test.csv", sep=',', index=False)
    return group_feats_df_train, group_feats_df_test