In [13]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [20]:
df4 = pd.read_csv('/Users/Akashgupta/Desktop/sample_df4.csv').iloc[:, 1:]

In [21]:
###########################################################################################################
# ******************************************************************************************************* #
# FUNCTION TO CREATE CLUSTERS OF SIMILAR AND CORRELATED VARIABLES - TO ADDRESS VARIALE REDUNDANCY IN DATA #
# ******************************************************************************************************* #
###########################################################################################################


# ******************* #
# INITIALIZE CLUSTERS #
# ******************* #

clusters = []

# ****************************************** #
# FUNCTION THAT PERFORMS VARIABLE CLUSTERING #
# ****************************************** #

def varclus(dsn, p):
    
    # ****************************************************************************** #
    # PERFORM PCA ON DATA - EXTRACT TOP 2 PRINCIPAL COMPONENTS AND SECOND EIGENVALUE #
    # ****************************************************************************** #
    
    pr1 = PCA(n_components=2)
    pr1.fit_transform(dsn.values)
    pcs = pd.DataFrame(pr1.components_, columns=dsn.columns, index=['PC1', 'PC2'])
    eig = pr1.explained_variance_[1]
    
    # ******************************************************************************************** #
    # IF SECOND EIGENVALUE GREATER THAN THRESHOLD - DIVIDE VARIABLES SUCH THAT PC1>PC2 AND PC2>PC1 #
    # ******************************************************************************************** #
    
    if eig > p:
        d = pd.DataFrame(np.abs(pcs.iloc[0, :]) > np.abs(pcs.iloc[1, :]), columns=['val'])
        l1 = d.loc[d['val'] == True, 'val'].index
        l2 = d.loc[d['val'] == False, 'val'].index
        d1 = dsn.loc[:, dsn.columns.isin(list(l1))]
        d2 = dsn.loc[:, dsn.columns.isin(list(l2))]
        
    # IF NOT GREATER THAN THRESHOLD - NO MORE SPLITTING POSSIBLE - RETURN COLUMNS AS A CLUSTER #
    
    else:
        clusters.append(dsn.columns)
        return 0
    
    # ***************************************************************************************** #
    # IF CLUSTER CONTAINS ONLY ONE VARIABLE, THEN ADD AS A SEPARATE CLUSTER TO THE CLUSTER LIST #
    # ***************************************************************************************** #
    
    if (len(l1) == 1 or len(l2) == 1):
        clusters.append(dsn.columns)
        return 0
    
    # ************************************************************************** #
    # PERFORM PCA ON TWO SUB-CLUSTERS FORMED FROM MAIN CLUSTER IN PREVIOUS STEPS #
    # ************************************************************************** #
    
    pr11 = PCA(n_components=2)
    pr11.fit_transform(d1.values)
    pcs1 = pd.DataFrame(pr11.components_, columns=d1.columns, index=['PC1', 'PC2'])
    eig1 = pr11.explained_variance_[1]
    
    pr12 = PCA(n_components=2)
    pr12.fit_transform(d2.values)
    pcs2 = pd.DataFrame(pr12.components_, columns=d2.columns, index=['PC1', 'PC2'])
    eig2 = pr12.explained_variance_[1]
    
    if eig1 > eig1:
        maxdf = d1
        maxeig = eig1
        mindf = d2
        mineig = eig2
    else:
        maxdf = d2
        maxeig = eig2
        mindf = d1
        mineig = eig1
        
    # *************************************************************************** #
    # SUB-CLUSTER WITH HIGHER SECOND EIGENVAUE IS PASSED ON FOR FURTHER DIVISIONS #
    # *************************************************************************** #
    
    r = varclus(maxdf, p)
    if r == 0:
        clusters.append(mindf.columns)
        return 1
    else:
        r = varclus(mindf, p)
        return 1
    
##################################################################################    
# ****************************************************************************** #
# CALL FUNCTION TO GENERATE LIST OF CLUSTERS OF SIMILAR AND CORRELATED VARIABLES #
# ****************************************************************************** #
    
varclus(df4, 1)

1

In [22]:
##################################################################################################
# ********************************************************************************************** #
# FUNCTIONS THAT COMPUTE METRICS FOR DECIDING BEST VARIABLE IN EACH CLUSTER OF SIMILAR VARIABLES #
# ********************************************************************************************** #
##################################################################################################

def redundancy(clusters, data):
    
    # ************************************************************************** #
    # CHECK CORRELATIONS OF EACH VARIABLE IN EACH CLUSTER - WITH ITS OWN CLUSTER #
    # ************************************************************************** #
    
    main_cor_list = []
    for i in range(len(clusters)):
        varlist = list(clusters[i])
        cormat = data[varlist].corr()
        for j in range(len(varlist)):
            avgcor = np.round((sum(cormat.iloc[:, j]) - 1)/len(varlist), 2)
            main_cor_list.append((varlist[j], i, avgcor))
            
    # *********************************************************************************** #
    # CHECK CORRELATIONS OF EACH VARIABLE IN EACH CLUSTER - WITH ITS NEXT CLOSEST CLUSTER #
    # *********************************************************************************** #        
            
    inter_cluster = []
    for i in range(len(clusters)):
        for j in range(len(list(clusters[i]))):
            closest = []
            for k in range(len(clusters)):
                if list(clusters[i]) != list(clusters[k]):
                    newlist = list(clusters[k])
                    newelem = list(clusters[i])[j]
                    newlist.append(newelem)
                    cmat = data[newlist].corr()
                    value = np.round((np.sum(cmat.loc[newelem, :]) - 1)/(len(newlist) - 1), 2)
                    closest.append(value)
                else:
                    pass
            final_closest = max(closest)
            inter_cluster.append((list(clusters[i])[j], final_closest))   
            
    # *************************************************************************************************** #
    # COMPUTE RATIO OF OWN CLUSTER-NEXT CLOSEST CLUSTER CORRELATIONS FOR EACH VARIABLE - LOWER THE BETTER #
    # *************************************************************************************************** #        
            
    a = pd.DataFrame(main_cor_list, columns = ['col', 'cluster_#','own_cluster_R2'])
    b = pd.DataFrame(inter_cluster, columns = ['col', 'next_closest_R2'])
    final = pd.merge(a, b, how="inner", on="col")
    final['ratio'] = (1-final['own_cluster_R2'])/(1-final['next_closest_R2'])
    return final

In [23]:
r = redundancy(clusters, df4)

In [24]:
r

Unnamed: 0,col,cluster_#,own_cluster_R2,next_closest_R2,ratio
0,variable_1,0,-0.01,0.13,1.16092
1,variable_5,0,0.01,0.03,1.020619
2,variable_8,0,0.0,0.03,1.030928
3,variable_35,0,0.01,0.37,1.571429
4,variable_26,1,0.37,0.03,0.649485
5,variable_27,1,0.51,0.01,0.494949
6,variable_28,1,0.52,0.02,0.489796
7,variable_29,1,0.45,0.01,0.555556
8,variable_37,2,0.36,0.02,0.653061
9,variable_38,2,0.5,0.03,0.515464
